In [2]:
pip install -r requirements.txt

In [9]:
import io
import time
from __future__ import print_function
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.errors import HttpError
from oauth2client.service_account import ServiceAccountCredentials

from gdrive.gdrive_handler import GspreadHandler

from pdf2image import convert_from_path
import os
import pandas as pd
from settings import GEMINI_API_KEY
from llm_handler.GHandler import GHandler
import fitz  # Import PyMuPDF

# Replace with your credentials file path
CREDENTIALS_FILE = 'smart-platform.json'
SHEET_NAME = "Master Database" 
WORKSHEET_NAME = "inventory"

gspread_handler = GspreadHandler(credentials_filepath=CREDENTIALS_FILE)

def gemini_ocr(pdf_file):
    """Performs OCR on the given PDF using Gemini and returns extracted text."""
    ghandler = GHandler(GEMINI_API_KEY, generation_config={"temperature": 0.9, "top_p": 0.95, "top_k": 40, "max_output_tokens": 40000}, block_threshold="BLOCK_NONE")
    prompt = "You are an OCR bot. Extract ALL the text from the image as raw text. Ensure all pricing, phone numbers, and emails are extracted ACCURATELY. OCR text output is sometimes wrong, so correct it where needed."

    doc = fitz.open(pdf_file)
    extracted_text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # Increase resolution for better OCR
        image_path = pdf_file.replace('.pdf', f'_page_{page_num + 1}.jpg')
        pix.save(image_path)  # Save each page as an image
        response = ghandler.prompt_image(image_path=image_path, prompt_1=prompt, prompt_2=None, model_name="gemini-pro-vision")
        print(response)
        extracted_text += response.text
        os.remove(image_path)  # Clean up the image file

    return extracted_text



def update_google_sheet(destination, title, text):
    """Updates the Google Sheet with the extracted text."""
    data = [{"Destination": destination, "Title": title, "Text": text}]
    df = pd.DataFrame(data)
    print(df)
    # replace with the correct sheet name 
    gspread_handler.update_cols(df, SHEET_NAME, WORKSHEET_NAME) #replace with the correct sheet name 


def get_google_drive_service():
  """Initializes the Google Drive API service."""
  scopes = ['https://www.googleapis.com/auth/drive.readonly']
  credentials = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scopes)
  service = build('drive', 'v3', credentials=credentials)
  return service

def download_file(service, file_id, file_name):
    """Downloads the specified file from Google Drive."""
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    request = service.files().get_media(fileId=file_id)
    fh = io.FileIO(file_name, 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}%")

def get_folder_contents(service, folder_id):
    """Retrieves a list of PDF files within the specified folder and subfolders."""
    all_files = []
    page_token = None
    while True:
        try:
            results = service.files().list(
                pageSize=1000,  # Fetch a larger batch for efficiency
                fields="nextPageToken, files(id, name, mimeType)",
                q=f"'{folder_id}' in parents and mimeType='application/pdf'",  # Filter for PDFs
                pageToken=page_token
            ).execute()

            all_files.extend(results.get('files', []))
            page_token = results.get('nextPageToken')
            if not page_token:
                break  # No more pages

        except HttpError as error:
            print(f"An error occurred: {error}")
            return None

    return all_files


def get_download_link(item):
  """Retrieves the download link for a file based on its mimeType."""
  if item['mimeType'].startswith('application/'):  # Check if it's a Google Doc
    return None  # Google Docs don't have direct download links
  else:
    return f"https://drive.google.com/uc?export=download&id={item['id']}"


def run_ETL():
    """Main function to download, OCR, and update Google Sheets."""
    service = get_google_drive_service()
    folder_contents = get_folder_contents(service, FOLDER_ID)

    if folder_contents:
        print(f"Found {len(folder_contents)} PDF files in the specified folder.")
        for item in folder_contents:
            print(f"Processing: {item['name']}")
            file_name = item['name']
            file_id = item['id']
            # file_path = os.path.join("downloaded_pdfs", file_name)
            file_path = file_name

            print(f"Downloading: {file_name}")
            download_file(service, file_id, file_path)
            print(f"Downloaded: {file_name}")

            # Extract destination and title from file name
            parts = file_name.split('/')
            destination = parts[-2] if len(parts) > 1 else ""
            title = os.path.splitext(parts[-1])[0]

            # Perform OCR
            print(f"Performing OCR on: {file_name}")
            text = gemini_ocr(file_path)
            time.sleep(30)
            # Update Google Sheet
            print(f"Updating Google Sheet with: {file_name}")
            update_google_sheet(destination, title, text)
            
            os.remove(file_path) # remove file after using
    else:
        print("No PDF files found in the specified folder.")





In [11]:
# Replace with the ID of the target folder
FOLDER_ID = '1IYmwA194hmGB9v4yvbfG8ybH4p_t2jsz'
# FOLDER_ID = '1uD7SEGQ5Y2o6oXMp-s53kKnKXXsVSD-X'
# https://drive.google.com/drive/folders/1uD7SEGQ5Y2o6oXMp-s53kKnKXXsVSD-X?usp=drive_link -- this one not cos not png?
# https://drive.google.com/drive/folders/11h9zIUN9MU9m8MyEkIJzp-dAjjSNyyUp?usp=drive_link
run_ETL()

# Process flow:
# to_be_processed folder --> datapipeline 

KeyboardInterrupt: 

In [8]:
from __future__ import print_function
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.service_account import ServiceAccountCredentials

# Replace with your credentials file path
CREDENTIALS_FILE = 'smart-platform.json'

# Replace with the ID of the target folder
FOLDER_ID = '1nstotWI9LYvUamNw-NSew-jnVDD7VAaH'


def get_google_drive_service():
  """Initializes the Google Drive API service."""
  scopes = ['https://www.googleapis.com/auth/drive.readonly']
  credentials = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scopes)
  service = build('drive', 'v3', credentials=credentials)
  return service


def get_folder_contents(service, folder_id):
  """Retrieves a list of files and folders within the specified folder."""
  try:
    results = service.files().list(
        pageSize=100,  # Adjust page size as needed
        fields="nextPageToken, files(id, name, mimeType)",
        q=f"'{folder_id}' in parents"
    ).execute()
    items = results.get('files', [])

    # Check for next page of results and recursively call if available
    if 'nextPageToken' in results:
      next_page_items = get_folder_contents(service, folder_id, results['nextPageToken'])
      items.extend(next_page_items)
    return items
  except HttpError as error:
    print(f"An error occurred: {error}")
    return None


def get_download_link(item):
  """Retrieves the download link for a file based on its mimeType."""
  if item['mimeType'].startswith('application/'):  # Check if it's a Google Doc
    return None  # Google Docs don't have direct download links
  else:
    return f"https://drive.google.com/uc?export=download&id={item['id']}"


def main():
  """Main function to get folder contents and download links."""
  service = get_google_drive_service()
  folder_contents = get_folder_contents(service, FOLDER_ID)

  if folder_contents:
    for item in folder_contents:
      download_link = get_download_link(item)
      if download_link:
        print(f"Name: {item['name']}, Download Link: {download_link}")
      else:
        print(f"Name: {item['name']}, Download Link: Not available (Google Doc)")
  else:
    print("No files found in the specified folder.")


if __name__ == '__main__':
  main()

Name: partners - malacca - attractions/menara taming sari.pdf, Download Link: Not available (Google Doc)
Name: partners - malacca - attractions/melaka river cruise.pdf, Download Link: Not available (Google Doc)
