<a href="https://colab.research.google.com/github/rawhide81/Git-Hub-Pages/blob/main/Google_Drive_PDF_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.oauth2 import credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.auth.transport.requests import Request
from google.auth.exceptions import RefreshError
import io
import PyPDF2  # For PDF parsing
import re  # For regular expressions (keyword/phrase search)
import spacy #For entity recognition

# Google Drive API configuration used by get_credentials().
# SCOPES is passed when loading the stored token; drive.readonly requests read-only access.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']  # Read-only access
# Authorized-user token file: loaded if present, and deleted if refreshing it fails.
CREDENTIALS_FILE = 'token.json'  # Path to your credentials file

def get_credentials():
    """Load the stored user credentials, refreshing them when they have expired.

    Reads CREDENTIALS_FILE (if it exists) with the scopes in SCOPES.

    Returns:
        The credentials object when it is valid, or when it was expired and
        could be refreshed; otherwise None. None is returned when the file
        is missing, when the credentials are invalid and cannot be refreshed,
        or when the refresh attempt raises RefreshError (in which case
        CREDENTIALS_FILE is also deleted).
    """
    stored = None
    if os.path.exists(CREDENTIALS_FILE):
        stored = credentials.Credentials.from_authorized_user_file(CREDENTIALS_FILE, SCOPES)

    # Happy path: credentials were loaded and are still usable.
    if stored and stored.valid:
        return stored

    # Only expired credentials that carry a refresh token can be renewed.
    can_refresh = stored and stored.expired and stored.refresh_token
    if not can_refresh:
        return None

    try:
        stored.refresh(Request())
    except RefreshError:
        # The stored token is unusable; drop it so a new one can be generated.
        os.remove(CREDENTIALS_FILE)
        return None
    return stored

def get_drive_service():
    """Build a Drive v3 API client from the stored credentials.

    Returns:
        The client returned by build('drive', 'v3', ...), or None (after
        printing a hint) when get_credentials() yields no usable credentials.
    """
    user_creds = get_credentials()
    if user_creds:
        return build('drive', 'v3', credentials=user_creds)

    print("Please authenticate. Run the google drive api quickstart example to generate a token.json file")
    return None

def search_files(service, query):
    """Return every Drive file matching *query*, following result pagination.

    The Drive API returns results one page at a time; this keeps requesting
    pages until the response no longer carries a ``nextPageToken`` so that
    matches beyond the first page are not silently dropped.

    Args:
        service: Drive API client exposing ``files().list(...).execute()``.
        query: Drive search query string passed as the ``q`` parameter.

    Returns:
        A list of dicts with the ``id`` and ``name`` of each matching file
        (empty when nothing matches).
    """
    items = []
    page_token = None  # None requests the first page
    while True:
        response = service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        items.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            return items

def download_file(service, file_id, filename):
    """Download the Drive file *file_id* and save it to the local path *filename*.

    Chunks are written straight to the destination file instead of being
    accumulated in memory first, so large files do not have to fit in RAM.

    Args:
        service: Drive API client exposing ``files().get_media(...)``.
        file_id: ID of the Drive file to fetch.
        filename: Local path the content is written to (opened in 'wb' mode).

    Raises:
        Any exception raised while opening the file or downloading a chunk
        is propagated; a partially written file is removed first.
    """
    request = service.files().get_media(fileId=file_id)
    try:
        with open(filename, 'wb') as target:
            downloader = MediaIoBaseDownload(target, request)
            done = False
            while not done:
                # next_chunk() returns (progress, done); progress is not needed here.
                _, done = downloader.next_chunk()
    except Exception:
        # Do not leave a truncated file behind when the download fails midway.
        if os.path.exists(filename):
            os.remove(filename)
        raise

# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per process rather than once per parsed PDF.
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def parse_pdf(file_path, keywords=None, phrases=None, entity_types=None):
    """Extract the text of a PDF and report which search terms/entities it contains.

    Args:
        file_path: Path of the PDF file to read.
        keywords: Optional iterable of words; matched case-insensitively as
            whole words (``\\b`` boundaries).
        phrases: Optional iterable of phrases; matched case-insensitively as
            literal substrings.
        entity_types: Optional collection of spaCy entity labels; entities
            whose label is in it are reported.

    Returns:
        A dict that contains, for each non-empty argument, the key
        ``'keywords'`` / ``'phrases'`` (lists of the terms that were found)
        and/or ``'entities'`` (list of ``(text, label)`` tuples). Returns
        None, after printing a message, when the file is missing, cannot be
        read as a PDF, or any other error occurs.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join pages with a newline so the last word of one page and the
            # first word of the next are not fused into a single token.
            # Pages without extractable text contribute an empty string.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)

        results = {}

        if keywords:
            results['keywords'] = [
                keyword for keyword in keywords
                if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE)
            ]

        if phrases:
            results['phrases'] = [
                phrase for phrase in phrases
                if re.search(re.escape(phrase), text, re.IGNORECASE)
            ]

        if entity_types:
            doc = _get_spacy_model()(text)
            results['entities'] = [
                (ent.text, ent.label_) for ent in doc.ents if ent.label_ in entity_types
            ]

        return results

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except PyPDF2.errors.PdfReadError:
        print(f"Error: Could not read PDF at {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occured when parsing {file_path}: {e}")
        return None

def main():
    """Scan every PDF visible to the Drive client and print the matches found in each.

    For each PDF returned by the Drive search, the file is downloaded into a
    private temporary directory, parsed with parse_pdf(), and the non-empty
    result categories are printed. Returns early (printing a message where
    applicable) when no Drive service is available or no PDFs are found.
    """
    import tempfile  # local import: only needed by this entry point

    service = get_drive_service()
    if not service:
        return

    query = "mimeType='application/pdf'"
    pdf_files = search_files(service, query)

    if not pdf_files:
        print("No PDF files found in My Drive.")
        return

    keywords = ["example", "report", "data"]  # Example keywords
    phrases = ["financial analysis", "project timeline"]  # Example phrases
    entity_types = ["ORG", "PERSON", "GPE"]  # Example entity types.

    for file in pdf_files:
        file_name = file['name']
        print(f"Processing: {file_name}")

        # The Drive name is never used as a local path: it may contain path
        # separators or match an existing local file (which would then be
        # overwritten and deleted). A per-file temporary directory also
        # guarantees the local copy is removed even if a step raises.
        with tempfile.TemporaryDirectory() as workdir:
            local_path = os.path.join(workdir, "document.pdf")
            download_file(service, file['id'], local_path)
            results = parse_pdf(local_path, keywords, phrases, entity_types)

        if results:
            print(f"Results for {file_name}:")
            for key, value in results.items():
                if value:  # Only print if results were found
                    print(f"  {key}: {value}")
        print("-" * 20)

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'PyPDF2'