In [97]:
# connecting to Google Drive API

from __future__ import print_function

import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/drive.metadata.readonly']


def main():
    """List a handful of Google Drive files for the authorized user.

    Cached credentials are loaded from ``token.json`` when that file exists.
    If they are missing or not valid, they are refreshed (when expired and a
    refresh token is present) or obtained through the local-server OAuth flow
    driven by ``credentials.json``; either way the result is written back to
    ``token.json``. The Drive v3 API is then asked for up to 10 files and each
    file's name and id is printed. An ``HttpError`` from the API is reported
    with a printed message instead of being raised.
    """
    token_path = 'token.json'

    # Reuse the cached token when one is on disk; otherwise start with nothing.
    creds = (Credentials.from_authorized_user_file(token_path, SCOPES)
             if os.path.exists(token_path) else None)

    if not (creds and creds.valid):
        can_refresh = creds and creds.expired and creds.refresh_token
        if can_refresh:
            creds.refresh(Request())
        else:
            # No usable cached credentials: run the interactive login flow.
            creds = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES).run_local_server(port=0)
        # Persist the credentials so the next run can skip the login.
        with open(token_path, 'w') as token_file:
            token_file.write(creds.to_json())

    try:
        drive = build('drive', 'v3', credentials=creds)

        # Ask the Drive v3 API for one page of files (id and name only).
        response = drive.files().list(
            pageSize=10, fields="nextPageToken, files(id, name)").execute()
        files = response.get('files', [])

        if files:
            print('Files:')
            for entry in files:
                print(u'{0} ({1})'.format(entry['name'], entry['id']))
        else:
            print('No files found.')
    except HttpError as error:
        # TODO(developer) - Handle errors from drive API.
        print(f'An error occurred: {error}')


if __name__ == '__main__':
    main()

Files:
150B Fall 2023 - Syllabus (1TfRgTBKC4tSOZ0p5HGt5W7xYzXmSJQBFRxAZsR9xe1k)
Assessment Registry (1ZwrE4AV0rsp6dk_2q4b2dmAWnuMzcx3L4CHVKm6jTUo)
Drop-in Hours (BPH Academic Advisor) (1Ml6lpkZ3TrqAj6r8Jza5gAX8qqyYz9HTMvPcpGQppgE)
SMUHSD Scholarship Opportunities 2023-24 (14GnmebMh-TpoIxSkKYq5gpbVrhbIIVrGRIi0BzgLMF0)
Resources for Student - Public Health Interest (1a0zHJH3SvJwCvPgYiwff114FXcx-N7tmCcPuNfk3uDs)
ISAB Committee Descriptions (1Sg1axAdJqOkUAAXKsxO8uCB4ZNxm8FhfTraajbo2vt8)
Miau_GameVideo.mp4 (19coyvk_TOaqfhVSo8Sj9V_4dYN63VFPh)
Month Dinners (1KRvR9J_uH-Vozq5mirfaLgB79M92vPSfH8Mb0Hw4P8M)
Comprehensive Recruitment Preparation FAQ (1ahxtZDqLib7-kKSa5zcsztxx4TaAsszIcWR_xwCFJsI)
Public Resume for Review- Latest (1lS2m7xpqoZHTOxIQb2Y0VYACIs8L4ed4tRF8weO-Uvw)


note to self --
API keys are stored as environment variables in the conda virtual environment (the notebook reads them with `os.environ.get`).

In [1]:
# read in pdfs
import PyPDF2

# takes in a string (example.pdf) and returns a python list with the contents of the pdf 
def pdf_to_list(pdf_name): # i might need to change this when adding google drive integration
    """Extract the text of a PDF file.

    Args:
        pdf_name: Path to the PDF file on disk.

    Returns:
        A one-element list whose only item is the text of every page
        concatenated in page order. The list wrapper is kept because the
        result is later handed to the embedding call as a list of texts.

    Raises:
        OSError: If the file cannot be opened.
    """
    # PyPDF2 3.x removed PdfFileReader (constructing it raises); PdfReader is
    # its replacement. Fall back to the old name only on releases that do not
    # provide PdfReader.
    reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader

    # The context manager closes the file even if parsing a page fails.
    with open(pdf_name, 'rb') as file:
        reader = reader_cls(file)
        # A page without extractable text may give back nothing; treat that
        # as an empty string so the concatenation below cannot fail.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    return [''.join(page_texts)]

import pandas as pd
from tqdm import tqdm # to track ingestion of pdfs via progress bar
import os

def ingest_pdfs(folder_path):
    """Read every PDF directly inside a folder into a DataFrame.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A DataFrame with one row per PDF and the columns ``filename``,
        ``directory`` and ``text``, where ``text`` is whatever
        ``pdf_to_list`` returns for that file. Rows are ordered by filename,
        and the three columns exist even when no PDF is found, so downstream
        column lookups do not fail on an empty folder.
    """
    columns = ["filename", "directory", "text"]

    # Match the extension case-insensitively (".PDF" is common), skip
    # directories that merely end in ".pdf", and sort because os.listdir
    # returns entries in arbitrary order.
    pdf_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    data = []
    for file in tqdm(pdf_files, desc="Processing PDFs"):
        filepath = os.path.join(folder_path, file)
        text = pdf_to_list(filepath)
        data.append({"filename": file, "directory": folder_path, "text": text})

    # Passing the column list keeps the schema stable when `data` is empty.
    return pd.DataFrame(data=data, columns=columns)

# Ingest every PDF found in the local 'sample_files' folder. `df` is the
# DataFrame that the later embedding and indexing steps read from.
folder_path = 'sample_files'
df = ingest_pdfs(folder_path)

import cohere
import os
# Read the Cohere key from the environment so it is never hard-coded in the
# notebook; os.environ.get returns None when COHERE_API_KEY is not set.
cohere_api_key = os.environ.get('COHERE_API_KEY')
# Module-level client that embed() calls.
co = cohere.Client(cohere_api_key)

def embed(text):
    """Embed one or more texts with the Cohere client's default model.

    Args:
        text: A list of strings, or a single string. A single string is
            wrapped in a one-item list so it is sent as one text rather than
            being passed where a list of texts is expected.

    Returns:
        The ``embeddings`` attribute of the Cohere response. The notebook
        relies on this holding one embedding per input text, in input order
        (NOTE(review): confirm against the Cohere client version in use).
    """
    texts = [text] if isinstance(text, str) else list(text)
    response = co.embed(texts=texts)  # no model given, so the client default is used
    return response.embeddings

# Embed the documents row by row. Each `text` cell is itself a list of
# strings, so every embed() call returns a list that is appended in row order.
embeddings = []
for text in df['text']:
    embedding = embed(text)
    embeddings.extend(embedding)

# Column assignment is positional, so this depends on `embeddings` having been
# built in the same order as the rows of `df`.
df['embedding'] = embeddings

Processing PDFs: 100%|██████████| 11/11 [00:00<00:00, 23.00it/s]


In [80]:
# One-time setup: run this only if the "semantic-search" index does not exist yet (the dimension must match the embedding length).
# pinecone.create_index("semantic-search", dimension=4096, metric="euclidean")

In [7]:
import pinecone

# Connect to Pinecone; the key comes from the environment, never from the notebook.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

pinecone_environment = 'gcp-starter'
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

# The "semantic-search" index must already exist, and its dimension has to
# equal the length of the vectors stored in df['embedding'].
index = pinecone.Index("semantic-search")

embeddings = df['embedding'].tolist()
filenames = df['filename'].tolist()

# (id, vector) pairs, using the filename as the vector id. Materialized as a
# list so it can be sliced into batches and is not a one-shot iterator.
# NOTE(review): Pinecone ids presumably need to be ASCII -- confirm before
# ingesting files with non-ASCII names.
to_upsert = list(zip(filenames, embeddings))

# Upsert in small batches so a large folder does not produce one oversized
# request. NOTE(review): 20 is a conservative guess for high-dimensional
# vectors; check Pinecone's current request-size limits.
upsert_batch_size = 20
for start in range(0, len(to_upsert), upsert_batch_size):
    index.upsert(vectors=to_upsert[start:start + upsert_batch_size])

user_input = input("Please enter your search: ")
user_embedding = embed([user_input])
most_similar = 3
# embed() returns a list of embeddings (one per input text); the query takes
# a single flat vector, so pass the first and only one. Only the ids are
# used below, so the stored vectors are not requested back.
similar_matches = index.query(vector=user_embedding[0], top_k=most_similar, include_values=False)
similar_matches_filenames = [match['id'] for match in similar_matches['matches']]
for rank, filename in enumerate(similar_matches_filenames, start=1):
    print(str(rank) + '.' + ' ' + filename)




1. rikio_resume.pdf
2. Green Gulch app.pdf
3. Launchpad application.pdf
