In [38]:
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
import numpy as np
import os, openai, glob
from pdfminer.high_level import extract_text as pdf_extract_text

In [39]:
class InformationRetrievalService:

    def __init__(self):
        # Load environment variables from .env file
        load_dotenv()
        openai.api_key = os.getenv("OPENAI_API_KEY")
        # Get database password from environment variable
        db_password = os.getenv("POSTGRES_PASSWORD")
        # SQLAlchemy engine
        self.engine = create_engine(f'postgresql://postgres:{db_password}@localhost:5432/pdf_db')
        self.Session = sessionmaker(bind=self.engine)

    def pdf_to_embeddings(self, pdf_path: str, chunk_length: int = 1000):
        chunks = []
        # Extract text from PDF
        text = pdf_extract_text(pdf_path)
        chunks.extend([text[i:i+chunk_length].replace('\n', '') for i in range(0, len(text), chunk_length)])
        # Generate embeddings for each chunk of text
        try:
            # Accessing the latest embedding model
            response = openai.embeddings.create(model='text-embedding-3-large', input=chunks)
            # Accessing the embeddings directly from the response object
            embeddings = []
            for embedding_info in response.data:  
                embedding_vector = embedding_info.embedding  # Access the embedding vector
                index = embedding_info.index  # Access the index if needed
                embeddings.append({
                    'vector': embedding_vector,
                    'text': chunks[index]  
                })
            return embeddings
        except Exception as e:
            print(f"An error occurred: {e}")
            return []

    def load_data_to_vector_store(self, embeddings):
        session = self.Session()
        try:
            # Truncate the table before inserting new data
            truncate_query = text("TRUNCATE TABLE pdf_holder RESTART IDENTITY")
            session.execute(truncate_query)

            # Now, insert new data
            for embedding in embeddings:
                vector = np.array(embedding["vector"], dtype=np.float64).tolist()
                insert_query = text("INSERT INTO pdf_holder (text, embedding) VALUES (:text, :embedding)")
                session.execute(insert_query, {"text": embedding["text"], "embedding": vector})
            
            session.commit()
        except Exception as e:
            session.rollback()  # Rollback in case of any error
            print(f"An error occurred: {e}")
        finally:
            session.close()  # Ensuring session is closed after operation

    
    def check_and_print_embeddings(self):
        with self.engine.connect() as connection:
            # Explicitly declare the query as text
            query = text("SELECT id, text, embedding FROM pdf_holder")
            result = connection.execute(query)
            for row in result:
                print(f"ID: {row.id}, Text: {row.text}, Embedding: {row.embedding}")

    def search_in_vector_store(self, user_query: str, k: int = 1):
        # Creates embedding vector from user query/question
        response = openai.embeddings.create(input=user_query, model="text-embedding-3-large")
        if response.data:
            embedded_query = response.data[0].embedding
            query_vector = np.array(embedded_query, dtype=np.float64).tolist()

            # Using cosine distance for nearest neighbor search
            sql_query = text("""
                            SELECT id, text, embedding <=> CAST(:query_vector AS VECTOR) AS distance
                            FROM pdf_holder
                            ORDER BY distance
                            LIMIT :k
                            """)
            
            with self.engine.connect() as conn:
                results = conn.execute(sql_query, {'query_vector': query_vector, 'k': k}).fetchall()
                # Convert results into a list of dictionaries
                search_results = [
                    {"id": row[0], "text": row[1], "distance": row[2]} 
                    for row in results
                ]

                return search_results[0]["text"]
        else:
            print("No data found in your response")
            return [] # Return an empty list if no data is found in the response

In [40]:
# Specify the directory path
DIRECTORY_PATH = '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/pdf_folder' 
# Search for any PDF file in the directory
pdf_file_path = glob.glob(os.path.join(DIRECTORY_PATH, '*.pdf'))[0]

In [41]:
# Embedding the pdf
pdf_embedded = data_service_processor.pdf_to_embeddings(pdf_path = pdf_file_path)

In [42]:
# Load the embedding into the vector store
data_service_processor.load_data_to_vector_store(pdf_embedded)

In [43]:
#data_service_processor.check_and_print_embeddings()
search_results = data_service_processor.search_in_vector_store(user_query = 'main object canada nation digit talent strategy outline pdf')
search_results

'DIGITAL TALENTROAD  TO  2020 AND BEYONDA NATIONAL STRATEGY TO DEVELOP CANADA’S TALENT IN A GLOBAL DIGITAL ECONOMYINFORMATION AND COMMUNICATIONS TECHNOLOGY COUNCIL (ICTC) | www.ictc-ctic.ca\x0cTABLE  OF CONTENTS PREFACEACKNOWLEDGEMENTSEXECUTIVE SUMMARYINTRODUCTIONA SNAPSHOT OF CANADA’S DIGITAL ECONOMYTHEMES, TRENDS, CHALLENGES AND OPPORTUNITIES DIGITAL ADOPTIONEDUCATIONS AND SKILLSTALENT SUPPLYTHE CASE FOR A NATIONAL DIGITAL TALENT STRATEGYBENEFITS TO THE ECONOMY  BENEFITS TO SOCIETY  A NATIONAL DIGITAL TALENT STRATEGY1. NURTURING A STRONG YOUTH TALENT PIPELINE2. LEVERAGING CANADA’S DIVERSE TALENT3. SUPPORTING WORKFORCE UPSKILLING TO ENHANCE DIGITAL ADOPTION4. ATTRACTING AND RETAINING GLOBAL DIGITAL TALENT5. STRENGTHENING DIGITAL LITERACY AND DIGITAL SKILLS FOR CANADIANS6. FOSTERING DIGITAL ENTREPRENEURSHIP7. BUILDING LABOUR MOBILITY PATHWAYS TO FILL HIGH DEMAND OCCUPATIONSCONCLUSION…THE ROUTE TO 2020 AND BEYOND SUMMARY OF RECOMMENDATION'