In [23]:
import glob
import os

import numpy as np
import openai
from dotenv import load_dotenv
from pypdf import PdfReader
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ARRAY, Float, text
from sqlalchemy.engine import URL
from sqlalchemy.orm import sessionmaker

In [24]:
class DataService():
    """Embed PDF text with OpenAI and store/search it in PostgreSQL via pgvector.

    The ``documents`` table keeps each chunk's text together with its embedding
    in a ``FLOAT[]`` column. Similarity search and the ivfflat index both work on
    that column cast to pgvector's ``vector`` type, so the same cast expression
    (see ``_vector_column_sql``) is used in both places.
    """

    # Model used for both document chunks and user queries.
    EMBEDDING_MODEL = 'text-embedding-ada-002'
    # text-embedding-ada-002 produces 1536-dimensional embeddings; the cast and
    # the index must use the same dimension as the stored vectors.
    EMBEDDING_DIM = 1536
    # Number of ivfflat lists (pgvector tuning knob; adjust to the row count).
    IVFFLAT_LISTS = 100

    def __init__(self):
        """Read credentials from the environment, connect, and prepare the table/index."""
        # Load environment variables from .env file
        load_dotenv()
        # Get database password from environment variable
        db_password = os.getenv("DB_PASSWORD")
        openai.api_key = os.getenv("OPENAI_API_KEY")
        # Build the URL from components so that special characters in the
        # password are escaped and a missing password is not sent as "None".
        db_url = URL.create(
            "postgresql",
            username="postgres",
            password=db_password,
            host="localhost",
            port=5432,
            database="mydatabase",
        )
        # SQLAlchemy engine
        self.engine = create_engine(db_url)
        # Create table and vector index
        self.create_vector_index()

    @classmethod
    def _vector_column_sql(cls):
        """Return the SQL expression that casts the stored array column to ``vector``.

        The index definition and the search query must use the identical
        expression for the expression index to be applicable.
        """
        return f'CAST("vector" AS vector({int(cls.EMBEDDING_DIM)}))'

    @staticmethod
    def _vector_literal(vector):
        """Format a sequence of numbers as a pgvector text literal, e.g. ``[1.0,0.5]``."""
        values = np.asarray(vector, dtype=np.float32).tolist()
        return "[" + ",".join(repr(float(value)) for value in values) + "]"

    @staticmethod
    def _chunk_text(text_page, chunk_length):
        """Split ``text_page`` into ``chunk_length``-sized pieces with newlines removed.

        Chunks that are empty or whitespace-only after newline removal are
        dropped so they are never sent to the embeddings API.

        Raises:
            ValueError: if ``chunk_length`` is not positive.
        """
        if chunk_length <= 0:
            raise ValueError("chunk_length must be a positive integer")
        pieces = (text_page[start:start + chunk_length].replace('\n', '')
                  for start in range(0, len(text_page), chunk_length))
        return [piece for piece in pieces if piece.strip()]

    def create_vector_index(self):
        """Create the ``documents`` table and its ivfflat index if they do not exist.

        The table object is kept on ``self.documents`` so that other methods
        can build statements against it.
        """
        metadata = MetaData()
        # Define table
        self.documents = Table('documents', metadata,
                               Column('id', Integer, primary_key=True),
                               Column('text', String),
                               Column('vector', ARRAY(Float)))
        metadata.create_all(self.engine)
        # Create pgvector index; engine.begin() commits the DDL on success.
        with self.engine.begin() as conn:
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            # Inner-product operator class to match the <#> operator used in search.
            conn.execute(text(
                "CREATE INDEX IF NOT EXISTS documents_vector_ivfflat_idx "
                f"ON documents USING ivfflat (({self._vector_column_sql()}) vector_ip_ops) "
                f"WITH (lists = {int(self.IVFFLAT_LISTS)})"
            ))

    def load_data_to_pgvector(self, embeddings):
        """Insert embedding records into the ``documents`` table.

        Args:
            embeddings: iterable of dicts with at least ``"text"`` and
                ``"vector"`` keys (the shape returned by ``pdf_to_embeddings``).
                The ``"id"`` key, if present, is not stored.

        Returns:
            None. Does nothing when ``embeddings`` is empty.
        """
        rows = [
            {"text": embedding["text"],
             "vector": np.array(embedding["vector"], dtype=np.float32).tolist()}
            for embedding in embeddings
        ]
        if not rows:
            return
        Session = sessionmaker(bind=self.engine)
        # The context manager closes the session even if the insert fails.
        with Session() as session:
            session.execute(self.documents.insert(), rows)
            session.commit()

    def pdf_to_embeddings(self, pdf_path: str, chunk_length: int = 1000):
        """Extract text from a PDF, chunk it per page, and embed every chunk.

        Args:
            pdf_path: path of the PDF file to read.
            chunk_length: maximum number of characters per chunk (before
                newline removal).

        Returns:
            A list of ``{'id', 'vector', 'text'}`` dicts, one per chunk; an
            empty list when the PDF yields no non-blank text.

        Raises:
            ValueError: if ``chunk_length`` is not positive.
        """
        reader = PdfReader(pdf_path)
        chunks = []
        for page in reader.pages:
            # Treat a page with no extractable text as empty.
            text_page = page.extract_text() or ""
            chunks.extend(self._chunk_text(text_page, chunk_length))

        if not chunks:
            # Nothing to embed; skip the API call.
            return []

        response = openai.Embedding.create(
            model=self.EMBEDDING_MODEL, input=chunks)
        return [{'id': value['index'], 'vector': value['embedding'], 'text': chunks[value['index']]}
                for value in response['data']]

    def search_pgvector(self, user_query: str, k: int = 5):
        """Return the ``k`` stored chunks closest to ``user_query``.

        Ordering uses pgvector's ``<#>`` operator between the stored vector and
        the query embedding.

        Args:
            user_query: free-text question to embed and search for.
            k: maximum number of rows to return.

        Returns:
            A list of result rows with the columns ``text`` and ``vector``.

        Raises:
            ValueError: if ``k`` is negative.
        """
        if k < 0:
            raise ValueError("k must be non-negative")

        # Creates embedding vector from user query
        embedded_query = openai.Embedding.create(
            input=user_query,
            model=self.EMBEDDING_MODEL)["data"][0]['embedding']

        # SQL query for vector search; the query vector and limit are bound
        # parameters rather than being interpolated into the statement.
        sql_query = text(
            "SELECT text, vector FROM documents "
            f"ORDER BY {self._vector_column_sql()} <#> "
            f"CAST(:query_vector AS vector({int(self.EMBEDDING_DIM)})) "
            "LIMIT :k"
        )
        params = {"query_vector": self._vector_literal(embedded_query), "k": int(k)}

        with self.engine.connect() as conn:
            results = conn.execute(sql_query, params)
            return list(results)



In [25]:
# Specify the directory path
DIRECTORY_PATH = '/Users/sternsemasuka/Desktop/ML/Project/Talk-to-your-PDF/'
# Search for any PDF file in the directory. glob returns matches in arbitrary
# order, so sort them to make the chosen file the same on every run.
pdf_candidates = sorted(glob.glob(os.path.join(DIRECTORY_PATH, '*.pdf')))
if not pdf_candidates:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError(f"No PDF files found in {DIRECTORY_PATH!r}")
pdf_file_path = pdf_candidates[0]

In [26]:
# Usage example: embed the PDF, store the chunks, then answer one question.
data_service = DataService()
embeddings = data_service.pdf_to_embeddings(pdf_file_path)
data_service.load_data_to_pgvector(embeddings)
# Strip the input so that a blank answer is not sent to the embeddings API.
input_question = input("Enter your question here: ").strip()
search_results = data_service.search_pgvector(input_question) if input_question else []
if not input_question:
    print("No question entered; skipping search.")
for result in search_results:
    # Each row is (text, vector); show only the text so the output is not
    # flooded with the full embedding vector of every hit.
    print(result[0])


OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "postgres"

(Background on this error at: https://sqlalche.me/e/20/e3q8)