In [None]:
from qdrant_client import QdrantClient
import openai
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Qdrant client pointed at an instance served on localhost:6333.
client = QdrantClient(url="http://localhost:6333")

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
openai_client = openai.Client(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Length of each embedding vector. Used as the Qdrant collection's vector size
# and passed as `dimensions` to every OpenAI embeddings request, so the two
# always agree.
dimensions = 1536
# OpenAI embedding model used both when indexing chunks and when embedding queries.
embedding_model = "text-embedding-3-small"
# Name of the Qdrant collection that stores the embedded PDF chunks.
collection_name = "gates_notes"

In [None]:
from qdrant_client.models import Distance, VectorParams

# Create the collection only when it is missing so this cell can be re-run
# against a Qdrant instance that already holds it (creating an existing
# collection fails). Vectors are compared with cosine distance.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=dimensions, distance=Distance.COSINE),
    )

In [None]:
import os
import PyPDF2
from qdrant_client.models import PointStruct

def read_pdf_chunks(file_path, chunk_size=1000):
    """Read a PDF file and split the text of each page into fixed-size chunks.

    Chunking is done page by page, so a chunk never spans two pages and the
    last chunk of a page may be shorter than ``chunk_size``. Pages that yield
    no text contribute no chunks.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty strings in page order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail loudly: a negative step would otherwise silently produce no chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Defensive: treat a page with no extractable text as empty.
            text = page.extract_text() or ""
            chunks.extend(
                text[start:start + chunk_size]
                for start in range(0, len(text), chunk_size)
            )
    return chunks

# List all PDF files in the directory. Sorting makes the processing order, and
# therefore the point ids assigned below, deterministic (os.listdir order is
# arbitrary).
pdf_directory = './gates_notes'
pdf_files = sorted(file for file in os.listdir(pdf_directory) if file.endswith('.pdf'))

chunk_size = 1000  # Number of characters per chunk
next_point_id = 0  # Running id so points from different PDFs never overwrite each other

# Process each PDF file
for pdf_file in pdf_files:
    # The first four characters of the file name are stored as the year
    # (assumes names like "2020....pdf" — verify against the actual files).
    year = pdf_file[:4]

    pdf_file_path = os.path.join(pdf_directory, pdf_file)
    pdf_chunks = read_pdf_chunks(pdf_file_path, chunk_size)

    if not pdf_chunks:
        # Nothing to embed (e.g. a PDF without a text layer); skip it rather
        # than sending an empty embeddings request.
        print(f"Skipping {pdf_file}: no extractable text")
        continue

    result = openai_client.embeddings.create(input=pdf_chunks, model=embedding_model, dimensions=dimensions)

    # Every point of a document carries the document's full text as payload;
    # build it once instead of once per point.
    full_text = ''.join(pdf_chunks)

    # Format data using PointStruct: one point per chunk embedding.
    points = [
        PointStruct(
            id=next_point_id + idx,
            vector=data.embedding,
            payload={
                "year": year,
                "title": pdf_file,
                "text": full_text,
            },
        )
        for idx, data in enumerate(result.data)
    ]
    next_point_id += len(points)

    # Summarise instead of printing the points: each one holds a full
    # embedding vector plus the whole document text.
    print(f"{pdf_file}: upserting {len(points)} points")

    client.upsert(collection_name=collection_name, points=points)


In [None]:
def query_qdrant(query, top_k=1):
    """Embed ``query`` and return its ``top_k`` nearest neighbours from Qdrant.

    The query is embedded with the same model and dimensions that the
    indexing cell uses, then searched against ``collection_name``.
    """
    embedding_response = openai_client.embeddings.create(
        input=query, model=embedding_model, dimensions=dimensions
    )
    # A single input produces a single embedding entry.
    query_vector = embedding_response.data[0].embedding

    return client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

In [None]:
# Retrieve the best-matching point(s) for the question and show each hit.
query_results = query_qdrant('Highest impact year')
for article in query_results:
    print(article)

In [None]:
# Gather the stored text of every hit and merge it into one
# space-separated context string.
texts = [hit.payload['text'] for hit in query_results]
input_text = ' '.join(texts)

In [None]:
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Question put to the chat model; "that year" presumably refers to the document
# returned by the retrieval step — confirm the wording fits the retrieved context.
custom_prompt = "What did the Gates Foundation accomplish that year?"

# Prompt layout: the question followed by the retrieved text as context.
template = PromptTemplate(template="{query} Context: {context}", input_variables=["query", "context"])
prompt_with_context = template.invoke({"query": custom_prompt, "context": input_text})

# No model name is passed, so ChatOpenAI falls back to the library's default model.
llm = ChatOpenAI(temperature=0.7)
results = llm.invoke(prompt_with_context)
# Show only the text of the model's reply.
print(results.content)