In [27]:
# 📦 Install dependencies (if needed)
# !pip install pypdf clip-by-openai

# 🐶 PuppyDB and CLIP setup
import os
import torch
import clip
import numpy as np
from pypdf import PdfReader
from puppydb.core import PuppyDB
from glob import glob
import re
from langchain.text_splitter import CharacterTextSplitter

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
from puppydb.core import PuppyDB

In [41]:
def generate_chunks_from_pdfs(pdf_folder="test_pdfs", chunk_size=500, chunk_overlap=50):
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=" ")

    all_chunks = []
    for path in pdf_files:
        try:
            reader = PdfReader(path)
            text = " ".join(page.extract_text() or "" for page in reader.pages)
            text = text.replace("\n", " ").strip()
            chunks = splitter.split_text(text)
            all_chunks.extend([(os.path.basename(path), chunk) for chunk in chunks])
        except Exception as e:
            print(f"❌ Error processing {path}: {e}")

    print(f"✅ Extracted {len(all_chunks)} chunks from {len(pdf_files)} PDFs.")
    return all_chunks

In [47]:
chunks = generate_chunks_from_pdfs("test_pdfs")

✅ Extracted 1907 chunks from 5 PDFs.


In [48]:
import numpy as np
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from PIL import Image

# Embe images or text
clip_embedder = OpenCLIPEmbeddings(model_name="ViT-B-32", checkpoint="laion2b_s34b_b79k")

In [52]:
chunks[0]

('2025_Formula_One_World_Championship.pdf',
 "Oscar Piastri (left) and his team McLaren-Mercedes (right) are the World Drivers' and Constructors' Championship leaders, respectively. The logo for the 75th anniversary ofFormula One 2025 Formula One World Championship 2025 FIA FormulaOneWorldChampionshipPrevious:2024Next:2026Races by country ·Races by venueSupport series:Formula 2 ChampionshipFIA Formula 3ChampionshipF1 AcademyPorsche Supercup The 2025 FIA Formula OneWorld Championship is amotor racing championshipfor Formula One cars and")

In [59]:
embeddings = []
for filename, chunk in chunks:
    if not chunk.strip():
        continue
    try:
        embedding = clip_embedder.embed_documents([chunk])
        print(f"✅ Processed chunk from {filename} with embedding shape {len(embedding[0])}")
        embeddings.append((filename,embedding[0], chunk))
    except Exception as e:
        print(f"❌ Error processing chunk from {filename}: {e}")
print(f"✅ Total embeddings created: {len(embeddings)}")

✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Formula_One_World_Championship.pdf with embedding shape 512
✅ Processed chunk from 2025_Form

In [62]:
from puppydb.core import PuppyDB

db = PuppyDB("pdf_clip_vectors.bin", "pdf_metadata_store")


for i, (filename, embedding, chunk) in enumerate(embeddings):
    vector_id = f"{filename}_chunk_{i+1:04d}"
    metadata = {"source": filename, "text": chunk}
    embedding_vector = np.array(embedding, dtype=np.float32)
    db.insert_vector(vector_id, embedding_vector, metadata)

print("✅ All chunks inserted into PuppyDB.")


✅ All chunks inserted into PuppyDB.


In [63]:
#index the database
db.build_index(method="hnsw")


In [88]:
#testing db
query = "how is Kimi Antonelli performing?"
query_embedding = clip_embedder.embed_query(query)  
query_embedding = np.array(query_embedding, dtype=np.float32)    
results = db.search(query_embedding, k=5, method="hnsw")
retrieved_chunks = []
for i, r in enumerate(results):
    print(f"{i+1}. Vector ID: {r[0]}")
    print(f"   Similarity: {r[1]:.4f}")
    print(f"   Metadata: {r[2]['text']}\n")
    retrieved_chunks.append(r[2]['text'])


1. Vector ID: Andrea_Kimi_Antonelli.pdf_chunk_0472
   Similarity: 0.2747
   Metadata: Retrieved 6 April 2025.200. Smith, Luke (12 April 2025). "Kimi Antonelli is making Formula Onehistory as rookie – all while finishing school" (https://www.nytimes.com/athletic/6273087/2025/04/12/f1-formula-1-mercedes-kimi-antonelli/). The Athletic. The New York Times. ISSN 1553-8095 (https://search.worldcat.org/issn/1553-8095). Archived (https://web.archive.org/web/20250412221643/https://www.nytimes.com/athletic/6273087/2025/04/12/f1-formula-1-mercedes-kimi-antonelli/) from the originalon 12

2. Vector ID: Andrea_Kimi_Antonelli.pdf_chunk_0353
   Similarity: 0.3001
   Metadata: Formula Scout. Archived (https://web.archive.org/web/20240910114120/https://formulascout.com/podcast-kimi-antonelli-on-his-immense-f4-success-frec-and-mercedes/101373) fromthe original on 10 September 2024. Retrieved 29 December 2022.95. "Winter Single Seater Championships Roundup" (https://www.fia.com/news/winter-single-seater-

In [89]:

# Step 5: Build context prompt
context = "\n\n".join(retrieved_chunks)
final_prompt = f"""Use the following context to answer the question:

{context}

Question: {query}
Answer:"""

In [90]:
final_prompt

'Use the following context to answer the question:\n\nRetrieved 6 April 2025.200. Smith, Luke (12 April 2025). "Kimi Antonelli is making Formula Onehistory as rookie – all while finishing school" (https://www.nytimes.com/athletic/6273087/2025/04/12/f1-formula-1-mercedes-kimi-antonelli/). The Athletic. The New York Times. ISSN 1553-8095 (https://search.worldcat.org/issn/1553-8095). Archived (https://web.archive.org/web/20250412221643/https://www.nytimes.com/athletic/6273087/2025/04/12/f1-formula-1-mercedes-kimi-antonelli/) from the originalon 12\n\nFormula Scout. Archived (https://web.archive.org/web/20240910114120/https://formulascout.com/podcast-kimi-antonelli-on-his-immense-f4-success-frec-and-mercedes/101373) fromthe original on 10 September 2024. Retrieved 29 December 2022.95. "Winter Single Seater Championships Roundup" (https://www.fia.com/news/winter-single-seater-championships-roundup). FédérationInternationale de l\'Automobile. 8 March 2023. Archived\n\n2025). "Kimi Antonelli 

In [None]:
import os
os.environ["GOOGLE_API_KEY"] = "ENTER_YOUR_API_KEY_HERE"

In [92]:

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.2)

response = llm.invoke(final_prompt)

print("Response from LLM:")
print(response)

Response from LLM:
content='Based on the provided context, Kimi Antonelli is:\n\n*   Making Formula One history as a rookie (according to the New York Times article).\n*   Finishing school while also competing in Formula One (according to the New York Times article).\n*   Gifted a Mercedes supercar, but unable to drive it legally (according to Motorsport.com).\n*   Passed his driving test weeks before his Formula One debut (according to CNN).' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run--98009612-1d01-4a2e-b412-143f9cbdf20f-0' usage_metadata={'input_tokens': 959, 'output_tokens': 92, 'total_tokens': 1051, 'input_token_details': {'cache_read': 0}}


In [95]:

# 🔹 Run with only the raw query
response_query_only = llm.invoke(query)

# 🔹 Run with the RAG-style prompt
response_rag = llm.invoke(final_prompt)


In [96]:

# Force cast as plain text (in case it's an object or Message wrapper)
response_query_only_text = str(response_query_only).strip()
response_rag_text = str(response_rag).strip()

# Print for inspection
print("🔸 Response (query only):")
print(response_query_only_text)

print("\n" + "="*80 + "\n")

print("🔹 Response (with RAG prompt):")
print(response_rag_text)


🔸 Response (query only):
content='Andrea "Kimi" Antonelli is currently competing in the **FIA Formula 2 Championship** with Prema Racing. His performance has been a subject of much discussion, as he made the jump directly from Formula Regional European Championship to F2, skipping Formula 3 entirely.\n\nHere\'s a general overview of his performance so far in the 2024 F2 season:\n\n*   **Early Season Struggles:** Antonelli has faced a challenging start to his F2 career. He\'s adapting to a more powerful car, more complex tires, and a highly competitive field. He\'s been working to improve his qualifying pace and race consistency.\n*   **Learning Curve:** It\'s important to remember that Antonelli is very young and relatively inexperienced compared to many of his competitors. He\'s on a steep learning curve, and Prema Racing is known for supporting young drivers through their development.\n*   **Potential:** Despite the challenges, Antonelli possesses immense talent and potential. He has