In [None]:
%pip install -q openai chromadb tiktoken


## Imports and OpenAI API Key setup

In [16]:
import os
import re

import chromadb
import openai
from chromadb.config import Settings
from tiktoken import get_encoding

from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with an actionable message instead of a later client error:
# the OpenAI calls in this notebook depend on this environment variable.
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it before running this notebook."
    )


## Load Raw Content

In [17]:
# Sample corpus for the demo: a short biography held in a single string.
# Later cells split it into chunks, embed them, and index them for retrieval.
# Replace this with content from a file if needed.
content = """
Virat Kohli is one of the most iconic cricketers in the world today. He was born on November 5, 1988, in Delhi, India.
Kohli rose through the ranks of domestic cricket and gained attention after leading India to victory in the 2008 U-19 Cricket World Cup.
He made his debut for the Indian national team later that year. Initially seen as an aggressive youngster, Kohli matured into a consistent and dependable batsman. 
His aggressive batting style and incredible fitness became his trademarks. 
Kohli quickly became known for his ability to chase down totals under pressure. 
He has often been referred to as the "Chase Master" in One Day Internationals. He was named vice-captain of India in 2012 and eventually became captain in all formats. Kohli succeeded MS Dhoni as Test captain in 2014. Under his leadership, India became the number one Test team in the world. He led India to a historic Test series win in Australia in 2018-19. Kohli holds the record for the fastest 8,000, 9,000, 10,000, and 11,000 runs in ODI cricket. He has scored over 70 international centuries. Kohli has won multiple ICC awards including the ICC Cricketer of the Year. He is also one of the most followed athletes globally on social media. Apart from cricket, he is passionate about fitness and healthy living. Kohli follows a strict diet and workout regimen. He is credited with revolutionizing fitness standards in Indian cricket. Kohli is married to Bollywood actress Anushka Sharma. The couple welcomed their daughter Vamika in January 2021. Kohli is a vocal advocate for animal rights and mental health awareness. He co-owns FC Goa, a football team in the Indian Super League. Kohli has been the face of several major brands and endorsements. He has played for Royal Challengers Bangalore (RCB) since the inception of the Indian Premier League. He holds the record for the most runs in a single IPL season. Despite not winning an IPL trophy, he remains a fan favorite. Kohli stepped down as India's T20 captain in 2021 and as ODI and Test captain soon after. He continues to play an important role in the team as a senior player and mentor. He is known for his intensity on the field and humility off it. Kohli remains one of the most inspiring figures in modern cricket. His discipline, consistency, and hunger for success continue to influence the next generation of cricketers.
"""


## Chunking the text

In [None]:
# Sentence-level chunking: split on the whitespace that follows '.', '!' or '?'
# so every chunk keeps its closing punctuation and numbers such as "3.5" are
# not cut in half. Still a heuristic: abbreviations like "Dr. Smith" are split.
sentence_boundary = re.compile(r"(?<=[.!?])\s+")
chunks = [
    sentence.strip()
    for sentence in sentence_boundary.split(content.strip())
    if sentence.strip()
]
print(len(chunks))

for chunk in chunks:
    print(chunk)
    print("-"*100)


## Generate OpenAI embeddings

In [45]:
from openai import OpenAI
# Shared OpenAI client; constructed with library defaults.
client = OpenAI()


def get_embeddings(text):
    """Return the embedding vector for ``text`` from ``text-embedding-3-small``.

    Args:
        text: The input forwarded unchanged as ``input`` to the embeddings
            endpoint.

    Returns:
        The ``embedding`` of the first item in the response. If ``text`` is a
        list, only the vector for its first element is returned.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding



## Store Chunks in Chroma DB

In [None]:
# Initialize Chroma DB on disk. PersistentClient is used because, in current
# chromadb releases, Client(Settings(persist_directory=...)) on its own still
# creates an in-memory store and nothing is written to ./chroma_store.
chroma_client = chromadb.PersistentClient(path="./chroma_store")

collection = chroma_client.get_or_create_collection(name="kohli_v2")

# Deterministic ids plus upsert keep this cell idempotent: re-running it
# overwrites the same records instead of colliding with or duplicating them.
chunk_ids = [f"chunk-{index}" for index in range(len(chunks))]
chunk_embeddings = [get_embeddings(chunk_text) for chunk_text in chunks]

if chunks:  # skip the write entirely when there is nothing to index
    collection.upsert(
        ids=chunk_ids,
        documents=chunks,
        embeddings=chunk_embeddings,
    )


## Query the Vector DB

In [None]:
# Ask a question: point `query` at whichever example you want to run.

query1 = "Who is the life partner of Kohli?"
query2 = "Which historic Test series did India win under Kohli's captaincy?"
query3 = "What are some of Kohli's interests outside cricket?"

query = query1
# Pass the plain string (the same way chunks are embedded when indexing);
# get_embeddings returns a single vector, so no list wrapper is needed.
query_embedding = get_embeddings(query)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# One query embedding was sent, so the hits for it are at index 0.
top_chunks = results['documents'][0]

for chunk in top_chunks:
    print(chunk)
    print("-"*100)


## Pass the context and query to the OpenAI API

In [None]:
# Join the retrieved chunks into a single context string for the prompt.
context = "\n".join(top_chunks)
system_prompt = "You are a helpful AI assistant. Answer based on the given context only. Do not answer if the answer is not in context"

# Reuses the `client` created in the embeddings cell; a second client is not needed.
response = client.responses.create(
    model="gpt-4o",
    instructions=system_prompt,
    # Context and question are clearly separated and labelled for the model.
    input=f"Context:\n{context}\n\nQuestion: {query}",
    temperature=0.1
)

print("🧠 Answer from OpenAI:")
print(response.output_text)
