In [6]:
# Runtime dependencies for this notebook. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel. langchain is added because the import cell pulls
# RecursiveCharacterTextSplitter from `langchain.text_splitter` and no install cell provided it.
# NOTE(review): the "<1" bound assumes the legacy `langchain.text_splitter` import path is only
# shipped by the 0.x line -- confirm before lifting the pin.
%pip install chromadb sentence-transformers PyPDF2 chroma-migrate pydantic "langchain<1"



In [17]:
# One consistent install instead of three conflicting ones. The previous sequence pinned
# chromadb==0.3.29 and pydantic<2.0 and then undid both with a trailing
# `pip install chromadb --force-reinstall` (the pip log shows chromadb 1.0.8 with
# pydantic 2.11.4 being collected). The code below uses chromadb.PersistentClient, which the
# 0.3.x line does not provide, so a current chromadb is what this notebook needs.
# `--force-reinstall` is dropped so unrelated dependencies (numpy, fastapi, ...) are not
# reinstalled on every run.
%pip install --upgrade chromadb pydantic


Collecting pydantic
  Using cached pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting chromadb
  Using cached chromadb-1.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Using cached fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting numpy>=1.22.5 (from chromadb)
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting typing-extensions>=4.5.0 (from chromadb)
  Downloading typing_extens

In [None]:
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [11]:

# --- Step 1: Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``. Page texts are
    joined with a newline; a page whose ``extract_text()`` result is falsy (``None`` or an
    empty string) contributes an empty string, so it still occupies a line in the output.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)


# --- Step 2: Chunk the text ---
def split_text(text, chunk_size=200, chunk_overlap=20):
    """Split ``text`` into a list of chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 200).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 20).

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = chunker.split_text(text)
    return pieces


# --- Step 3 (part 1): Load the PDF and split it into chunks ---
# Embedding the chunks and storing them in ChromaDB happens in the cell below.
pdf_path = "/content/Sachin_bio.pdf"  # Replace with your actual file
text = extract_text_from_pdf(pdf_path)
chunks = split_text(text)

# Fail with a clear message instead of a bare IndexError on `chunks[0]` when nothing could be
# extracted (e.g. a PDF whose pages carry no text layer).
if not chunks:
    raise ValueError(f"No text could be extracted from {pdf_path!r}")

chunks[0]  # preview the first chunk

'Sachin Ramesh Tendulkar: The Master Blaster of Cricket  \n \nEarly Life:  \nSachin Tendulkar was born on April 24, 1973, in Mumbai, India. He was introduced to cricket at an'

In [12]:

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")
# `embeddings[i]` is used below as the vector for `chunks[i]`.
embeddings = model.encode(chunks)

# Initialize ChromaDB (new version)
# The collection is persisted under the "chroma_db" path, so it can already contain rows from
# a previous run of this notebook.
client = PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="pdf_docs")

# Add documents and embeddings
# Written in batches rather than one call per chunk. `upsert` (instead of `add`) makes the
# cell safe to re-run: ids that already exist in the persisted collection are overwritten
# with the current chunk text and embedding.
# NOTE(review): ids beyond the current chunk count that were stored by an earlier, longer
# document are not removed here.
ADD_BATCH_SIZE = 256  # keeps each request small; tune if needed
for start in range(0, len(chunks), ADD_BATCH_SIZE):
    stop = min(start + ADD_BATCH_SIZE, len(chunks))
    batch_indices = range(start, stop)
    collection.upsert(
        documents=chunks[start:stop],
        ids=[f"doc_{i}" for i in batch_indices],
        # NOTE(review): `i` is the chunk index, not a PDF page number, despite the "page_" label.
        metadatas=[{"source": f"page_{i}"} for i in batch_indices],
        embeddings=[embeddings[i] for i in batch_indices],
    )


In [13]:

# --- Step 4: Query ---
query = "When Sachin was born?"
# Encode a one-element batch and unpack its single row as the query vector.
(query_embedding,) = model.encode([query])

# Ask the collection for the three nearest stored chunks.
results = collection.query(
    n_results=3,
    query_embeddings=[query_embedding],
)

# --- Step 5: Display results ---
print("Top Matches:")
# Only one query vector was sent, so only the first entry of "documents" is read.
top_documents = results["documents"][0]
for doc in top_documents:
    print(f"- {doc}")


Top Matches:
- Sachin Ramesh Tendulkar: The Master Blaster of Cricket  
 
Early Life:  
Sachin Tendulkar was born on April 24, 1973, in Mumbai, India. He was introduced to cricket at an 
early age by his elder brother Ajit Tendulkar, who recognized his extraordinary talent. Under the 
mentorship of coach Ramakant Achrekar, Sachin began honing his cricketing skills at Shivaji Park. As a 
young boy, he played for his school team and gained immense attention by scoring centuries
- regularly in school -level tournaments.  
 
Domestic Debut:  
At the age of 15, Sachin made his debut in first -class cricket for Mumbai in the Ranji Trophy. He 
scored a century in his debut match against Gujarat, becoming the youngest Indian to do so at the 
time. His performance in domestic cricket quickly earned him a place in the national team.  
 
International Debut:  
Sachin Tendulkar made his international debut for India in a Test match against Pakistan in Karachi on
- Post -Retirement:  
Sachin retire