In [2]:
# --- Step 1: Imports ---
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load text from PDF
def load_text_from_pdf(file_path):
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            if page.extract_text():
                text += page.extract_text() + "\n"
    return text

# Split into smaller chunks
def chunk_text(text, chunk_size=300):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Build embeddings
def build_index(chunks, model):
    embeddings = model.encode(chunks)
    return np.array(embeddings), chunks

# Search query
def search(query, embeddings, chunks, model, top_k=3):
    query_vec = model.encode([query])
    sims = cosine_similarity(query_vec, embeddings)[0]
    top_indices = sims.argsort()[-top_k:][::-1]
    results = [(chunks[i], sims[i]) for i in top_indices]
    return results

In [5]:
# Load sentence embedding model (downloaded once, cached locally afterwards)
model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Model loaded successfully!")

✅ Model loaded successfully!


In [7]:
# Put a sample PDF in your app/ folder, e.g., app/sample.pdf
file_path = r"E:\Radhika\radhika-intern\testpdf.pdf"

text = load_text_from_pdf(file_path)
chunks = chunk_text(text)

embeddings, chunks = build_index(chunks, model)

print(f"✅ Indexed {len(chunks)} chunks from document.")

✅ Indexed 1 chunks from document.


In [9]:
from transformers import pipeline

summarizer = pipeline("summarization", model="t5-small")

query = "What is the main topic of this document?"

results = search(query, embeddings, chunks, model)

print("🔎 Query:", query)
for idx, (chunk, score) in enumerate(results, 1):
    print(f"\nResult {idx} (Score {score:.4f}):\n{chunk[:300]}...")

# Extract only the text part from the first tuple and truncate
top_chunk = results[0][0]  # extract chunk text properly
context_text = top_chunk[:500]

answer = summarizer(context_text, max_length=40, min_length=10, do_sample=False)[0]['summary_text']

print("\n📝 Summary Answer:")
print(answer)


Device set to use cpu


🔎 Query: What is the main topic of this document?

Result 1 (Score 0.0830):
Once upon a time in a small village, there was a boy named Sam who loved to explor e. Every day after school, Sam would go to the nearb y forest to disco ver new things. One afternoon, while walking along a narrow path, he found a shiny , golden key half-buried under some leaves. Curious, he picked ...


Both `max_new_tokens` (=256) and `max_length`(=40) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



📝 Summary Answer:
a boy named Sam explored e every day after school, he would go to the nearb y forest to explore new things . he found a shiny golden key half-buried under some leaves . after walking for a while he came across an old wooden door attached .


In [10]:
def handle_query(query, embeddings, chunks, model, mode="search"):
    results = search(query, embeddings, chunks, model)
    top_chunk = results[0][0]  # best text match
    
    if mode == "search":
        # Just return top matching text chunks
        return [chunk for chunk, score in results[:3]]
    
    elif mode == "summarize":
        summarizer = pipeline("summarization", model="t5-small")
        context_text = top_chunk[:500]
        return summarizer(context_text, max_length=50, min_length=15, do_sample=False)[0]['summary_text']
    
    elif mode == "qa":
        qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
        answer = qa_pipeline(question=query, context=top_chunk)
        return answer['answer']
    
    elif mode == "keyword":
        keyword = query.lower()
        matches = [chunk for chunk in chunks if keyword in chunk.lower()]
        return matches if matches else "No matches found."
    
    else:
        return "Unknown query mode."


In [11]:
# Example queries
print("Search mode:")
print(handle_query("forest", embeddings, chunks, model, mode="search"))

print("\nSummarize mode:")
print(handle_query("What is the main topic?", embeddings, chunks, model, mode="summarize"))

print("\nQA mode:")
print(handle_query("Who found the golden key?", embeddings, chunks, model, mode="qa"))

print("\nKeyword mode:")
print(handle_query("Sam", embeddings, chunks, model, mode="keyword"))



Search mode:
['Once upon a time in a small village, there was a boy named Sam who loved to explor e. Every day after school, Sam would go to the nearb y forest to disco ver new things. One afternoon, while walking along a narrow path, he found a shiny , golden key half-buried under some leaves. Curious, he picked it up and wonder ed what it might open. Sam decided to explor e deeper into the forest, hoping to ﬁnd a lock that matched his key. After walking for a while, he came across an old wooden door attached to a huge oak tree. The door look ed ancient and covered in vines, but to Sam ’s amaz ement, the golden key ﬁt perfectly into the lock. When he turned the key, the door creaked open slowly , revealing a hidden garden full of color ful ﬂowers and sparkling fountains. In the middle of the garden stood a beautiful old bench wher e an elderly woman sat, smiling warmly . She introduced herself as the guar dian of the secr et garden. The woman explained that the garden was magical and 

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


a boy named Sam explored e every day after school, he would go to the nearb y forest to explore new things . he found a shiny golden key half-buried under some leaves . after walking for a while he came across an old wooden door attached .

QA mode:


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Sam ’s amaz ement

Keyword mode:
['Once upon a time in a small village, there was a boy named Sam who loved to explor e. Every day after school, Sam would go to the nearb y forest to disco ver new things. One afternoon, while walking along a narrow path, he found a shiny , golden key half-buried under some leaves. Curious, he picked it up and wonder ed what it might open. Sam decided to explor e deeper into the forest, hoping to ﬁnd a lock that matched his key. After walking for a while, he came across an old wooden door attached to a huge oak tree. The door look ed ancient and covered in vines, but to Sam ’s amaz ement, the golden key ﬁt perfectly into the lock. When he turned the key, the door creaked open slowly , revealing a hidden garden full of color ful ﬂowers and sparkling fountains. In the middle of the garden stood a beautiful old bench wher e an elderly woman sat, smiling warmly . She introduced herself as the guar dian of the secr et garden. The woman explained that the gar