In [1]:
#All Necessary Imports
import os
import gc  # Garbage Collector for memory management
import PyPDF2
import numpy as np
import faiss  
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#Function to Load and Chunk the Document 
def load_and_chunk_text(file_path, chunk_size=500, chunk_overlap=50):
    """
    Loads text from a PDF and splits it into meaningful, overlapping chunks.
    """
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return None
    text = ""
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        return None
    if not text.strip():
        print(f"Warning: No text could be extracted from '{file_path}'.")
        return []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    chunks = text_splitter.split_text(text)
    print(f"Successfully loaded and created {len(chunks)} chunks from '{file_path}'.")
    return chunks

In [3]:
#Function to Create a FAISS Index
def create_faiss_index(chunks, model):
    """
    Creates a FAISS index from the text chunks' embeddings.
    """
    if not chunks:
        print("No chunks available to create an index.")
        return None
    embeddings = model.encode(chunks, convert_to_tensor=True, show_progress_bar=True)
    embeddings = embeddings.cpu().numpy()
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    print(f"✅ FAISS index created successfully with {len(chunks)} vectors.")
    return index

In [7]:
#Function to Handle Queries with On-Demand Model Loading 
def handle_query_with_faiss(query, index, chunks, embedding_model, mode="qa", top_k=3):
    """
    Handles a user query by loading the required model on-demand and
    clearing it from memory immediately after use to prevent crashes.
    """
    if index is None:
        return "Error: The document index is not available."

    query_embedding = embedding_model.encode([query])
    faiss.normalize_L2(query_embedding)
    _, indices = index.search(query_embedding, top_k)
    context = " ".join([chunks[i] for i in indices[0]])
    
    result = ""

    if mode == "qa":
        print("Loading QA model...")
        qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
        answer = qa_pipeline(question=query, context=context)
        result = ' '.join(answer['answer'].split()) # Clean the answer
        del qa_pipeline # Delete the model from memory
        gc.collect() # Ask Python to collect the garbage
        print("QA model cleared from memory.")

    elif mode == "summarize":
        print("Loading Summarizer model...")
        summarizer = pipeline("summarization", model="t5-small")
        summary = summarizer(context, max_length=100, min_length=25, do_sample=False)
        result = summary[0]['summary_text']
        del summarizer # Delete the model from memory
        gc.collect() # Ask Python to collect the garbage
        print("Summarizer model cleared from memory.")

    else: # Search mode
        result = [chunks[i] for i in indices[0]]
        
    return result


In [9]:
# Main Execution Block

# Make sure you have all necessary imports
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import gc
# You will also need your custom 'load_and_chunk_text' and 'create_faiss_index' functions defined elsewhere

# 1. SET YOUR FILE PATH
# The 'r' is important for Windows paths.
FILE_PATH = r"E:\Radhika\radhika-intern\testpdf.pdf"

# 2. LOAD THE EMBEDDING MODEL
print("Loading the embedding model (all-MiniLM-L6-v2)...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded.")

# 3. PROCESS YOUR DOCUMENT
# Assuming your custom functions are available
text_chunks = load_and_chunk_text(FILE_PATH) 
faiss_index = create_faiss_index(text_chunks, embedding_model)

# 4. TEST THE QA MODEL (Original Queries)
if faiss_index:
    print("\n--- Starting Query Session ---")
    
    # QA Mode
    print("\n--- Testing QA Mode ---")
    query1 = "Who found the golden key?"
    print(f"\nQuery: {query1}")
    answer1 = handle_query_with_faiss(query1, faiss_index, text_chunks, embedding_model, mode="qa")
    print(f"Answer: {answer1}")

    query2 = "What is the reward for caring for the garden?"
    print(f"\nQuery: {query2}")
    answer2 = handle_query_with_faiss(query2, faiss_index, text_chunks, embedding_model, mode="qa")
    print(f"Answer: {answer2}")

    # --- Test Summarizer Mode ---
    print("\n--- Testing Summarizer Mode 📝 ---")
    summarize_query = "Summarize Sam's discovery in the forest."
    print(f"Query: {summarize_query}")
    summary_answer = handle_query_with_faiss(summarize_query, faiss_index, text_chunks, embedding_model, mode="summarize")
    print(f"Summary: {summary_answer}")

    # --- Test Search Mode ---
    print("\n--- Testing Search Mode 🔍 ---")
    search_query = "magical garden and its guardian"
    print(f"Query: {search_query}")
    search_results = handle_query_with_faiss(search_query, faiss_index, text_chunks, embedding_model, mode="search")
    
    print("Top Relevant Chunks Found:")
    if search_results:
        for i, chunk in enumerate(search_results, 1):
            print(f"  [{i}] {chunk}.")
    else:
        print("No relevant chunks found.")

else:
    print("Failed to create the FAISS index. Cannot proceed with queries.")

Loading the embedding model (all-MiniLM-L6-v2)...
Embedding model loaded.
Successfully loaded and created 5 chunks from 'E:\Radhika\radhika-intern\testpdf.pdf'.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ FAISS index created successfully with 5 vectors.

--- Starting Query Session ---

--- Testing QA Mode ---

Query: Who found the golden key?
Loading QA model...


Device set to use cpu


QA model cleared from memory.
Answer: Sam

Query: What is the reward for caring for the garden?
Loading QA model...


Device set to use cpu


QA model cleared from memory.
Answer: rewar ded with happiness and the peacefulness of natur e

--- Testing Summarizer Mode 📝 ---
Query: Summarize Sam's discovery in the forest.
Loading Summarizer model...


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Summarizer model cleared from memory.
Summary: the garden grew more beautiful each day, and Sam felt happier than ever . he decided to explor e deeper into the forest, hoping to find a lock that matched his key .

--- Testing Search Mode 🔍 ---
Query: magical garden and its guardian
Top Relevant Chunks Found:
  [1] key,
 
the
 
door
 
creaked
 
open
 
slowly ,
 
revealing
 
a
 
hidden
 
garden
 
full
 
of
 
color ful
 
ﬂowers
 
and
 
sparkling
 
fountains.
 
In
 
the
 
middle
 
of
 
the
 
garden
 
stood
 
a
 
beautiful
 
old
 
bench
 
wher e
 
an
 
elderly
 
woman
 
sat,
 
smiling
 
warmly .
 
She
 
introduced
 
herself
 
as
 
the
 
guar dian
 
of
 
the
 
secr et
 
garden.
 
The
 
woman
 
explained
 
that
 
the
 
garden
 
was
 
magical
 
and
 
only
 
appear ed
 
to
 
kind
 
and
 
curious
 
hear ts.
  [2] appear ed
 
to
 
kind
 
and
 
curious
 
hear ts
 
like
 
Sam ’s.
 
She
 
told
 
him
 
that
 
anyone
 
who
 
cared
 
for
 
the
 
garden
 
would
 
be
 
rewar ded
 
with
 
happiness
 
and
