CELL 1: Install Libraries

In [49]:
!pip install -q sentence-transformers faiss-cpu transformers torch gradio pdfplumber


CELL 2: Upload PDFs

In [50]:
from google.colab import files


uploaded_files = files.upload()
pdf_files = list(uploaded_files.keys())


Saving st1.pdf to st1 (2).pdf


In [51]:
# CELL 3
# Function to extract stories from a single PDF file

import pdfplumber

def extract_stories_from_pdf(pdf_path):
    """
    Reads a PDF and extracts stories.
    Assumes:
    - First line is the title
    - Remaining lines are story text
    """
    stories = []

    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"

    # Split stories using blank lines
    story_blocks = full_text.split("\n\n")

    for block in story_blocks:
        lines = block.strip().split("\n")
        if len(lines) > 3:
            title = lines[0]
            story = " ".join(lines[1:])
            stories.append({
                "title": title,
                "content": story
            })

    return stories


In [None]:
'''# CELL 3
# Extract exactly 5 stories from one PDF (Title + Story)

import pdfplumber

def extract_5_stories_from_pdf(pdf_path):
    """
    Extracts stories from a PDF.
    Assumes:
    - Each story starts with a TITLE line
    - After title, story text continues until next title
    - Total 5 stories per PDF
    """

    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"

    lines = [line.strip() for line in text.split("\n") if line.strip()]

    stories = []
    current_title = None
    current_story = []

    for line in lines:
        # Heuristic: Title is a short line (not too long)
        if len(line.split()) <= 6 and line.isalpha():
            if current_title and current_story:
                stories.append({
                    "title": current_title,
                    "content": " ".join(current_story)
                })
            current_title = line
            current_story = []
        else:
            current_story.append(line)

    # Add last story'''


In [52]:
# CELL 4
# Function to load stories from all PDFs

def load_all_stories(pdf_files):
    """
    Extracts stories from all uploaded PDFs.
    """
    all_stories = []

    for pdf in pdf_files:
        stories = extract_stories_from_pdf(pdf)
        for story in stories:
            story["source_pdf"] = pdf
            all_stories.append(story)

    return all_stories


In [53]:
# CELL 5
# Prepare story text for embedding

def prepare_documents(stories):
    documents = []

    for story in stories:
        text = "Title: " + story["title"] + "\nStory: " + story["content"]
        documents.append(text)

    return documents


In [54]:
# CELL 6
# Load sentence transformer for embeddings

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded!")


Embedding model loaded!


In [55]:
# CELL 7
# Convert documents into embeddings

def create_embeddings(documents, embedder):
    return embedder.encode(documents, convert_to_numpy=True)



In [56]:
# CELL 8
# Create FAISS index for fast similarity search

import faiss

def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index


In [57]:
# CELL 9
# Load FLAN-T5 model for answering questions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

print("FLAN-T5 model loaded!")


FLAN-T5 model loaded!


In [58]:
# CELL 10
# Retrieve relevant stories for a question

def retrieve_context(question, embedder, index, documents, top_k=3):
    question_embedding = embedder.encode([question], convert_to_numpy=True)
    _, indices = index.search(question_embedding, top_k)

    context = ""
    for idx in indices[0]:
        context += documents[idx] + "\n\n"

    return context


In [59]:
def generate_answer(question, context, tokenizer, model):
    prompt = f"""
You are answering questions based on stories.

If the answer is NOT clearly mentioned in the context,
say: "This information is not clearly mentioned in the stories."

Context:
{context}

Question:
{question}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_length=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [39]:
'''def generate_answer(question, context, tokenizer, model):
    """
    Generates answer with:
    - Contextual explanation
    - Moral / lesson from the story
    """

    prompt = f"""
You are a story assistant.

Using the context below:
1. First, explain the answer to the question in simple words.
2. Then, explain the moral or lesson from the story.

If the information is not clearly present, say so honestly.

Context:
{context}

Question:
{question}

Answer (include moral):
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_length=250)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)'''


In [60]:
# CELL 12
# Complete RAG pipeline

def rag_pipeline(question):
    context = retrieve_context(question, embedder, faiss_index, documents)
    answer = generate_answer(question, context, tokenizer, model)
    return answer


In [61]:
def rag_pipeline(question):
    q = question.lower().strip()

    # ðŸ”’ Guard for definition-type questions
    if q.startswith("who is"):
        question = (
            "What role does Akbar play in the stories "
            "and what moral do these stories teach?"
        )

    context = retrieve_context(
        question, embedder, faiss_index, documents
    )

    return generate_answer(
        question, context, tokenizer, model
    )


In [62]:
# CELL 13
stories = load_all_stories(pdf_files)
documents = prepare_documents(stories)

embeddings = create_embeddings(documents, embedder)
faiss_index = build_faiss_index(embeddings)

print("RAG system ready!")


RAG system ready!


In [63]:
# CELL 14
q = input("Ask a question about the stories: ")
print("\nAnswer:\n", rag_pipeline(q))


Ask a question about the stories: What kind of ruler is Akbar shown to be in the stories

Answer:
 Birbal's answers to test his wisdom, realized the importance of focusing on the present and the people around him.
