In [None]:
!pip install -q sentence-transformers faiss-cpu transformers torch gradio pdfplumber

In [None]:
# CELL 2
# User decides how many PDFs to upload
# Each PDF must contain exactly 5 stories with titles

from google.colab import files

print("INSTRUCTIONS FOR DATASET UPLOAD")
print("---------------------------------")
print("• Each PDF MUST contain exactly 5 stories")
print("• Each story MUST have a clear title")
print("• Story format should be: Story 1, Story 2, ..., Story 5")
print("---------------------------------\n")

# Ask how many PDFs will be uploaded. Keep asking until the answer is a
# positive whole number, so a typo does not abort the cell with a traceback
# and a zero/negative count does not silently skip the upload loop.
while True:
    raw_count = input("How many PDF files do you want to upload? ")
    try:
        num_pdfs = int(raw_count)
    except ValueError:
        print("Please enter a whole number (for example: 2).")
        continue
    if num_pdfs < 1:
        print("Please enter a number greater than zero.")
        continue
    break

pdf_files = []

# Upload the PDFs one prompt at a time
for i in range(num_pdfs):
    print(f"\n Upload PDF {i+1} of {num_pdfs}")
    uploaded = files.upload()   # opens file chooser
    for filename in uploaded:
        # Reject anything that is not a PDF now, instead of letting the
        # extraction step fail on it later.
        if not filename.lower().endswith(".pdf"):
            print(f" Skipping '{filename}': not a .pdf file")
            continue
        # Avoid extracting (and later indexing) the same file twice.
        if filename not in pdf_files:
            pdf_files.append(filename)

print("\n Upload complete!")
print("Uploaded PDF files:")
for pdf in pdf_files:
    print("-", pdf)


In [None]:
import pdfplumber
import re

def extract_stories_from_pdf(pdf_path):
    """Split the text of one PDF into stories delimited by "Story <number>" markers.

    The text of every page is concatenated, all whitespace runs (including
    line breaks) are collapsed to single spaces, and the result is cut at each
    occurrence of ``Story`` followed by whitespace and digits.

    Args:
        pdf_path: Path of the PDF file to read; also stored as each story's
            ``"source"``.

    Returns:
        A list of dicts, one per marker found, with keys:
        ``"title"``   - the story text up to and including its first full stop,
                        or its first 50 characters when it has no full stop;
        ``"content"`` - the full story text, starting at the marker;
        ``"source"``  - ``pdf_path``.
        The list is empty when no marker is found.
    """
    # Gather the text of every page that yields any.
    page_chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_text()
            if extracted:
                page_chunks.append(extracted + "\n")

    # Flatten line breaks so sentences are not broken across lines before
    # the story markers are searched for.
    flattened = re.sub(r'\s+', ' ', "".join(page_chunks)).strip()

    # Each story runs from its own marker to the next marker (or end of text).
    starts = [marker.start() for marker in re.finditer(r'Story\s+\d+', flattened)]
    ends = starts[1:] + [len(flattened)]

    stories = []
    for begin, finish in zip(starts, ends):
        body = flattened[begin:finish].strip()

        # Title heuristic: everything up to the first full stop; fall back to
        # the first 50 characters when the story contains no full stop.
        period_at = body.find('.')
        raw_title = body[:50] if period_at == -1 else body[:period_at + 1]

        stories.append({
            "title": raw_title.strip(),
            "content": body,
            "source": pdf_path
        })

    return stories

# Load the stories from every uploaded PDF into one flat list.
# The upload instructions promise exactly this many stories per PDF; warn
# when a file does not match so extraction problems are visible immediately.
EXPECTED_STORIES_PER_PDF = 5

all_stories = []
for pdf in pdf_files:
    stories = extract_stories_from_pdf(pdf)
    if len(stories) != EXPECTED_STORIES_PER_PDF:
        print(f" Warning: {pdf} yielded {len(stories)} stories "
              f"(expected {EXPECTED_STORIES_PER_PDF})")
    all_stories.extend(stories)

print(f" Total stories extracted: {len(all_stories)}")
# Print the first title as a quick sanity check of the extraction
if all_stories:
    print("\nPreview Story 1 Title:", all_stories[0]['title'])
else:
    print(" No stories were found. Check that each story starts with 'Story <number>'.")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 1. Collect the text of every extracted story; these are the documents to index
documents = [s['content'] for s in all_stories]
if not documents:
    # Fail with an actionable message instead of an opaque shape/index error below.
    raise ValueError(
        "No stories available to index. Upload PDFs and run the extraction cell first."
    )

# 2. Load the sentence-embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# 3. Embed every document. FAISS expects a C-contiguous float32 matrix, so
#    make that layout explicit rather than relying on the encoder's default.
embeddings = np.ascontiguousarray(
    embedder.encode(documents, convert_to_numpy=True), dtype=np.float32
)

# 4. Build a flat (exact) L2 FAISS index over the embeddings
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

print("Embeddings & Index created successfully!")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model from the same FLAN-T5 checkpoint.
FLAN_T5_CHECKPOINT = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT)

print("FLAN-T5 Model loaded!")

In [None]:
def _build_prompt(story, question):
    """Return the instruction prompt wrapping ``story`` and ``question``."""
    return f"""
    Read the story below and answer the question.

    Story:
    {story}

    Instructions:
    1. Explain what happens in the story related to the question.
    2. Provide the Moral of the story if mentioned or imply it.

    Question: {question}

    Answer:
    """


def _fit_story_to_budget(story, question, max_input_tokens, safety_margin=8):
    """Shorten ``story`` so the whole prompt fits in ``max_input_tokens`` tokens.

    Returns ``story`` unchanged when it already fits. ``safety_margin`` leaves
    slack because tokenising the story on its own may not yield exactly the
    same number of tokens it occupies inside the full prompt.
    """
    # Tokens consumed by everything in the prompt except the story itself.
    overhead = len(tokenizer(_build_prompt("", question))["input_ids"])
    budget = max_input_tokens - overhead - safety_margin
    if budget <= 0:
        return ""
    story_ids = tokenizer(
        story, add_special_tokens=False, truncation=True, max_length=budget
    )["input_ids"]
    if len(story_ids) < budget:
        return story
    return tokenizer.decode(story_ids, skip_special_tokens=True)


def ask_akbar_birbal(question, max_input_tokens=512):
    """Answer ``question`` from the single best-matching indexed story.

    The question is embedded, the nearest story is looked up in the FAISS
    index, and a prompt containing that story and the question is passed to
    the seq2seq model.

    The prompt places the story before the question and is truncated from the
    end, so a long story used to push the question and the "Answer:" cue out
    of the model input. The story alone is now trimmed to a token budget so
    the question always reaches the model.

    Args:
        question: The user's question as text.
        max_input_tokens: Maximum number of tokens fed to the model.

    Returns:
        The decoded model output as a string.

    Raises:
        IndexError: If the index returns no match (label ``-1``), which
            would otherwise silently select the last document.
    """
    # 1. Embed the question
    q_emb = embedder.encode([question], convert_to_numpy=True)

    # 2. Retrieve only the top-1 story so the model is not distracted by others
    _, indices = index.search(q_emb, 1)
    best_story_idx = int(indices[0][0])
    if best_story_idx < 0:
        raise IndexError(
            "No story could be retrieved; the FAISS index appears to be empty."
        )
    retrieved_story = documents[best_story_idx]

    print(f"Reading relevant story... (Story Index: {best_story_idx})")

    # 3. Build the prompt, trimming only the story if the prompt would overflow
    prompt = _build_prompt(
        _fit_story_to_budget(retrieved_story, question, max_input_tokens),
        question,
    )

    # 4. Generate the answer. Unpacking `inputs` also forwards the attention mask.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens
    )
    outputs = model.generate(
        **inputs,
        max_length=150,
        min_length=20,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Read one question from the user and show the generated answer between
# two horizontal rules.
user_question = input(" Ask a question (e.g., 'Ask a question about the stories?'): ")
divider = "-"*30

print("\n" + divider)
final_answer = ask_akbar_birbal(user_question)

print("Birbal's Answer:\n", final_answer, divider, sep="\n")

#### Evaluation

In [None]:
# =====================================================
# FINAL EVALUATION CELL (WORKS WITH YOUR RAG SYSTEM)
# =====================================================

import pandas as pd
from google.colab import files

# Name of the spreadsheet column the questions are read from. The prompt and
# the error message below are derived from it so they cannot drift from the
# column the code actually uses.
QUESTION_COLUMN = "query"

print(f"Upload an Excel file with a '{QUESTION_COLUMN}' column")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file uploaded!")
file_name = next(iter(uploaded))

# Load Excel
df = pd.read_excel(file_name)

if QUESTION_COLUMN not in df.columns:
    raise ValueError(f"Excel must contain a column named '{QUESTION_COLUMN}'")

def evaluate_question(q):
    """Return the RAG answer for ``q``, or an ``"Error: ..."`` string on failure.

    Failures are deliberately turned into text so that one bad row does not
    abort the whole evaluation run.
    """
    try:
        return ask_akbar_birbal(q)
    except Exception as e:
        return f"Error: {e}"

print("Generating answers using your RAG pipeline...")
# Show only the first few questions as a preview rather than the whole column.
print(df[QUESTION_COLUMN].head())
df["generated_answer"] = df[QUESTION_COLUMN].apply(evaluate_question)

# Save result
output_file = "evaluation_results.xlsx"
df.to_excel(output_file, index=False)

print("Evaluation complete!")
print("Downloading:", output_file)
files.download(output_file)


In [None]:
!pip install bert-score


In [None]:
import pandas as pd
import torch
from bert_score import score
from google.colab import files

# ---------------------------
# 1. Upload Excel file
# ---------------------------
# --- Step 1: ask for the workbook holding generated answers and references ---
print("Please upload your Excel file (with generated_answer & ground_truth columns)")
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file uploaded!")

input_file = next(iter(uploaded))
print("Uploaded file:", input_file)

# --- Step 2: load the workbook and make sure both text columns are present ---
df = pd.read_excel(input_file)

required_cols = ["generated_answer", "ground_truth"]
for col in required_cols:
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# --- Step 3: turn both columns into plain lists of strings ---
generated, ground_truth = (
    df[name].astype(str).tolist() for name in required_cols
)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("⚙ Using device:", device)

# --- Step 4: score every generated answer against its reference ---
print("Computing BERTScores...")

precision, recall, f1 = score(
    generated,
    ground_truth,
    model_type="bert-base-uncased",
    device=device
)

# --- Step 5: attach the three scores to the frame as new columns ---
score_columns = (
    ("bertscore_precision", precision),
    ("bertscore_recall", recall),
    ("bertscore_f1", f1),
)
for column_name, values in score_columns:
    # Move to CPU before converting so this also works when scoring ran on GPU.
    df[column_name] = values.cpu().numpy()

# --- Step 6: write the augmented frame to a new workbook and download it ---
output_file = "bert_scored_output.xlsx"
df.to_excel(output_file, index=False)

print("BERTScore calculation completed.")
print("Downloading updated file:", output_file)

files.download(output_file)
