In [2]:
!pip install -q sentence-transformers faiss-cpu transformers torch gradio pdfplumber



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import files


In [6]:


print("INSTRUCTIONS FOR DATASET UPLOAD")
print("---------------------------------")
print("• Each PDF MUST contain exactly 5 stories")
print("• Each story MUST have a clear title")
print("• Story format should be: Story 1, Story 2, ..., Story 5")
print("---------------------------------\n")

num_pdfs = int(input("How many PDF files do you want to upload? "))

pdf_files = []

for i in range(num_pdfs):
    print(f"\n Upload PDF {i+1} of {num_pdfs}")
    uploaded = files.upload()
    for filename in uploaded.keys():
        pdf_files.append(filename)

print("\n Upload complete!")
print("Uploaded PDF files:")
for pdf in pdf_files:
    print("-", pdf)

# Extract Stories from PDFs
import pdfplumber
import re

def extract_stories_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    text = re.sub(r'\s+', ' ', text).strip()
    matches = list(re.finditer(r'Story\s+\d+', text))

    stories = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        story_block = text[start:end].strip()

        first_dot = story_block.find('.')
        if first_dot != -1:
            title = story_block[:first_dot+1].strip()
        else:
            title = story_block[:50].strip()

        stories.append({
            "title": title,
            "content": story_block,
            "source": pdf_path
        })
    return stories

all_stories = []
for pdf in pdf_files:
    stories = extract_stories_from_pdf(pdf)
    all_stories.extend(stories)

print(f"\n Total stories extracted: {len(all_stories)}")
if all_stories:
    print("Preview Story 1 Title:", all_stories[0]['title'])

# Embeddings + FAISS
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

documents = [s['content'] for s in all_stories]

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(documents, convert_to_numpy=True)

d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

print("Embeddings & FAISS index created!")

# Load FLAN-T5 Model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

print("FLAN-T5 Model loaded!")

# RAG QA Function
def ask_akbar_birbal(question):
    q_emb = embedder.encode([question], convert_to_numpy=True)

    k = 1

    distances, indices = index.search(q_emb, k)
    if distances[0][0]>1.2:
        return "NO RELEVANT INFO FOUND"

    best_story_idx = indices[0][0]
    retrieved_story = documents[best_story_idx]

    prompt = f"""
Read the story below and answer the question.

Story:
{retrieved_story}

Instructions:
1. Explain what happens in the story related to the question.
2. Provide the Moral of the story if mentioned or implied.

Question: {question}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=20,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

#Gradio UI
import gradio as gr
def gradio_chat(question):
    if not question.strip():
        return "Please enter a valid question."
    return ask_akbar_birbal(question)

ui = gr.Interface(
    fn=gradio_chat,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question about the stories..."),
    outputs=gr.Textbox(label=" Birbal's Answer"),
    title="RAG Assistant",
    description="Ask questions from your uploaded PDFs using FAISS + FLAN-T5"
)

ui.launch(share=True)


INSTRUCTIONS FOR DATASET UPLOAD
---------------------------------
• Each PDF MUST contain exactly 5 stories
• Each story MUST have a clear title
• Story format should be: Story 1, Story 2, ..., Story 5
---------------------------------

How many PDF files do you want to upload? 2

 Upload PDF 1 of 2


Saving 4. Story_by_shivank.pdf to 4. Story_by_shivank (1).pdf

 Upload PDF 2 of 2


Saving 5. Story_by_shivank.pdf to 5. Story_by_shivank (1).pdf

 Upload complete!
Uploaded PDF files:
- 4. Story_by_shivank (1).pdf
- 5. Story_by_shivank (1).pdf

 Total stories extracted: 10
Preview Story 1 Title: Story 1.
Embeddings & FAISS index created!
FLAN-T5 Model loaded!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1de8eb0e7c06f22b0a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


