<a href="https://colab.research.google.com/github/suchizz/doc_analyzer_frontend/blob/main/chatbot_theme_identifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# ✅ Install required packages
!pip install -q fastapi uvicorn nest_asyncio pyngrok
!pip install -q langchain sentence-transformers faiss-cpu PyMuPDF transformers

In [13]:
!pip install -q langchain-community


In [14]:
# ✅ Imports
import os
import fitz  # PyMuPDF
import nest_asyncio
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pyngrok import ngrok
import uvicorn

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

In [15]:
# ✅ Initialize summarization model
# Load the tokenizer and the conditional-generation model for the
# "google/pegasus-xsum" checkpoint via `from_pretrained`.
# NOTE(review): none of the cells visible in this notebook reference `tokenizer`
# or `summarizer` afterwards — confirm they are still needed before paying the
# cost of loading them.
# NOTE(review): the "newly initialized embed_positions" warning printed by this
# cell is presumably benign for Pegasus — TODO confirm.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
summarizer = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import nltk
import tempfile
import os
# Sentence-tokenizer data used by `sent_tokenize`. Older NLTK releases load
# "punkt"; newer releases load "punkt_tab" instead, so fetch both. Requesting a
# resource that the installed NLTK does not know about only prints a message.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Application instance. The `/analyze` route is registered on it with `@app.post`
# and it is served by `uvicorn.run(app, ...)`, so this cell must run before both.
app = FastAPI()

In [17]:
# Name of the sentence-embedding model used to index and query the sentences.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Number of nearest sentences returned for a question.
_TOP_K = 6
# Sentences must contain more than this many whitespace-separated words to be indexed.
_MIN_SENTENCE_WORDS = 5
# `block_type` value that PyMuPDF reports for text blocks (image blocks use 1).
_TEXT_BLOCK_TYPE = 0

# Embedding models keyed by model name, so the weights are loaded once per kernel
# instead of once per request.
_embedder_cache = {}


def _get_embedder(model_name=_EMBEDDING_MODEL_NAME):
    """Return a shared `HuggingFaceEmbeddings` for `model_name`, creating it on first use."""
    if model_name not in _embedder_cache:
        _embedder_cache[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    return _embedder_cache[model_name]


def _extract_sentence_documents(filename, pdf_bytes):
    """Split an in-memory PDF into one `Document` per sufficiently long sentence.

    Args:
        filename: Name recorded as `doc_name` in each document's metadata.
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list of `Document` objects whose metadata holds the 1-based page,
        paragraph (block) and sentence position of the sentence.
    """
    documents = []
    # Opening from memory avoids writing a client-named file to disk; the
    # context manager closes the PDF even if extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page_num, page in enumerate(pdf, start=1):
            for para_num, block in enumerate(page.get_text("blocks"), start=1):
                # Block layout: (x0, y0, x1, y1, text, block_no, block_type).
                text, block_type = block[4], block[6]
                if block_type != _TEXT_BLOCK_TYPE:
                    # Image blocks carry a placeholder description, not document text.
                    # They still consume a paragraph number, as before.
                    continue
                for sent_num, sentence in enumerate(sent_tokenize(text.strip()), start=1):
                    if len(sentence.split()) > _MIN_SENTENCE_WORDS:
                        documents.append(Document(
                            page_content=sentence,
                            metadata={
                                "doc_name": filename,
                                "page": page_num,
                                "paragraph": para_num,
                                "sentence": sent_num,
                            },
                        ))
    return documents


def _classify_theme(text):
    """Map a sentence to a theme label using simple keyword rules."""
    lowered = text.lower()
    if "graduation" in lowered:
        return "Education Impact"
    if "job" in lowered:
        return "Career Loss"
    return "General Insight"


def _format_theme_summary(theme_map):
    """Render the theme -> [(doc_name, sentence), ...] mapping as chat-style text."""
    theme_output = "Synthesized (theme) answer (chat format):\n"
    for idx, (theme, items) in enumerate(theme_map.items(), 1):
        theme_output += f"\nTheme {idx} – {theme}:\n"
        seen = set()
        for doc_id, content in items:
            # List each (document, sentence) pair only once per theme.
            if (doc_id, content) not in seen:
                theme_output += f"{doc_id}: {content[:120]}...\n"
                seen.add((doc_id, content))
    return theme_output


@app.post("/analyze")
async def analyze(files: list[UploadFile] = File(...), question: str = Form(...)):
    """Answer `question` from the uploaded PDFs with sentence-level citations.

    Every uploaded PDF is split into sentences, the sentences are embedded and
    indexed with FAISS, and the sentences most similar to the question are
    returned together with a keyword-based grouping into themes.

    Args:
        files: Uploaded PDF files.
        question: The question to search the documents for.

    Returns:
        A `JSONResponse` with the keys `question`, `direct_answers`,
        `documents` and `theme_summary`. When no sentence could be extracted
        from the uploads, the lists are empty instead of the request failing.
    """
    all_chunks = []
    for file in files:
        # The client-supplied name is only used as a label, never as a path.
        filename = file.filename or "uploaded.pdf"
        all_chunks.extend(_extract_sentence_documents(filename, await file.read()))

    results = []
    if all_chunks:
        # Building an index from an empty document list fails, so only search
        # when at least one sentence was extracted.
        db = FAISS.from_documents(all_chunks, _get_embedder())
        results = db.similarity_search(question, k=_TOP_K)

    answer_snippets = []
    doc_table = []
    theme_map = {}

    for doc in results:
        meta = doc.metadata
        citation = f"{meta['doc_name']} – Page {meta['page']}, Para {meta['paragraph']}, Sent {meta['sentence']}"
        link = f"[{citation}](#jump-to-{meta['doc_name']}-{meta['page']})"
        answer_snippets.append(f"🔹 {doc.page_content}  \n📌 {link}")

        doc_table.append({
            "document": meta['doc_name'],
            "page": meta['page'],
            "paragraph": meta['paragraph'],
            "sentence": meta['sentence'],
            "answer": doc.page_content
        })

        # Basic rule-based theme clustering
        theme = _classify_theme(doc.page_content)
        theme_map.setdefault(theme, []).append((meta['doc_name'], doc.page_content))

    return JSONResponse({
        "question": question,
        "direct_answers": answer_snippets,
        "documents": doc_table,
        "theme_summary": _format_theme_summary(theme_map)
    })


In [21]:
# %%
# ✅ Launch ngrok tunnel
# The authtoken is read from the NGROK_AUTHTOKEN environment variable so that it
# is never stored in the notebook. Set it before running this cell, for example
# from a secrets store or an interactive prompt.
# You can find your authtoken at https://dashboard.ngrok.com/get-started/your-authtoken
# NOTE: the token that used to be hard-coded in this cell has been published
# with the notebook and should be revoked in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_authtoken:
    print("NGROK_AUTHTOKEN is not set; the ngrok tunnel was not started.")
    print("Please ensure you have set your ngrok authtoken correctly.")
else:
    try:
        ngrok.set_auth_token(ngrok_authtoken)
        # Tunnel to the port the FastAPI server listens on.
        public_url = ngrok.connect(8000)
        print(f"🔗 Your backend is live at: {public_url}")
    except Exception as e:
        print(f"An error occurred while starting ngrok: {e}")
        print("Please ensure you have set your ngrok authtoken correctly.")

🔗 Your backend is live at: NgrokTunnel: "https://6dd3-34-90-102-224.ngrok-free.app" -> "http://localhost:8000"


In [None]:
# ✅ Start FastAPI server
# Apply nest_asyncio patch to allow running asyncio in a Jupyter notebook
nest_asyncio.apply()
# Serve `app` on port 8000 — the same port that is passed to `ngrok.connect`.
# NOTE(review): this call appears to block the cell until the kernel is interrupted.
uvicorn.run(app, host="0.0.0.0", port=8000)

INFO:     Started server process [179]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-15' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 580, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    

INFO:     35.203.151.101:0 - "POST /analyze HTTP/1.1" 200 OK
INFO:     35.203.151.101:0 - "POST /analyze HTTP/1.1" 200 OK
