In [None]:
# ✅ Step-by-Step Jupyter Notebook Setup for Streamlit-based PDF Chatbot
# 📁 Folder: C:/Users/Admin/Desktop/chatbot_theme_identifier

import os

# Root of the generated project; every path written by this cell is built from it.
base_path = r"C:/Users/Admin/Desktop/chatbot_theme_identifier"
folders = [
    "backend/app/api", "backend/app/core", "backend/app/models",
    "backend/app/services", "backend/data", "docs", "tests", "demo"
]

for folder in folders:
    os.makedirs(os.path.join(base_path, folder), exist_ok=True)

# Placeholder files: main.py, config.py, requirements.txt, Dockerfile, README.md
base_files = [
    "backend/app/main.py",
    "backend/app/config.py",
    "backend/requirements.txt",
    "backend/Dockerfile",
    "README.md",
]
for relative_path in base_files:
    # Append mode creates a missing file but, unlike 'w', never truncates one
    # that already exists, so re-running this cell cannot wipe edited files.
    with open(os.path.join(base_path, relative_path), "a"):
        pass

print("✅ Folder structure and base files created.")

# 🔽 Step 2: Create requirements.txt
# One package per line. numpy is listed explicitly because the generated
# utils.py imports it directly instead of relying on it arriving transitively.
req_text = """streamlit
PyMuPDF
sentence-transformers
faiss-cpu
numpy
pytesseract
Pillow
"""
with open(os.path.join(base_path, "backend/requirements.txt"), "w", encoding="utf-8") as f:
    f.write(req_text)

# 🔽 Step 3: Create utils.py inside services
# The module source lives in a triple-single-quoted template, so the generated
# code uses double-quoted docstrings and contains no backslash escapes.
utils_code = '''
import os
import json
import fitz  # PyMuPDF
import pytesseract
import faiss
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")


def extract_text_from_pdf(path):
    """Return the text of the PDF at path, falling back to OCR if PyMuPDF fails."""
    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception:
        # Best-effort fallback: treat the file as an image and OCR it.
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        with Image.open(path) as image:
            return pytesseract.image_to_string(image)


def load_docs(folder):
    """Extract text from every PDF in folder; returns (texts, files) in matching order."""
    files, texts = [], []
    # Sorted so document order (and therefore the index ids) is reproducible.
    for file in sorted(os.listdir(folder)):
        # Case-insensitive so files named *.PDF are picked up as well.
        if file.lower().endswith(".pdf"):
            text = extract_text_from_pdf(os.path.join(folder, file))
            files.append(file)
            texts.append(text)
    return texts, files


def save_embeddings(texts, files, emb_path, meta_path):
    """Embed texts, write the FAISS index and JSON metadata, return (index, meta).

    Raises ValueError when texts is empty, because an index cannot be sized
    without at least one embedding.
    """
    if not texts:
        raise ValueError("No documents to embed; add PDF files to the data folder first.")
    # FAISS expects a 2-D float32 matrix.
    embeddings = np.asarray(model.encode(texts), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, emb_path)
    meta = {"files": files, "texts": texts}
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f)
    return index, meta


def search_answer(query, index, meta, top_k=3):
    """Return up to top_k (filename, first 500 characters) pairs closest to query."""
    # Never ask for more neighbours than the index actually holds.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []
    query_vec = np.asarray(model.encode([query]), dtype="float32")
    _, idxs = index.search(query_vec, k)
    # FAISS pads missing neighbours with -1, which would otherwise silently
    # select the last document through negative list indexing.
    return [(meta["files"][i], meta["texts"][i][:500]) for i in idxs[0] if i >= 0]
'''
with open(os.path.join(base_path, "backend/app/services/utils.py"), "w", encoding="utf-8") as f:
    f.write(utils_code)

# 🔽 Step 4: Create streamlit app file
# The template is triple-single-quoted, so the generated code avoids that
# delimiter and contains no backslash escapes.
app_code = '''
import os

import streamlit as st

# app.py sits in the project root, so the helpers are reached through the
# backend package; a top-level "app" import would resolve to this script itself.
from backend.app.services.utils import load_docs, save_embeddings, search_answer

st.set_page_config(page_title="PDF Chatbot", layout="centered")
st.title("📄 Chat with your Documents")

# Anchor data paths to this file so the app works from any working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOC_PATH = os.path.join(BASE_DIR, "backend", "data")
INDEX_PATH = os.path.join(DOC_PATH, "index.faiss")
META_PATH = os.path.join(DOC_PATH, "meta.json")

os.makedirs(DOC_PATH, exist_ok=True)

# Build the index once per browser session; reruns reuse the session copy.
if "index" not in st.session_state:
    with st.spinner("Embedding PDFs..."):
        texts, files = load_docs(DOC_PATH)
        if texts:
            index, meta = save_embeddings(texts, files, INDEX_PATH, META_PATH)
            st.session_state.index = index
            st.session_state.meta = meta

# Nothing was indexed (empty data folder): explain instead of crashing.
if "index" not in st.session_state:
    st.warning("No PDF files found in backend/data. Add some PDFs and reload the page.")
    st.stop()

query = st.text_input("Ask a question")
if query:
    results = search_answer(query, st.session_state.index, st.session_state.meta)
    for filename, snippet in results:
        st.write(f"📘 **{filename}**")
        st.write(snippet)
        st.markdown("---")
'''
with open(os.path.join(base_path, "app.py"), "w", encoding="utf-8") as f:
    f.write(app_code)

print("✅ All files generated. Now ready for Streamlit run and Hugging Face deploy.")


In [4]:
# Mark backend, backend/app and backend/app/services as importable packages.
# Paths are anchored to base_path (defined in the setup cell) so the files land
# in the project regardless of the kernel's current working directory.
for package_dir in ("backend", "backend/app", "backend/app/services"):
    # Append mode creates the file if missing without truncating an existing one.
    with open(os.path.join(base_path, package_dir, "__init__.py"), "a"):
        pass

In [1]:
pip install -r backend/requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install pymupdf



In [7]:
pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install nltk





In [13]:
import nltk

# Download the NLTK "punkt" package.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
pip install pytesseract

Note: you may need to restart the kernel to use updated packages.


In [1]:
cd C:\Users\Admin\Desktop\chatbot_theme_identifier

C:\Users\Admin\Desktop\chatbot_theme_identifier


In [2]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)

In [3]:
def extract_text_from_pdf(path):
    """Extract text from the PDF at ``path``, dropping "United Nations" boilerplate.

    Every page's text has the literal "United Nations" removed, is stripped of
    surrounding whitespace, and is terminated by a newline.

    Args:
        path: Location of the file to read.

    Returns:
        The concatenated page text, or the OCR result when reading the file
        through ``fitz`` raises an exception.

    NOTE(review): relies on ``fitz``, ``pytesseract`` and ``Image`` already being
    defined in the kernel namespace; no notebook cell shown here imports them.
    """
    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(path) as doc:
            pages = [
                page.get_text().replace("United Nations", "").strip() + "\n"
                for page in doc
            ]
        return "".join(pages)
    except Exception:
        # Best-effort fallback: treat the file as an image and OCR it.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # propagate instead of being turned into an OCR attempt.
        with Image.open(path) as image:
            return pytesseract.image_to_string(image)
