In [31]:
pip install streamlit pymupdf sentence-transformers scikit-learn transformers nltk



In [32]:
import streamlit as st
import fitz
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import pipeline


# Tunables referenced by name inside the app.py source that the following cells write out.
# NOTE(review): app.py is executed by a separate `streamlit run` process, so it needs
# its own copy of these names; presumably an earlier cell (not shown here) writes
# them into app.py -- confirm.
CHUNK_SIZE = 500  # character budget per chunk (chunk_text compares it against len())
CHUNK_OVERLAP = 50  # default for chunk_text's chunk_overlap parameter
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer(...)
QA_MODEL_NAME = "google/flan-t5-base"  # model handed to the text2text-generation pipeline
TOP_N_RELEVANT_CHUNKS = 3  # default top_n for find_relevant_chunks
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use CUDA when torch reports it available

In [33]:

# Append the PDF text-extraction helper to app.py.
# NOTE: the file is opened in append mode, so re-running this cell adds a
# duplicate definition to app.py.
with open("app.py", "a") as f:
    f.write("""

def extract_text_from_pdf(uploaded_file):
    '''Extract the text of every page of an uploaded PDF.

    Args:
        uploaded_file: File-like object holding the PDF bytes (for example the
            value returned by st.file_uploader).

    Returns:
        The concatenated text of all pages, or None when the PDF cannot be
        read (the error is reported through st.error).
    '''
    try:
        # getvalue() returns the whole buffer regardless of the current read
        # position, so a rerun that already consumed the stream still sees the
        # full file; plain file objects fall back to read().
        if hasattr(uploaded_file, "getvalue"):
            pdf_bytes = uploaded_file.getvalue()
        else:
            pdf_bytes = uploaded_file.read()
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return None
""")
print("Second chunk appended to app.py")

Second chunk appended to app.py


In [34]:

# Append the sentence-based chunker to app.py.
# NOTE: the file is opened in append mode, so re-running this cell adds a
# duplicate definition to app.py.
with open("app.py", "a") as f:
    f.write("""

from nltk.tokenize import sent_tokenize

def chunk_text(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    '''Split text into sentence-aligned chunks that overlap their predecessor.

    Sentences are packed greedily into chunks of at most chunk_size characters
    (a single sentence longer than chunk_size becomes its own chunk). Every
    chunk after the first is then prefixed with roughly the last chunk_overlap
    characters of the previous chunk, so context that straddles a chunk
    boundary is still retrievable.

    Args:
        text: The full document text.
        chunk_size: Maximum number of characters packed into one chunk before
            the overlap prefix is added.
        chunk_overlap: Number of trailing characters of the previous chunk to
            repeat at the start of the next one; 0 or less disables overlap.

    Returns:
        A list of chunk strings (empty when the text contains no sentences).
    '''
    chunks = []
    current_chunk = ""
    for sentence in sent_tokenize(text):
        if len(current_chunk) + len(sentence) + 1 <= chunk_size:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())

    if chunk_overlap <= 0:
        return chunks

    # The first chunk has no predecessor and is kept as is.
    overlapped_chunks = chunks[:1]
    for previous, chunk in zip(chunks, chunks[1:]):
        tail = previous[-chunk_overlap:]
        if len(previous) > chunk_overlap:
            # The slice most likely starts mid-word; drop that leading fragment
            # (keep the raw slice if it contains no space at all).
            tail = tail.partition(" ")[2] or tail
        overlapped_chunks.append(tail + " " + chunk)
    return overlapped_chunks
""")
print("Third chunk appended to app.py")

Third chunk appended to app.py


In [35]:

# Append the embedding helpers to app.py.
# NOTE: the file is opened in append mode, so re-running this cell adds
# duplicate definitions to app.py.
with open("app.py", "a") as f:
    f.write("""

@st.cache_resource
def load_embedding_model():
    '''Build the SentenceTransformer for EMBEDDING_MODEL_NAME on DEVICE.

    Wrapped in st.cache_resource so the model is constructed once and the same
    instance is reused on later calls.
    '''
    return SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)

# st.cache_data hashes every argument to build its cache key and cannot hash a
# SentenceTransformer instance. Key the model by object identity instead;
# load_embedding_model() returns the same cached instance, so the key is stable.
@st.cache_data(hash_funcs={SentenceTransformer: id})
def embed_chunks(chunks, model):
    '''Encode the text chunks with the given model.

    Args:
        chunks: List of chunk strings to embed.
        model: Object exposing encode(list_of_str), e.g. the model returned by
            load_embedding_model().

    Returns:
        Whatever model.encode(chunks) returns (one embedding per chunk).
    '''
    return model.encode(chunks)
""")
print("Fourth chunk appended to app.py")

Fourth chunk appended to app.py


In [36]:

# Append the retrieval helper to app.py.
# NOTE: the file is opened in append mode, so re-running this cell adds a
# duplicate definition to app.py.
with open("app.py", "a") as f:
    f.write("""

def find_relevant_chunks(question_embedding, chunk_embeddings, chunks, top_n=TOP_N_RELEVANT_CHUNKS):
    '''Return the chunks whose embeddings are most similar to the question.

    Args:
        question_embedding: 1-D embedding vector of the question.
        chunk_embeddings: 2-D array with one embedding row per chunk.
        chunks: Chunk strings, in the same order as chunk_embeddings.
        top_n: Maximum number of chunks to return.

    Returns:
        Up to top_n chunk strings, ordered from most to least similar by
        cosine similarity; an empty list when there are no chunks.
    '''
    # cosine_similarity rejects an empty matrix; nothing to rank means no matches.
    if len(chunks) == 0:
        return []
    similarity_scores = cosine_similarity([question_embedding], chunk_embeddings)[0]
    # argsort is ascending, so reverse it to put the best match first.
    best_indices = similarity_scores.argsort()[::-1][:top_n]
    return [chunks[i] for i in best_indices]
""")
print("Fifth chunk appended to app.py")

Fifth chunk appended to app.py


In [37]:

# Append the question-answering helpers to app.py (append mode: re-running this
# cell writes the definitions a second time).
#
# The source written below defines:
#   * load_qa_pipeline() -- builds a "text2text-generation" pipeline for
#     QA_MODEL_NAME on DEVICE, wrapped in st.cache_resource.
#   * query_llm(question, context, qa_pipeline) -- formats a
#     "Question: ... Context: ... Answer:" prompt, runs the pipeline with
#     max_length=200 and returns the stripped generated text; on any exception
#     it reports the error via st.error and returns a fixed apology string.
with open("app.py", "a") as f:
    f.write("""

@st.cache_resource
def load_qa_pipeline():
    qa_pipeline = pipeline("text2text-generation", model=QA_MODEL_NAME, device=DEVICE)
    return qa_pipeline

def query_llm(question, context, qa_pipeline):
    prompt = f"Question: {question} Context: {context} Answer:"
    try:
        response = qa_pipeline(prompt, max_length=200, num_return_sequences=1)[0]['generated_text']
        return response.strip()
    except Exception as e:
        st.error(f"Error during LLM query: {e}")
        return "Sorry, I couldn't generate an answer."
""")
# Confirmation message for the notebook output.
print("Sixth chunk appended to app.py")

Sixth chunk appended to app.py


In [38]:

# Append the Streamlit UI to app.py (append mode: re-running this cell writes
# the UI code a second time).
#
# Only plain Python may go into the file. The `!streamlit run ...` shell escape
# is IPython syntax and would make app.py fail with a SyntaxError when Streamlit
# executes it, and the public-IP lookup would hit the network on every rerun.
# Both already have their own notebook cells, so they are not written here.
with open("app.py", "a") as f:
    f.write("""
# --- 6. Frontend (Streamlit) ---
st.title("Chat With Your PDF")
st.subheader("Upload a PDF and ask questions about its content.")

uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")

if uploaded_file is not None:
    with st.spinner("Processing PDF..."):
        text = extract_text_from_pdf(uploaded_file)
        if text:
            # Build the retrieval index for this document.
            chunks = chunk_text(text)
            embedding_model = load_embedding_model()
            chunk_embeddings = embed_chunks(chunks, embedding_model)
            qa_pipeline = load_qa_pipeline()
            st.success("PDF processed. You can now ask questions!")

            # Seed the chat history once per session.
            if "messages" not in st.session_state:
                st.session_state["messages"] = [{"role": "assistant", "content": "Ask me anything about the PDF!"}]

            # Replay the conversation so far (Streamlit reruns the script on every interaction).
            for msg in st.session_state["messages"]:
                st.chat_message(msg["role"]).write(msg["content"])

            if prompt := st.chat_input("Your question"):
                st.session_state["messages"].append({"role": "user", "content": prompt})
                st.chat_message("user").write(prompt)

                with st.spinner("Generating answer..."):
                    # Retrieve the most similar chunks and hand them to the LLM as context.
                    question_embedding = embedding_model.encode([prompt])[0]
                    relevant_chunks = find_relevant_chunks(question_embedding, chunk_embeddings, chunks)
                    context = "\\n\\n".join(relevant_chunks)
                    answer = query_llm(prompt, context, qa_pipeline)

                    st.session_state["messages"].append({"role": "assistant", "content": answer})
                    st.chat_message("assistant").write(answer)
        else:
            st.error("Failed to process the PDF. Please try again with a valid PDF file.")
""")
print("Seventh chunk appended to app.py. app.py is now complete.")

Seventh chunk appended to app.py. app.py is now complete.


In [39]:
import requests

# Look up this machine's public IP; the message below labels it as the tunnel password.
# A timeout keeps the cell from hanging forever if the service is unreachable,
# and raise_for_status() prevents an HTTP error page from being printed as the IP.
ip_response = requests.get('https://api.ipify.org', timeout=10)
ip_response.raise_for_status()
ip_address = ip_response.text.strip()
print(f"Your Colab's public IP address (Tunnel Password): {ip_address}")


Your Colab's public IP address (Tunnel Password): 34.121.127.117


In [40]:
!pip install streamlit pymupdf sentence-transformers scikit-learn torch transformers nltk
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
# Start the Streamlit server for app.py in the background (`&`), then run
# localtunnel in the foreground to expose port 8501 through a public URL.
# NOTE(review): the tunnel page presumably asks for the public IP printed by the
# earlier cell as its password -- confirm.
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.121.127.117:8501[0m
[0m
[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://chatty-towns-sing.loca.lt
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
2025-04-16 11:22:42.896527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:0