<a href="https://colab.research.google.com/github/ramtango007/Camera/blob/main/Snippets_Importing_libraries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing a library that is not in Colaboratory

To import a library that's not in Colaboratory by default, you can use `!pip install` or `!apt-get install`.

In [10]:
# Shell escape (`!`): runs pip to install the matplotlib-venn package.
!pip install matplotlib-venn



In [11]:
!apt-get -qq install -y libfluidsynth1

E: Package 'libfluidsynth1' has no installation candidate


# Install 7zip reader [libarchive](https://pypi.python.org/pypi/libarchive)

In [12]:
# https://pypi.python.org/pypi/libarchive
# Install the libarchive-dev system package with apt; only if that succeeds
# (`&&`) install/upgrade the `libarchive` Python package, then import it to
# check that the installation is usable.
!apt-get -qq install -y libarchive-dev && pip install -U libarchive
import libarchive



# Install GraphViz & [PyDot](https://pypi.python.org/pypi/pydot)

In [13]:
# https://pypi.python.org/pypi/pydot
# Install the graphviz system package with apt; only if that succeeds (`&&`)
# install the `pydot` Python package, then import it to check the install.
!apt-get -qq install -y graphviz && pip install pydot
import pydot



# Install [cartopy](http://scitools.org.uk/cartopy/docs/latest/)

In [14]:
# Install cartopy with pip, then import it to check that the install is usable.
!pip install cartopy
import cartopy



In [18]:
"""
PDF Q&A Chatbot with Contextual Conversation — Streamlit + FAISS + Sentence-Transformers + LLM

Quick start:
1) Install dependencies:
   pip install streamlit sentence-transformers faiss-cpu pypdf numpy requests openai

2) Run:
   streamlit run dashboard.py

Features:
- Upload PDFs
- Build / update index
- Conversational chat interface with memory (last 5 turns)
- Answers from LLM (Ollama or OpenAI) with sources
"""

import os
import io
import pickle
from pathlib import Path
import numpy as np
import streamlit as st
import requests
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

try:
    import faiss
except Exception:
    faiss = None

# Config
# Directory and file paths where the FAISS index and the pickled chunk
# metadata are persisted between runs.
STORE_DIR = Path("rag_store")
INDEX_PATH = STORE_DIR / "faiss.index"
META_PATH = STORE_DIR / "meta.pkl"
# Default chunk length and overlap (in characters) used by chunk_text.
CHUNK_SIZE = 900
CHUNK_OVERLAP = 150
# Model name passed to SentenceTransformer in load_embedder.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Default model name sent to the Ollama chat endpoint by call_ollama.
DEFAULT_OLLAMA_MODEL = "llama3.1"
# The stored chat history is sliced to the last MAX_HISTORY * 2 messages.
MAX_HISTORY = 5  # limit conversation history

# Utils
@st.cache_resource(show_spinner=False)
def load_embedder():
    """Create the SentenceTransformer named by EMBED_MODEL_NAME.

    The function is wrapped in ``st.cache_resource`` with the spinner disabled.
    """
    model = SentenceTransformer(EMBED_MODEL_NAME)
    return model

def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping, fixed-width character windows.

    Whitespace runs (including newlines) are collapsed to single spaces first,
    then windows of ``chunk_size`` characters are taken, each starting
    ``chunk_size - overlap`` characters after the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks. It is
            clamped to ``[0, chunk_size - 1]`` so the window always advances.

    Returns:
        A list of chunk strings, empty when ``text`` has no non-whitespace
        content.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # An overlap >= chunk_size would never advance the window (the previous
    # implementation looped forever in that case); clamp it instead.
    overlap = max(0, min(overlap, chunk_size - 1))
    step = chunk_size - overlap

    text = " ".join(text.split())
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop as soon as a window reaches the end of the text. Continuing
        # would emit a tail chunk that is entirely contained in this one.
        if end >= len(text):
            break
    return chunks

def read_pdf(file_bytes):
    """Extract text from every page of a PDF given as raw bytes.

    Returns a list of ``(page_number, text)`` tuples with page numbers starting
    at 1. A page whose extraction raises, or yields nothing, contributes an
    empty string so that page numbering stays aligned.
    """
    document = PdfReader(io.BytesIO(file_bytes))
    extracted = []
    page_no = 0
    for page in document.pages:
        page_no += 1
        try:
            content = page.extract_text() or ""
        except Exception:
            # Best effort: keep the page slot but with no text.
            content = ""
        extracted.append((page_no, content))
    return extracted

def build_docs_from_pdfs(uploaded_files):
    """Turn uploaded PDF files into a flat list of chunk records.

    Each file is read fully, split into pages with ``read_pdf`` and each
    non-blank page is split with ``chunk_text``. Every record has the form
    ``{"text": chunk, "metadata": {"source": file name, "page": page number}}``.
    """
    collected = []
    for upload in uploaded_files:
        source_name = upload.name
        raw = upload.read()
        for page_no, content in read_pdf(raw):
            if content.strip():
                collected.extend(
                    {"text": piece, "metadata": {"source": source_name, "page": page_no}}
                    for piece in chunk_text(content)
                )
    return collected

def ensure_faiss_ready():
    """Show an error and stop the Streamlit script if the faiss import failed."""
    if faiss is not None:
        return
    st.error("faiss-cpu not installed. Run: pip install faiss-cpu")
    st.stop()

def load_or_create_index(embedder, dim):
    """Return ``(index, meta)`` from disk, or a fresh empty pair.

    The store directory is created if missing. When both INDEX_PATH and
    META_PATH exist, the FAISS index and the pickled metadata list are loaded.
    Otherwise a new ``faiss.IndexFlatIP(dim)`` and an empty list are returned.
    ``embedder`` is accepted but not used here.
    """
    STORE_DIR.mkdir(exist_ok=True)
    have_saved_store = INDEX_PATH.exists() and META_PATH.exists()
    if not have_saved_store:
        return faiss.IndexFlatIP(dim), []
    saved_index = faiss.read_index(str(INDEX_PATH))
    with open(META_PATH, "rb") as fh:
        saved_meta = pickle.load(fh)
    return saved_index, saved_meta

def normalize(vecs):
    """Scale each row of a 2-D array to (approximately) unit L2 length.

    A small epsilon is added to every row norm so all-zero rows divide safely
    and stay zero.
    """
    row_lengths = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / (row_lengths + 1e-10)

def upsert_docs(index, meta, docs, embedder):
    """Embed ``docs``, append them to the index and metadata, and persist both.

    With an empty ``docs`` nothing is embedded or written. Otherwise the chunk
    texts are encoded, converted to float32, normalized and added to ``index``;
    ``meta`` is extended in place; then the index is written to INDEX_PATH and
    the metadata pickled to META_PATH.

    Returns the same ``(index, meta)`` objects that were passed in.
    """
    if docs:
        embeddings = embedder.encode(
            [entry["text"] for entry in docs],
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        index.add(normalize(embeddings.astype("float32")))
        meta.extend(docs)
        faiss.write_index(index, str(INDEX_PATH))
        with open(META_PATH, "wb") as fh:
            pickle.dump(meta, fh)
    return index, meta

def search(index, embedder, query, meta, k=5):
    """Return up to ``k`` metadata records that best match ``query``.

    The query is encoded, cast to float32 and normalized before being passed
    to ``index.search``. Result slots with id ``-1`` are skipped. Each hit is a
    shallow copy of the corresponding ``meta`` entry with an added float
    ``"score"`` key.
    """
    query_vec = normalize(
        embedder.encode([query], convert_to_numpy=True).astype("float32")
    )
    scores, ids = index.search(query_vec, k)
    results = []
    for position, similarity in zip(ids[0], scores[0]):
        if position != -1:
            match = meta[position].copy()
            match["score"] = float(similarity)
            results.append(match)
    return results

# LLM backends
def call_ollama(messages, model=DEFAULT_OLLAMA_MODEL):
    """Send a non-streaming chat request to the Ollama server on localhost.

    Returns the ``message.content`` string from the JSON reply (empty string
    if absent). Any exception is converted into an ``"[Ollama error] ..."``
    string rather than raised.
    """
    try:
        reply = requests.post(
            "http://localhost:11434/api/chat",
            json={"model": model, "messages": messages, "stream": False},
            timeout=120,
        )
        reply.raise_for_status()
        body = reply.json()
        return body.get("message", {}).get("content", "")
    except Exception as e:
        return f"[Ollama error] {e}"

def call_openai(messages, model="gpt-4o-mini"):
    """Ask an OpenAI chat model to answer from the supplied messages.

    A fixed system instruction ("answer from context only") is prepended to
    ``messages``.

    Args:
        messages: List of chat message dicts (``role``/``content``). An
            app-specific ``"sources"`` key, which this script stores on
            assistant turns, is dropped before sending.
        model: OpenAI chat model name.

    Returns:
        The stripped answer text (empty string if the reply has no content).
        On any failure — missing ``openai`` package, missing OPENAI_API_KEY,
        API error — an ``"[OpenAI error] ..."`` string is returned instead of
        raising.
    """
    try:
        from openai import OpenAI
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            return "[OpenAI error] No API key set"
        client = OpenAI(api_key=api_key)
        system_message = {
            "role": "system",
            "content": "Answer using the context only. If not in context, say you don't know.",
        }
        # History entries kept for display carry a "sources" list, which is not
        # a chat message field; forwarding it makes the API reject the request.
        chat_messages = [
            {key: value for key, value in message.items() if key != "sources"}
            for message in messages
        ]
        resp = client.chat.completions.create(
            model=model,
            messages=[system_message] + chat_messages,
        )
        # ``content`` may be None (e.g. a refusal); treat that as empty text.
        content = resp.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        return f"[OpenAI error] {e}"

def build_prompt(question, hits):
    """Format retrieved chunks and the question into a single prompt string.

    Each hit becomes a ``[Source: ... | Page: ...]`` header followed by its
    text; a missing page is shown as ``?``. Blocks are separated by blank
    lines and followed by the question and an ``Answer:`` cue.
    """
    def render(hit):
        info = hit["metadata"]
        return f"[Source: {info['source']} | Page: {info.get('page', '?')}]\n{hit['text']}"

    context = "\n\n".join(render(hit) for hit in hits)
    return f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

# UI
# Page chrome: browser-tab title/icon, wide layout, and the visible heading.
st.set_page_config(page_title="PDF Q&A Chatbot", page_icon="💬", layout="wide")
st.title("💬 PDF Q&A Chatbot")

# Sidebar controls: which LLM backend answers, how many chunks are retrieved
# per question (passed as `k` to search), and whether the on-disk store is
# wiped before the next indexing run.
with st.sidebar:
    st.header("Settings")
    backend = st.selectbox("Backend", ["Ollama (local)", "OpenAI API"])
    top_k = st.slider("Top-k chunks", 2, 10, 5)
    rebuild = st.checkbox("Rebuild index", False)

st.subheader("📂 Upload Files")
uploaded_files = st.file_uploader("Choose PDF files", type=["pdf"], accept_multiple_files=True)

# Conversation history is kept in session state; create it on first use.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Two columns: the indexing button on the left, the question box on the right.
col1, col2 = st.columns([1, 2])
with col1:
    do_index = st.button("Build / Update Index")

with col2:
    user_input = st.text_input("Type your question and press Enter:")

# Embedding model shared by the indexing and question-answering steps below.
embedder = load_embedder()

# Indexing: runs only when the button was clicked and files are uploaded.
if do_index and uploaded_files:
    ensure_faiss_ready()
    if rebuild and STORE_DIR.exists():
        # "Rebuild index" ticked: remove everything in the store first.
        for stale_path in STORE_DIR.glob("*"):
            stale_path.unlink()
    embedding_dim = embedder.get_sentence_embedding_dimension()
    index, meta = load_or_create_index(embedder, embedding_dim)
    docs = build_docs_from_pdfs(uploaded_files)
    if not docs:
        st.warning("No text extracted.")
    else:
        index, meta = upsert_docs(index, meta, docs, embedder)
        st.success(f"Indexed {len(docs)} chunks")

# Question answering.
# Streamlit re-executes this script on every widget interaction while the text
# box keeps its value, so an unguarded `if user_input:` would re-ask the same
# question (and append duplicate turns) each time a slider, checkbox or button
# is touched. Remember the last processed question and skip it on reruns.
if user_input and user_input != st.session_state.get("last_question"):
    ensure_faiss_ready()
    if not (INDEX_PATH.exists() and META_PATH.exists()):
        # Not recorded as processed, so the question is answered once an
        # index has been built.
        st.warning("No index found. Upload PDFs and build index first.")
    else:
        st.session_state["last_question"] = user_input
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE: pickle.load can execute arbitrary code; this is acceptable only
        # because META_PATH is written by this app itself.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        hits = search(index, embedder, user_input, meta, top_k)

        # Prior turns for the LLM: limited to the last MAX_HISTORY exchanges
        # and reduced to role/content, because the stored "sources" lists are
        # display data, not chat message fields.
        history = [
            {"role": m["role"], "content": m["content"]}
            for m in st.session_state["messages"][-MAX_HISTORY * 2:]
        ]

        # Always record the user's turn so the transcript shows the question
        # even when nothing relevant was retrieved.
        st.session_state["messages"].append({"role": "user", "content": user_input})

        if hits:
            # The prompt already contains the question, so it is sent once
            # (inside the prompt) instead of raw and again in the prompt.
            llm_messages = history + [
                {"role": "user", "content": build_prompt(user_input, hits)}
            ]
            if backend == "Ollama (local)":
                answer = call_ollama(llm_messages)
            else:
                answer = call_openai(llm_messages)
            st.session_state["messages"].append({"role": "assistant", "content": answer, "sources": hits})
        else:
            st.session_state["messages"].append({"role": "assistant", "content": "I couldn't find relevant info.", "sources": []})

        # Keep only the last MAX_HISTORY question/answer pairs.
        st.session_state["messages"] = st.session_state["messages"][-MAX_HISTORY * 2:]

# Display conversation
# Render each stored turn, followed by a divider; bot turns with sources get
# an expander listing file, page and the first 300 characters of each chunk.
for msg in st.session_state["messages"]:
    from_user = msg["role"] == "user"
    if from_user:
        st.markdown(f"**You:** {msg['content']}")
    else:
        st.markdown(f"**Bot:** {msg['content']}")
        cited = msg.get("sources")
        if cited:
            with st.expander("Sources"):
                for hit in cited:
                    info = hit["metadata"]
                    st.write(f"{info['source']} (p.{info.get('page','?')}) :: {hit['text'][:300]}...")
    st.divider()


ModuleNotFoundError: No module named 'pypdf'

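The `streamlit` library has been installed. Note that executing the code cell below inside the notebook only imports and defines the PDF Q&A Chatbot; to actually start the application, save that code to a file (e.g. `dashboard.py`) and launch it with `streamlit run dashboard.py`. The cell also requires `pypdf` (`pip install pypdf`).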

In [17]:
# Streamlit is imported as `st` by the PDF Q&A chatbot cell.
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.48.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.48.1


In [22]:
"""
PDF Q&A Chatbot with Contextual Conversation — Streamlit + FAISS + Sentence-Transformers + LLM

Quick start:
1) Install dependencies:
   pip install streamlit sentence-transformers faiss-cpu pypdf numpy requests openai

2) Run:
   streamlit run dashboard.py

Features:
- Upload PDFs
- Build / update index
- Conversational chat interface with memory (last 5 turns)
- Answers from LLM (Ollama or OpenAI) with sources
"""

import os
import io
import pickle
from pathlib import Path
import numpy as np
import streamlit as st
import requests
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

try:
    import faiss
except Exception:
    faiss = None

# Config
# Directory and file paths where the FAISS index and the pickled chunk
# metadata are persisted between runs.
STORE_DIR = Path("rag_store")
INDEX_PATH = STORE_DIR / "faiss.index"
META_PATH = STORE_DIR / "meta.pkl"
# Default chunk length and overlap (in characters) used by chunk_text.
CHUNK_SIZE = 900
CHUNK_OVERLAP = 150
# Model name passed to SentenceTransformer in load_embedder.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Default model name sent to the Ollama chat endpoint by call_ollama.
DEFAULT_OLLAMA_MODEL = "llama3.1"
# The stored chat history is sliced to the last MAX_HISTORY * 2 messages.
MAX_HISTORY = 5  # limit conversation history

# Utils
@st.cache_resource(show_spinner=False)
def load_embedder():
    """Create the SentenceTransformer named by EMBED_MODEL_NAME.

    The function is wrapped in ``st.cache_resource`` with the spinner disabled.
    """
    model = SentenceTransformer(EMBED_MODEL_NAME)
    return model

def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping, fixed-width character windows.

    Whitespace runs (including newlines) are collapsed to single spaces first,
    then windows of ``chunk_size`` characters are taken, each starting
    ``chunk_size - overlap`` characters after the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks. It is
            clamped to ``[0, chunk_size - 1]`` so the window always advances.

    Returns:
        A list of chunk strings, empty when ``text`` has no non-whitespace
        content.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # An overlap >= chunk_size would never advance the window (the previous
    # implementation looped forever in that case); clamp it instead.
    overlap = max(0, min(overlap, chunk_size - 1))
    step = chunk_size - overlap

    text = " ".join(text.split())
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop as soon as a window reaches the end of the text. Continuing
        # would emit a tail chunk that is entirely contained in this one.
        if end >= len(text):
            break
    return chunks

def read_pdf(file_bytes):
    """Extract text from every page of a PDF given as raw bytes.

    Returns a list of ``(page_number, text)`` tuples with page numbers starting
    at 1. A page whose extraction raises, or yields nothing, contributes an
    empty string so that page numbering stays aligned.
    """
    document = PdfReader(io.BytesIO(file_bytes))
    extracted = []
    page_no = 0
    for page in document.pages:
        page_no += 1
        try:
            content = page.extract_text() or ""
        except Exception:
            # Best effort: keep the page slot but with no text.
            content = ""
        extracted.append((page_no, content))
    return extracted

def build_docs_from_pdfs(uploaded_files):
    """Turn uploaded PDF files into a flat list of chunk records.

    Each file is read fully, split into pages with ``read_pdf`` and each
    non-blank page is split with ``chunk_text``. Every record has the form
    ``{"text": chunk, "metadata": {"source": file name, "page": page number}}``.
    """
    collected = []
    for upload in uploaded_files:
        source_name = upload.name
        raw = upload.read()
        for page_no, content in read_pdf(raw):
            if content.strip():
                collected.extend(
                    {"text": piece, "metadata": {"source": source_name, "page": page_no}}
                    for piece in chunk_text(content)
                )
    return collected

def ensure_faiss_ready():
    """Show an error and stop the Streamlit script if the faiss import failed."""
    if faiss is not None:
        return
    st.error("faiss-cpu not installed. Run: pip install faiss-cpu")
    st.stop()

def load_or_create_index(embedder, dim):
    """Return ``(index, meta)`` from disk, or a fresh empty pair.

    The store directory is created if missing. When both INDEX_PATH and
    META_PATH exist, the FAISS index and the pickled metadata list are loaded.
    Otherwise a new ``faiss.IndexFlatIP(dim)`` and an empty list are returned.
    ``embedder`` is accepted but not used here.
    """
    STORE_DIR.mkdir(exist_ok=True)
    have_saved_store = INDEX_PATH.exists() and META_PATH.exists()
    if not have_saved_store:
        return faiss.IndexFlatIP(dim), []
    saved_index = faiss.read_index(str(INDEX_PATH))
    with open(META_PATH, "rb") as fh:
        saved_meta = pickle.load(fh)
    return saved_index, saved_meta

def normalize(vecs):
    """Scale each row of a 2-D array to (approximately) unit L2 length.

    A small epsilon is added to every row norm so all-zero rows divide safely
    and stay zero.
    """
    row_lengths = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / (row_lengths + 1e-10)

def upsert_docs(index, meta, docs, embedder):
    """Embed ``docs``, append them to the index and metadata, and persist both.

    With an empty ``docs`` nothing is embedded or written. Otherwise the chunk
    texts are encoded, converted to float32, normalized and added to ``index``;
    ``meta`` is extended in place; then the index is written to INDEX_PATH and
    the metadata pickled to META_PATH.

    Returns the same ``(index, meta)`` objects that were passed in.
    """
    if docs:
        embeddings = embedder.encode(
            [entry["text"] for entry in docs],
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        index.add(normalize(embeddings.astype("float32")))
        meta.extend(docs)
        faiss.write_index(index, str(INDEX_PATH))
        with open(META_PATH, "wb") as fh:
            pickle.dump(meta, fh)
    return index, meta

def search(index, embedder, query, meta, k=5):
    """Return up to ``k`` metadata records that best match ``query``.

    The query is encoded, cast to float32 and normalized before being passed
    to ``index.search``. Result slots with id ``-1`` are skipped. Each hit is a
    shallow copy of the corresponding ``meta`` entry with an added float
    ``"score"`` key.
    """
    query_vec = normalize(
        embedder.encode([query], convert_to_numpy=True).astype("float32")
    )
    scores, ids = index.search(query_vec, k)
    results = []
    for position, similarity in zip(ids[0], scores[0]):
        if position != -1:
            match = meta[position].copy()
            match["score"] = float(similarity)
            results.append(match)
    return results

# LLM backends
def call_ollama(messages, model=DEFAULT_OLLAMA_MODEL):
    """Send a non-streaming chat request to the Ollama server on localhost.

    Returns the ``message.content`` string from the JSON reply (empty string
    if absent). Any exception is converted into an ``"[Ollama error] ..."``
    string rather than raised.
    """
    try:
        reply = requests.post(
            "http://localhost:11434/api/chat",
            json={"model": model, "messages": messages, "stream": False},
            timeout=120,
        )
        reply.raise_for_status()
        body = reply.json()
        return body.get("message", {}).get("content", "")
    except Exception as e:
        return f"[Ollama error] {e}"

def call_openai(messages, model="gpt-4o-mini"):
    """Ask an OpenAI chat model to answer from the supplied messages.

    A fixed system instruction ("answer from context only") is prepended to
    ``messages``.

    Args:
        messages: List of chat message dicts (``role``/``content``). An
            app-specific ``"sources"`` key, which this script stores on
            assistant turns, is dropped before sending.
        model: OpenAI chat model name.

    Returns:
        The stripped answer text (empty string if the reply has no content).
        On any failure — missing ``openai`` package, missing OPENAI_API_KEY,
        API error — an ``"[OpenAI error] ..."`` string is returned instead of
        raising.
    """
    try:
        from openai import OpenAI
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            return "[OpenAI error] No API key set"
        client = OpenAI(api_key=api_key)
        system_message = {
            "role": "system",
            "content": "Answer using the context only. If not in context, say you don't know.",
        }
        # History entries kept for display carry a "sources" list, which is not
        # a chat message field; forwarding it makes the API reject the request.
        chat_messages = [
            {key: value for key, value in message.items() if key != "sources"}
            for message in messages
        ]
        resp = client.chat.completions.create(
            model=model,
            messages=[system_message] + chat_messages,
        )
        # ``content`` may be None (e.g. a refusal); treat that as empty text.
        content = resp.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        return f"[OpenAI error] {e}"

def build_prompt(question, hits):
    """Format retrieved chunks and the question into a single prompt string.

    Each hit becomes a ``[Source: ... | Page: ...]`` header followed by its
    text; a missing page is shown as ``?``. Blocks are separated by blank
    lines and followed by the question and an ``Answer:`` cue.
    """
    def render(hit):
        info = hit["metadata"]
        return f"[Source: {info['source']} | Page: {info.get('page', '?')}]\n{hit['text']}"

    context = "\n\n".join(render(hit) for hit in hits)
    return f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

# UI
# Page chrome: browser-tab title/icon, wide layout, and the visible heading.
st.set_page_config(page_title="PDF Q&A Chatbot", page_icon="💬", layout="wide")
st.title("💬 PDF Q&A Chatbot")

# Sidebar controls: which LLM backend answers, how many chunks are retrieved
# per question (passed as `k` to search), and whether the on-disk store is
# wiped before the next indexing run.
with st.sidebar:
    st.header("Settings")
    backend = st.selectbox("Backend", ["Ollama (local)", "OpenAI API"])
    top_k = st.slider("Top-k chunks", 2, 10, 5)
    rebuild = st.checkbox("Rebuild index", False)

st.subheader("📂 Upload Files")
uploaded_files = st.file_uploader("Choose PDF files", type=["pdf"], accept_multiple_files=True)

# Conversation history is kept in session state; create it on first use.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Two columns: the indexing button on the left, the question box on the right.
col1, col2 = st.columns([1, 2])
with col1:
    do_index = st.button("Build / Update Index")

with col2:
    user_input = st.text_input("Type your question and press Enter:")

# Embedding model shared by the indexing and question-answering steps below.
embedder = load_embedder()

# Indexing: runs only when the button was clicked and files are uploaded.
if do_index and uploaded_files:
    ensure_faiss_ready()
    if rebuild and STORE_DIR.exists():
        # "Rebuild index" ticked: remove everything in the store first.
        for stale_path in STORE_DIR.glob("*"):
            stale_path.unlink()
    embedding_dim = embedder.get_sentence_embedding_dimension()
    index, meta = load_or_create_index(embedder, embedding_dim)
    docs = build_docs_from_pdfs(uploaded_files)
    if not docs:
        st.warning("No text extracted.")
    else:
        index, meta = upsert_docs(index, meta, docs, embedder)
        st.success(f"Indexed {len(docs)} chunks")

# Question answering.
# Streamlit re-executes this script on every widget interaction while the text
# box keeps its value, so an unguarded `if user_input:` would re-ask the same
# question (and append duplicate turns) each time a slider, checkbox or button
# is touched. Remember the last processed question and skip it on reruns.
if user_input and user_input != st.session_state.get("last_question"):
    ensure_faiss_ready()
    if not (INDEX_PATH.exists() and META_PATH.exists()):
        # Not recorded as processed, so the question is answered once an
        # index has been built.
        st.warning("No index found. Upload PDFs and build index first.")
    else:
        st.session_state["last_question"] = user_input
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE: pickle.load can execute arbitrary code; this is acceptable only
        # because META_PATH is written by this app itself.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        hits = search(index, embedder, user_input, meta, top_k)

        # Prior turns for the LLM: limited to the last MAX_HISTORY exchanges
        # and reduced to role/content, because the stored "sources" lists are
        # display data, not chat message fields.
        history = [
            {"role": m["role"], "content": m["content"]}
            for m in st.session_state["messages"][-MAX_HISTORY * 2:]
        ]

        # Always record the user's turn so the transcript shows the question
        # even when nothing relevant was retrieved.
        st.session_state["messages"].append({"role": "user", "content": user_input})

        if hits:
            # The prompt already contains the question, so it is sent once
            # (inside the prompt) instead of raw and again in the prompt.
            llm_messages = history + [
                {"role": "user", "content": build_prompt(user_input, hits)}
            ]
            if backend == "Ollama (local)":
                answer = call_ollama(llm_messages)
            else:
                answer = call_openai(llm_messages)
            st.session_state["messages"].append({"role": "assistant", "content": answer, "sources": hits})
        else:
            st.session_state["messages"].append({"role": "assistant", "content": "I couldn't find relevant info.", "sources": []})

        # Keep only the last MAX_HISTORY question/answer pairs.
        st.session_state["messages"] = st.session_state["messages"][-MAX_HISTORY * 2:]

# Display conversation
# Render each stored turn, followed by a divider; bot turns with sources get
# an expander listing file, page and the first 300 characters of each chunk.
for msg in st.session_state["messages"]:
    from_user = msg["role"] == "user"
    if from_user:
        st.markdown(f"**You:** {msg['content']}")
    else:
        st.markdown(f"**Bot:** {msg['content']}")
        cited = msg.get("sources")
        if cited:
            with st.expander("Sources"):
                for hit in cited:
                    info = hit["metadata"]
                    st.write(f"{info['source']} (p.{info.get('page','?')}) :: {hit['text'][:300]}...")
    st.divider()


2025-08-21 10:40:20.359 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [21]:
# pypdf provides PdfReader, imported by the PDF Q&A chatbot cell; without it
# that cell fails with ModuleNotFoundError.
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/310.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/310.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/310.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.0.0
