<a href="https://colab.research.google.com/github/salimaliraja/salimaliraja/blob/main/SalimAliraja_Idea3_AI_powered_search_document_and_summerisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# app.py
"""
Streamlit app: Document ingestion -> FAISS semantic search -> Summarize relevant sections
Features:
 - Upload PDF / DOCX / TXT
 - Extract text (PyPDF2, python-docx)
 - Chunk & preprocess text
 - Embed chunks using sentence-transformers/all-MiniLM-L6-v2 (batch)
 - Index into FAISS (IndexFlatIP with normalized vectors)
 - Query interface: embed query, search top-k chunks
 - Summarize top-k chunks via a small summarization model (t5-small or distilbart)
 - Simple persistence: SQLite stores metadata; FAISS stored to disk
 - Retry/backoff on external calls (tenacity) prepared (not heavily used locally)
"""
# Dependencies must be installed from a shell before launching the app, e.g.
#   pip install streamlit tenacity
# The notebook-only "!pip install streamlit" magic that used to sit on this
# line is a SyntaxError when this file is run as a plain Python script.
import streamlit as st
import os
import sqlite3
import json
import time
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass

# NOTE: a stray ``st.file_uploader(...)`` call was removed from this spot.
# It passed a OneDrive share URL as the widget *label* (the uploader only
# accepts files picked in the browser, it does not fetch URLs), threw away its
# return value, and ran ahead of ``st.set_page_config`` further down, which
# Streamlit documents as having to be the first Streamlit command.
# The real upload widget lives in the "Upload documents" section below.


# IO / Parsing
import io
try:
    import PyPDF2
except Exception:
    PyPDF2 = None
try:
    import docx
except Exception:
    docx = None

# Embeddings & FAISS
try:
    from sentence_transformers import SentenceTransformer
    import faiss
    import numpy as np
except Exception:
    SentenceTransformer = None
    faiss = None
    np = None

# Summarization model
try:
    from transformers import pipeline
except Exception:
    pipeline = None

# Optional evaluation
try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

# Retry
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

# ----------------------------
# Config
# ----------------------------
DB_PATH = "docs_index.db"
FAISS_INDEX_PATH = "faiss_index.bin"
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # fast & small
SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"  # small distilbart
CHUNK_SIZE = 500  # characters (approx ~200-300 tokens depending)
CHUNK_OVERLAP = 100
EMBED_BATCH_SIZE = 16
TOP_K = 5

# ----------------------------
# Utilities: DB / persist
# ----------------------------
def init_db():
    """Create the ``chunks`` table in the SQLite file at ``DB_PATH`` if absent.

    Safe to call repeatedly: the statement uses ``CREATE TABLE IF NOT EXISTS``.
    The connection is closed even when the statement or commit fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("""CREATE TABLE IF NOT EXISTS chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    doc_name TEXT,
                    chunk_index INTEGER,
                    text TEXT,
                    metadata TEXT,
                    created_at REAL
                )""")
        conn.commit()
    finally:
        conn.close()

def save_chunk(doc_name: str, chunk_index: int, text: str, metadata: Dict[str, Any]) -> int:
    """Insert one chunk row and return its primary key.

    Args:
        doc_name: Name of the source document.
        chunk_index: Position of the chunk within that document.
        text: Chunk text.
        metadata: JSON-serialisable mapping, stored as a JSON string.

    Returns:
        The ``id`` of the inserted row. It is read from the inserting cursor
        because SQLite tracks the last insert row id per connection; asking a
        different connection afterwards does not yield this row's id.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.execute(
            "INSERT INTO chunks (doc_name, chunk_index, text, metadata, created_at) VALUES (?, ?, ?, ?, ?)",
            (doc_name, chunk_index, text, json.dumps(metadata), time.time()),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        # Close even if the insert/commit raised, so the handle is not leaked.
        conn.close()

def list_chunks(limit=20):
    """Return up to *limit* of the most recently inserted chunk rows.

    Rows are ``(id, doc_name, chunk_index, text, metadata)`` tuples ordered by
    ``id`` descending; ``metadata`` is the stored JSON string. The connection
    is closed even when the query fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.execute(
            "SELECT id, doc_name, chunk_index, text, metadata FROM chunks ORDER BY id DESC LIMIT ?",
            (limit,),
        )
        return cur.fetchall()
    finally:
        conn.close()

def get_all_chunks():
    """Return every chunk row, oldest first.

    Rows are ``(id, doc_name, chunk_index, text, metadata)`` tuples ordered by
    ``id`` ascending; ``metadata`` is the stored JSON string. The connection
    is closed even when the query fails.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.execute(
            "SELECT id, doc_name, chunk_index, text, metadata FROM chunks ORDER BY id ASC"
        )
        return cur.fetchall()
    finally:
        conn.close()

# ----------------------------
# Text extraction
# ----------------------------
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    A page whose extraction raises, or yields nothing, contributes an empty
    string so one bad page does not abort the whole document.

    Raises:
        RuntimeError: if PyPDF2 could not be imported.
    """
    if PyPDF2 is None:
        raise RuntimeError("PyPDF2 not installed")

    def _page_text(page) -> str:
        # Best effort per page: swallow extraction errors and fall back to "".
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    pdf = PyPDF2.PdfReader(io.BytesIO(file_bytes))
    return "\n".join(_page_text(page) for page in pdf.pages)

def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the paragraph texts of a DOCX document joined with newlines.

    Only ``document.paragraphs`` is read.

    Raises:
        RuntimeError: if python-docx could not be imported.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    # Parse straight from memory. Writing to a fixed "temp_upload.docx" in the
    # working directory collided between concurrent uploads and left the file
    # behind whenever parsing raised before the cleanup line.
    document = docx.Document(io.BytesIO(file_bytes))
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def extract_text_from_txt(file_bytes: bytes) -> str:
    """Decode raw bytes as UTF-8 text, skipping undecodable bytes.

    ``utf-8-sig`` is used so that a leading UTF-8 byte-order mark is stripped
    rather than surviving as a stray U+FEFF character in the text.
    """
    return file_bytes.decode("utf-8-sig", errors="ignore")

# ----------------------------
# Chunking / preprocessing
# ----------------------------
def clean_text(s: str) -> str:
    """Normalise extracted text into a single whitespace-collapsed string.

    Steps:
      1. Treat carriage returns as newlines.
      2. Drop lines that consist solely of a page marker such as ``Page 3``,
         ``page 3 of 10`` or ``Page 3/10`` (case-insensitive).
      3. Collapse every run of whitespace, newlines included, to one space.

    Only pure page markers are removed. Dropping every line that merely starts
    with "page " would also delete real content such as "Page layout matters".
    """
    import re  # function-scope import keeps this block self-contained

    page_marker = re.compile(r"page\s+\d+(?:\s*(?:of|/)\s*\d+)?", re.IGNORECASE)
    kept_lines = [
        line
        for line in s.replace("\r", "\n").splitlines()
        if not page_marker.fullmatch(line.strip())
    ]
    return " ".join("\n".join(kept_lines).split())

def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split cleaned text into fixed-size character windows that overlap.

    The text is first passed through ``clean_text``. Consecutive chunks start
    ``size - overlap`` characters apart, so each chunk repeats the last
    ``overlap`` characters of the previous one. The final chunk may be shorter
    than ``size``.

    Args:
        text: Raw text to split.
        size: Maximum chunk length in characters; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: if ``size``/``overlap`` would stop the window advancing.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of characters")
    if overlap < 0 or overlap >= size:
        raise ValueError("overlap must be >= 0 and smaller than size")

    text = clean_text(text)
    if not text:
        return []

    step = size - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + size, total)
        chunks.append(text[start:end])
        # Stop once the window has reached the end of the text. Stepping back
        # by ``overlap`` from here would re-emit the tail forever.
        if end >= total:
            break
        start += step
    return chunks

# ----------------------------
# FAISS store (IndexFlatIP + normalized embeddings)
# ----------------------------
class FaissStore:
    """Sentence-embedding store backed by a flat inner-product FAISS index.

    Embeddings are L2-normalised before they are added or searched, so the
    inner-product scores returned by :meth:`query` are cosine similarities.
    ``self.ids[i]`` is the chunk DB id of the ``i``-th vector in the index.
    The index is persisted to ``index_path`` and the id list to
    ``index_path + ".meta.json"``.
    """

    def __init__(self, embed_model_name=EMBED_MODEL_NAME, index_path=FAISS_INDEX_PATH):
        """Load the embedding model, then load or create the index.

        Raises:
            RuntimeError: if sentence-transformers, faiss or numpy is missing,
                or the on-disk index has a different vector dimension than
                the chosen model produces.
        """
        if SentenceTransformer is None or faiss is None or np is None:
            raise RuntimeError("Install sentence-transformers, faiss-cpu, numpy")
        self.model = SentenceTransformer(embed_model_name)
        self.index_path = index_path
        self.index = None
        self.ids = []  # metadata order: list of chunk DB ids
        self._load()

    def _load(self):
        """Restore index and id list from disk, or start with an empty index."""
        dim = self.model.get_sentence_embedding_dimension()
        index = None
        ids = []
        if os.path.exists(self.index_path):
            try:
                index = faiss.read_index(self.index_path)
                meta_path = self.index_path + ".meta.json"
                if os.path.exists(meta_path):
                    with open(meta_path, "r", encoding="utf-8") as f:
                        ids = json.load(f)
            except Exception:
                # Best effort: unreadable files are treated as "no index yet".
                index = None
                ids = []

        if index is not None and index.d != dim:
            # Refuse rather than fail later inside FAISS, and rather than
            # overwriting an index that was built with another model.
            raise RuntimeError(
                f"Index at {self.index_path} holds {index.d}-dimensional vectors "
                f"but the embedding model produces {dim}; use a different index_path"
            )
        if index is not None and (not isinstance(ids, list) or len(ids) != index.ntotal):
            # Vectors that cannot be mapped back to chunk ids are unusable and
            # would make query() index past the end of the id list.
            index = None
            ids = []
        if index is None:
            index = faiss.IndexFlatIP(dim)
            ids = []
        self.index = index
        self.ids = ids

    def persist(self):
        """Write the index and its id list to disk."""
        faiss.write_index(self.index, self.index_path)
        with open(self.index_path + ".meta.json", "w", encoding="utf-8") as f:
            json.dump(self.ids, f)

    def add_texts(self, texts: List[str], chunk_db_ids: List[int]):
        """Embed *texts* in batches, add them to the index and persist.

        Args:
            texts: Chunk texts to embed.
            chunk_db_ids: DB id of each text, in the same order.

        Raises:
            ValueError: if the two lists differ in length, which would
                misalign vectors and ids for every later query.
        """
        if len(texts) != len(chunk_db_ids):
            raise ValueError("texts and chunk_db_ids must have the same length")
        if not texts:
            return  # nothing to embed; an empty batch has no 2-D shape to normalise
        embs = self.model.encode(texts, batch_size=EMBED_BATCH_SIZE, show_progress_bar=False)
        embs = np.ascontiguousarray(embs, dtype="float32")
        faiss.normalize_L2(embs)
        self.index.add(embs)
        self.ids.extend(int(i) for i in chunk_db_ids)
        self.persist()

    def query(self, query_text: str, top_k=TOP_K) -> List[Tuple[int, float]]:
        """Return up to *top_k* ``(chunk_db_id, score)`` pairs, best first.

        An empty index or a non-positive *top_k* yields an empty list.
        """
        if top_k <= 0 or self.index.ntotal == 0:
            return []
        q_emb = self.model.encode([query_text], show_progress_bar=False)
        q_emb = np.ascontiguousarray(q_emb, dtype="float32")
        faiss.normalize_L2(q_emb)
        scores, positions = self.index.search(q_emb, min(int(top_k), self.index.ntotal))
        results = []
        for score, pos in zip(scores[0], positions[0]):
            # FAISS pads missing neighbours with -1; also guard the id lookup.
            if pos < 0 or pos >= len(self.ids):
                continue
            results.append((int(self.ids[pos]), float(score)))
        return results

# ----------------------------
# Summarization (transformers pipeline)
# ----------------------------
def get_summarizer(model_name=SUMMARIZER_MODEL):
    """Build a transformers ``summarization`` pipeline for *model_name*.

    The pipeline is created with ``truncation=True``.

    Raises:
        RuntimeError: if ``transformers`` could not be imported.
    """
    if pipeline is not None:
        return pipeline("summarization", model=model_name, truncation=True)
    raise RuntimeError("transformers not available")

# ----------------------------
# Retry wrapper for heavy ops (example)
# ----------------------------
# Reusable tenacity policy: retry on any Exception, give up after 3 attempts,
# exponential wait with multiplier 1 bounded by min=1 and max=10.
# Nothing in the code visible here applies it yet.
retry_decorator = retry(
    retry=retry_if_exception_type(Exception),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(3),
)

# ----------------------------
# Streamlit UI
# ----------------------------
st.set_page_config(page_title="DocIngest · Semantic Search & Summarize", layout="wide")
st.title("AI Document Ingest → Semantic Search → Summarize")

init_db()


def _index_path_for(model_name: str) -> str:
    """Return the FAISS index file to use for *model_name*.

    The default model keeps ``FAISS_INDEX_PATH``. Any other model gets its own
    file so vectors from different embedding spaces are never mixed.
    """
    if model_name == EMBED_MODEL_NAME:
        return FAISS_INDEX_PATH
    safe = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in model_name)
    return f"{FAISS_INDEX_PATH}.{safe}"


# Streamlit re-executes this script on every interaction. The model picked via
# the "Re-init" button is therefore kept in session state; otherwise the very
# next rerun (e.g. pressing "Search") would fall back to the default model.
active_embed_model = st.session_state.get("embed_model_name", EMBED_MODEL_NAME)

faiss_store = None
try:
    faiss_store = FaissStore(
        embed_model_name=active_embed_model,
        index_path=_index_path_for(active_embed_model),
    )
    st.sidebar.success("FAISS store ready")
except Exception as e:
    st.sidebar.warning(f"FAISS not ready: {e}")

with st.sidebar:
    st.header("Settings")
    st.write("Chunk size (chars) and overlap")
    csize = st.number_input("Chunk size", value=CHUNK_SIZE, min_value=200, max_value=2000, step=50)
    cover = st.number_input("Chunk overlap", value=CHUNK_OVERLAP, min_value=0, max_value=500, step=10)
    st.write("---")
    st.write("Embedding model:")
    emb_model = st.text_input("SentenceTransformer model", value=EMBED_MODEL_NAME)
    st.write("Summarizer model:")
    summ_model = st.text_input("Summarizer model", value=SUMMARIZER_MODEL)
    st.write("---")
    if st.button("Re-init FAISS with chosen embed model"):
        try:
            faiss_store = FaissStore(
                embed_model_name=emb_model,
                index_path=_index_path_for(emb_model),
            )
            # Remember the choice only once the store was built successfully.
            st.session_state["embed_model_name"] = emb_model
            st.success("FAISS reinitialized.")
        except Exception as e:
            st.error(str(e))

# Upload files
st.header("Upload documents (PDF, DOCX, TXT)")
# ``type`` takes bare extensions. The previous list held a file path in place
# of "pdf", so PDF uploads were rejected, and the label was a path as well.
uploaded = st.file_uploader(
    "Choose one or more documents",
    accept_multiple_files=True,
    type=["pdf", "docx", "txt"],
)


def _save_chunk_returning_id(doc_name: str, chunk_index: int, text: str) -> int:
    """Persist one chunk via ``save_chunk`` and return its DB row id.

    ``SELECT last_insert_rowid()`` on a *new* connection always yields 0, as
    SQLite tracks that value per connection. So use the id ``save_chunk``
    returns when it provides one, and otherwise look up the row just written.
    """
    row_id = save_chunk(doc_name, chunk_index, text, {"source": doc_name})
    if row_id is not None:
        return int(row_id)
    conn = sqlite3.connect(DB_PATH)
    try:
        row = conn.execute(
            "SELECT id FROM chunks WHERE doc_name = ? AND chunk_index = ? ORDER BY id DESC LIMIT 1",
            (doc_name, chunk_index),
        ).fetchone()
    finally:
        conn.close()
    if row is None:
        raise RuntimeError(f"chunk {chunk_index} of {doc_name} was not stored")
    return int(row[0])


# The uploader keeps returning the same files on every script rerun, so track
# what this session already ingested to avoid indexing it again each time.
if "indexed_uploads" not in st.session_state:
    st.session_state["indexed_uploads"] = set()

if uploaded:
    for f in uploaded:
        name = f.name
        b = f.getvalue()  # full content regardless of the current read position
        upload_key = (name, len(b), hash(b))
        if upload_key in st.session_state["indexed_uploads"]:
            continue
        try:
            lowered = name.lower()
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(b)
            elif lowered.endswith(".docx"):
                text = extract_text_from_docx(b)
            else:
                text = extract_text_from_txt(b)
            chunks = chunk_text(text, size=csize, overlap=cover)
            if not chunks:
                st.warning(f"No extractable text found in {name}; nothing indexed.")
            else:
                chunk_db_ids = [
                    _save_chunk_returning_id(name, i, ch) for i, ch in enumerate(chunks)
                ]
                # add to FAISS
                if faiss_store is not None:
                    faiss_store.add_texts(chunks, chunk_db_ids)
                    st.success(f"Indexed {len(chunks)} chunks from {name}")
                else:
                    st.warning("No FAISS store available; chunks saved to DB only.")
            st.session_state["indexed_uploads"].add(upload_key)
        except Exception as e:
            st.error(f"Failed to process {name}: {e}")

# Query UI
st.markdown("---")
st.header("Query / Search")
# The label describes the field; the previous label was a misspelled sample
# query ("write a summerise version of the article").
query = st.text_input("Enter a search query")
k = st.slider("Top K chunks", 1, 10, TOP_K)
if st.button("Search"):
    if not query or not query.strip():
        st.error("Enter a query")
    elif faiss_store is None:
        st.error("FAISS store not initialized")
    else:
        with st.spinner("Searching..."):
            hits = faiss_store.query(query, top_k=k)
            st.write(f"Found {len(hits)} hits")
            # display and collect texts for summarization
            collected_texts = []
            # One connection for all lookups, closed even if a lookup fails.
            conn = sqlite3.connect(DB_PATH)
            try:
                for db_id, score in hits:
                    r = conn.execute(
                        "SELECT doc_name, chunk_index, text FROM chunks WHERE id = ?",
                        (db_id,),
                    ).fetchone()
                    if r:
                        doc_name, cidx, txt = r
                        st.markdown(f"**Doc:** {doc_name} — chunk {cidx} — score {score:.3f}")
                        st.write(txt[:1000])
                        collected_texts.append(txt)
            finally:
                conn.close()
            # Keep the results across reruns so the summarize step can use them.
            st.session_state["last_hits"] = hits
            st.session_state["collected_texts"] = collected_texts

# Summarize retrieved content
st.markdown("---")
st.header("Summarize top results")
# Default to the sidebar's "Summarizer model" value, which was otherwise unused.
summ_model_choice = st.text_input("Summarizer (model id)", value=summ_model)
summarize_btn = st.button("Summarize retrieved top-k")
if summarize_btn:
    texts = st.session_state.get("collected_texts", [])
    if not texts:
        st.error("No retrieved texts to summarize. Run a search first.")
    else:
        try:
            with st.spinner("Summarizing..."):
                summarizer = get_summarizer(model_name=summ_model_choice)
                # fuse top texts into manageable size; chunk if necessary
                combined = "\n\n".join(texts)
                # summarizer may have input length limits; split if needed by characters
                max_in = 4000
                parts = [combined[i:i + max_in] for i in range(0, len(combined), max_in)]
                summaries = [
                    summarizer(p, max_length=150, min_length=60, do_sample=False)[0]["summary_text"]
                    for p in parts
                ]
        except Exception as e:
            # A bad model id, a missing dependency or an inference failure is
            # reported in the page like other errors, not as a raw traceback.
            st.error(f"Summarization failed: {e}")
        else:
            final_summary = "\n\n".join(summaries)
            st.subheader("Summary")
            st.write(final_summary)
            # save as artifact
            st.session_state["last_summary"] = final_summary

# Show the most recent summary kept in session state, if there is one
st.markdown("---")
last_summary = st.session_state.get("last_summary")
if last_summary:
    st.subheader("Last summary")
    st.write(last_summary)

# Preview of the eight newest chunk rows (id, document, chunk index, first 200 chars)
st.markdown("---")
st.subheader("Recent chunks (DB preview)")
for chunk_id, source_doc, position, body, _meta in list_chunks(limit=8):
    st.write(f"{chunk_id} | {source_doc} | chunk {position} | {body[:200]}")

st.markdown("---")
st.write("Notes: tune chunk size, overlap, and model choices. For large corpora, use batching and incremental indexing.")




