In [1]:
from rag_core.index_builder import *

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [39]:
def load_web_docs(urls: List[str]):
    """
    Load documents from a list of URLs.

    - HTML URLs are loaded with WebBaseLoader.
    - PDF URLs (including Google Drive file links) are loaded with OnlinePDFLoader.
    """
    html_urls: List[str] = []
    pdf_urls: List[str] = []

    for url in urls:
        u = url.strip()
        if not u:
            continue
        
        html_urls.append(u)

    docs = []

    # --- HTML pages via WebBaseLoader ---
    if html_urls:
        html_loader = WebBaseLoader(
            web_paths=html_urls,
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    ["article", "main", "div", "section", "body"]
                )
            ),
        )
        html_docs = html_loader.load()
        for d in html_docs:
            d.metadata["source"] = d.metadata.get("source", d.metadata.get("url"))
        docs.extend(html_docs)

    # --- PDF files (including Drive) via OnlinePDFLoader ---
    for pdf_url in pdf_urls:
        try:
            pdf_loader = OnlinePDFLoader(pdf_url)
            pdf_docs = pdf_loader.load()
            for d in pdf_docs:
                d.metadata["source"] = d.metadata.get("source", pdf_url)
            docs.extend(pdf_docs)
        except Exception as e:
            print(f"[index_builder] Failed to load PDF from {pdf_url}: {e}")

    return docs

In [29]:
n, chunks = build_and_save_index()

[index_builder] Crawled 12 URLs under https://fearless-writers-028990.framer.app/project/
[index_builder] Total URLs to load: 17
[index_builder] Loaded 17 raw documents
[index_builder] Split into 51 chunks
[index_builder] Saved FAISS index to /Users/ritamupadhyay/Documents/Ritam_QA/data/vectorstore/career_faiss_index (chunks=51)


In [30]:
chunks

[Document(metadata={'source': 'https://fearless-writers-028990.framer.app/old-home', 'start_index': 4}, page_content="Ritam UpadhyayData ScientistHomeAboutProjectsStacksSearchMenuAvailable for workLet's Transform Ideas into RealityRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverWelcome to my portfolio! I am Ritam Upadhyay, passionate in the field of Data Science and Artificial Intelligence. I have 2 years of work experience in the field of Data Science at Paytm and I am currently pursuing my Master's in Data Science from Arizona State University.More about meMore about meMore about meContactContactContactMy Latest WorksI present my top-tier projects, meticulousl

In [40]:
docs = load_web_docs(["https://fearless-writers-028990.framer.app/"])

In [41]:
docs

[Document(metadata={'source': 'https://fearless-writers-028990.framer.app/'}, page_content='\n\n\n\nRitam UpadhyayData ScientistHomeAboutProjectsStacksSearchMenuAvailable for workBack to homepageBack to homepageBack to homepageRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverAboutWelcome to my portfolio! I am an ASU Master\'s student (\'26) with experience at Juniper Networks and Paytm Money. I leverage ML/AI to solve complex business challenges, specializing in NLP, computer vision, and predictive analytics. Seeking roles in Data Science, AI/ML, and NLP to drive impact at scale.My projectsMy projectsMy projectsContactContactContactContactLinkedinritam.upadhyayri

In [42]:
print(docs[0].page_content)





Ritam UpadhyayData ScientistHomeAboutProjectsStacksSearchMenuAvailable for workBack to homepageBack to homepageBack to homepageRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverAboutWelcome to my portfolio! I am an ASU Master's student ('26) with experience at Juniper Networks and Paytm Money. I leverage ML/AI to solve complex business challenges, specializing in NLP, computer vision, and predictive analytics. Seeking roles in Data Science, AI/ML, and NLP to drive impact at scale.My projectsMy projectsMy projectsContactContactContactContactLinkedinritam.upadhyayritam.upadhyayritam.upadhyayGithubritam3ritam3ritam3Emailrupadh17@asu.edurupadh17@asu.edurupadh17@as

In [43]:
split_docs(docs)

[Document(metadata={'source': 'https://fearless-writers-028990.framer.app/', 'start_index': 4}, page_content="Ritam UpadhyayData ScientistHomeAboutProjectsStacksSearchMenuAvailable for workBack to homepageBack to homepageBack to homepageRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverRitam UpadhyayData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverData ScientistProblem SolverAboutWelcome to my portfolio! I am an ASU Master's student ('26) with experience at Juniper Networks and Paytm Money. I leverage ML/AI to solve complex business challenges, specializing in NLP, computer vision, and predictive analytics. Seeking roles in Data Science, AI/ML, and NLP to drive impact at scale.My projectsMy projectsMy"),
 Document(metadata={'source': 'https://fearless-w

In [1]:
# rag_core/index_builder.py

import re
from pathlib import Path
from typing import List

import requests
from langchain_community.document_loaders import (
    OnlinePDFLoader,
    PyPDFLoader,
)
from langchain_text_splitters import (
    HTMLSectionSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain_community.vectorstores import FAISS


# ---------- Helpers for URLs ----------

def _is_gdrive_file(url: str) -> bool:
    """Return True if this looks like a Google Drive file view URL."""
    return "drive.google.com" in url and "/file/d/" in url


def _gdrive_view_to_download(url: str) -> str:
    """
    Convert a Google Drive view URL to a direct download URL.

    Example:
      https://drive.google.com/file/d/<ID>/view
      -> https://drive.google.com/uc?export=download&id=<ID>
    """
    m = re.search(r"/file/d/([^/]+)/", url)
    if not m:
        return url
    file_id = m.group(1)
    return f"https://drive.google.com/uc?export=download&id={file_id}"


def _infer_section_label_from_url(url: str) -> str:
    """
    Heuristic: guess a section label from the URL path.
    e.g.
      https://your-site.com/about                 -> 'about'
      https://your-site.com/experience/juniper   -> 'experience/juniper'
    """
    try:
        path = url.split("://", 1)[-1].split("/", 1)[-1]
    except Exception:
        return url
    path = path.strip("/")
    if not path:
        return "root"
    return path


# ---------- Local resume PDF ----------


# ---------- Remote docs: HTML (via HTMLSectionSplitter) + PDFs ----------

def load_remote_docs(urls: List[str]):
    """
    Load documents from remote URLs.

    - HTML URLs: use `requests` to fetch HTML string, then HTMLSectionSplitter.split_text(html_string).
    - PDF URLs (including Google Drive /file/d/.../view): use OnlinePDFLoader.
    """
    html_urls: List[str] = []
    pdf_urls: List[str] = []

    for url in urls:
        u = url.strip()
        if not u:
            continue

        html_urls.append(u)

    docs = []

    # --- HTML: fetch raw HTML and split with HTMLSectionSplitter ---
    if html_urls:
        print(f"[index_builder] Loading HTML for {len(html_urls)} URLs with HTMLSectionSplitter")

        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
        ]
        html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

        for url in html_urls:
            try:
                print(f"[index_builder]   Fetching HTML from {url}")
                resp = requests.get(url, timeout=15)
                resp.raise_for_status()
                html_string = resp.text

                # This is exactly the pattern you requested:
                # html_header_splits = html_splitter.split_text(html_string)
                html_header_splits = html_splitter.split_text(html_string)

                section_label = _infer_section_label_from_url(url)
                print(f"[index_builder]   {url}: {len(html_header_splits)} HTML sections")

                for d in html_header_splits:
                    d.metadata["source"] = url
                    d.metadata["section_label"] = section_label  # existing label you already set
                    # extract a sensible header line to use when matching queries
                    # prefer an explicit header tag if present (HTMLSectionSplitter often places it at the top)
                    first_lines = [ln.strip() for ln in d.page_content.splitlines() if ln.strip()]
                    header_line = first_lines[0] if first_lines else ""
                    # normalize header to be short (cut long inline text)
                    if len(header_line) > 200:
                        header_line = header_line[:200] + "..."
                    d.metadata["section_header"] = header_line
                    d.metadata["section_type"] = "remote_html"
                # then append these docs to your docs list as usual
                docs.extend(html_header_splits)
            except Exception as e:
                print(f"[index_builder] Error processing HTML from {url}: {e}")

    # --- PDFs (remote, including Drive) ---
    for pdf_url in pdf_urls:
        print(f"[index_builder] Loading PDF from {pdf_url}")
        try:
            pdf_loader = OnlinePDFLoader(pdf_url)
            pdf_docs = pdf_loader.load()
            section_label = _infer_section_label_from_url(pdf_url)
            for d in pdf_docs:
                d.metadata["source"] = pdf_url
                d.metadata["section_label"] = section_label
                d.metadata["section_type"] = "remote_pdf"
            docs.extend(pdf_docs)
            print(f"[index_builder]   {pdf_url}: {len(pdf_docs)} PDF pages")
        except Exception as e:
            print(f"[index_builder] Failed to load PDF from {pdf_url}: {e}")

    return docs


# ---------- Chunking ----------

def split_docs(docs):
    """
    Split documents into chunks.

    - HTML docs: already split into section-level chunks by HTMLSectionSplitter → keep as-is.
    - Non-HTML docs (PDF/resume): use RecursiveCharacterTextSplitter.
    """
    html_docs = [d for d in docs if d.metadata.get("section_type") == "remote_html"]
    other_docs = [d for d in docs if d.metadata.get("section_type") != "remote_html"]

    chunks: List = []

    # HTML docs are already section chunks
    chunks.extend(html_docs)

    # PDFs / resume get text splitting
    if other_docs:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            add_start_index=True,
        )
        other_chunks = splitter.split_documents(other_docs)
        for c in other_chunks:
            c.metadata.setdefault("section_label", c.metadata.get("source", "unknown"))
            c.metadata.setdefault(
                "section_type",
                c.metadata.get("section_type", "unknown")
            )
        chunks.extend(other_chunks)

    print(
        f"[index_builder] split_docs: {len(html_docs)} HTML section chunks, "
        f"{len(chunks) - len(html_docs)} non-HTML chunks"
    )
    return chunks


# ---------- Build & load index ----------


In [2]:
docs = load_remote_docs(["https://fearless-writers-028990.framer.app"])

[index_builder] Loading HTML for 1 URLs with HTMLSectionSplitter
[index_builder]   Fetching HTML from https://fearless-writers-028990.framer.app
[index_builder]   https://fearless-writers-028990.framer.app: 9 HTML sections


In [3]:
print(docs[4].page_content)

Work experience 
 
 $ 
 June 2025 - August 2025 
 
 $ 
 $ 
 Product Management Intern : Data Science 
 
 
 /$ 
 
 
 $ 
 Product Management Intern : Data Science 
 
 
 /$ 
 
 
 $ 
 Product Management Intern : Data Science 
 
 
 /$ 
 
 /$ 
 Juniper Networks / Mist 
 â Engineered an end-to-end classification pipeline that processed over 13,000 customer support tickets, achieving a 100% automated classification rate and providing proactive insights into emerging issues.
â Developed a multi-stage data processing module using LLMs to automatically extract and normalize critical information, including site, MAC address, and issue description, from raw Salesforce ticket data.
â Implemented an advanced LLM-based analysis to parse support agent-customer conversations, automatically identifying root causes and agent actions, which helped streamline case resolution and improve the agent knowledge base.
â Leveraged BERTopic on a dataset of over 13,000 "complete issue" data points to uncover

In [4]:
docs

[Document(metadata={'Header 1': '#TITLE#', 'source': 'https://fearless-writers-028990.framer.app', 'section_label': 'fearless-writers-028990.framer.app', 'section_header': 'Start of bodyStart', 'section_type': 'remote_html'}, page_content='Start of bodyStart  \n  End of bodyStart  \n \n $ html body { background: var(--token-29c1a320-cdc0-4d5e-a54f-37299311641d, rgb(255, 255, 255)); } \n \n \n \n \n $ \n \n Ritam Upadhyay \n Data Scientist \n /$ \n \n \n $ \n $ \n Home \n /$ \n \n /$ $ \n $ \n About \n /$ \n \n /$ $ \n $ \n Projects \n /$ \n \n /$ $ \n $ \n Stacks \n /$ \n \n /$ \n \n \n \n \n \n \n \n \n \n Search \n \n \n \n \n $ /$ \n \n Menu \n \n \n \n \n \n \n \n \n $ /$ \n \n \n $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ \n \n \n \n \n \n \n \n \n \n \n \n \n \n Available for work \n \n \n \n $ \n $ \n Back to homepage \n \n \n /$ \n \n \n $ \n Back to homepage \n \n \n /$ \n \n \n $ \n Back to homepage \n \n \n /$ \n \n /$ \n \n \n \n $ \n \n Rit

In [10]:
# === Notebook cell: Load your FAISS vectorstore and inspect retrieved docs ===
import importlib
import traceback
from typing import List, Any, Set, Tuple
from pprint import pprint

# LangChain objects
from langchain.schema import Document, BaseRetriever

# Try to import your project's load_vectorstore (index_builder.load_vectorstore)
# Adjust the module path if your repo layout differs (you showed rag_core/index_builder.py).
load_vectorstore = None
try:
    mod = importlib.import_module("rag_core.index_builder")
    if hasattr(mod, "load_vectorstore"):
        load_vectorstore = getattr(mod, "load_vectorstore")
        print("Using rag_core.index_builder.load_vectorstore()")
except Exception as e:
    print("Could not import rag_core.index_builder.load_vectorstore():", e)
    traceback.print_exc()

if load_vectorstore is None:
    # fallback: try index_builder at repo root
    try:
        mod = importlib.import_module("index_builder")
        if hasattr(mod, "load_vectorstore"):
            load_vectorstore = getattr(mod, "load_vectorstore")
            print("Using index_builder.load_vectorstore()")
    except Exception as e:
        print("Fallback import index_builder.load_vectorstore() failed:", e)

if load_vectorstore is None:
    raise RuntimeError("Could not find load_vectorstore(). Edit the import above to point to your index_builder module.")

# 1) Load vectorstore (this uses get_embeddings() internally per your index_builder.py)
print("Loading vectorstore from disk (this may take a moment)...")
vectorstore = load_vectorstore()
print("Vectorstore loaded:", type(vectorstore))

# 2) Extract all Documents from the vectorstore docstore (works for FAISS/most langchain stores)
def extract_all_docs_from_vectorstore(vs) -> List[Document]:
    # Preferred: docstore._dict
    if hasattr(vs, "docstore") and hasattr(vs.docstore, "_dict"):
        docs = list(vs.docstore._dict.values())
        return docs
    # If not, try similarity_search trick (get top candidates, then dedupe)
    try:
        cand = vs.similarity_search("test", k=100)
        # ensure Document objects
        docs = [d for d in cand if isinstance(d, Document)]
        if docs:
            return docs
    except Exception as e:
        print("similarity_search fallback failed:", e)
    raise RuntimeError("Could not enumerate docs from vectorstore. Inspect object manually in the notebook.")

all_docs = extract_all_docs_from_vectorstore(vectorstore)
print(f"Extracted {len(all_docs)} documents from vectorstore")

# 3) Build or import PrefixRetriever (use your local one if present)
PrefixRetriever = None
try:
    pr_mod = importlib.import_module("PrefixRetreiver")
    PrefixRetriever = getattr(pr_mod, "PrefixRetriever", None)
    if PrefixRetriever:
        print("Imported PrefixRetriever from PrefixRetreiver.py")
except Exception:
    pass

if PrefixRetriever is None:
    print("Using notebook fallback PrefixRetriever")
    class PrefixRetriever:
        def __init__(self, docs: List[Document], k:int=3, max_lines:int=8):
            self.docs = docs
            self.k = k
            self.max_lines = max_lines
        def _head(self, content: str) -> str:
            lines = [ln for ln in content.splitlines() if ln.strip()]
            return "\n".join(lines[: self.max_lines])
        def get_relevant_documents(self, query: str) -> List[Document]:
            q = (query or "").lower().strip()
            tokens = [t for t in q.split() if t]
            out = []
            for d in self.docs:
                header = d.metadata.get("section_header") or d.metadata.get("section_label") or ""
                head_text = (header + "\n" + self._head(d.page_content)).lower()
                if q and (q in head_text or all(tok in head_text for tok in tokens)):
                    out.append(d)
                if len(out) >= self.k:
                    break
            return out

# 4) Build a small Pydantic-friendly fused retriever (works with your vector retriever)
# try to get a vector retriever using vectorstore.as_retriever
vector_retriever = None
try:
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 8})
    print("Constructed vector_retriever using vectorstore.as_retriever(...)")
except Exception as e:
    print("vectorstore.as_retriever(...) failed:", e)

# Define FusedRetriever that is accepted by LangChain (Pydantic BaseModel-backed)
class FusedRetriever(BaseRetriever):
    prefix_retriever: Any
    vector_retriever: Any
    k: int = 4
    prefix_first: bool = True
    class Config:
        arbitrary_types_allowed = True
    def __init__(self, prefix_retriever: Any, vector_retriever: Any = None, k: int = 4, prefix_first: bool = True, **kwargs):
        super().__init__(prefix_retriever=prefix_retriever, vector_retriever=vector_retriever, k=k, prefix_first=prefix_first, **kwargs)
    def get_relevant_documents(self, query: str) -> List[Document]:
        # get prefix candidates
        prefix_docs = []
        if hasattr(self.prefix_retriever, "get_relevant_documents"):
            prefix_docs = self.prefix_retriever.get_relevant_documents(query)
        elif hasattr(self.prefix_retriever, "_get_relevant_documents"):
            prefix_docs = self.prefix_retriever._get_relevant_documents(query)
        # get vector candidates
        vector_docs = []
        if self.vector_retriever is not None:
            try:
                if hasattr(self.vector_retriever, "get_relevant_documents"):
                    vector_docs = self.vector_retriever.get_relevant_documents(query)
                elif hasattr(self.vector_retriever, "retrieve"):
                    vector_docs = self.vector_retriever.retrieve(query)
            except Exception:
                vector_docs = []
        # fuse + dedupe
        seen = set()
        out = []
        def add_docs(docs):
            for d in docs:
                key = (d.metadata.get("source"), d.page_content[:200])
                if key in seen:
                    continue
                seen.add(key)
                out.append(d)
                if len(out) >= self.k:
                    return True
            return False
        if self.prefix_first:
            finished = add_docs(prefix_docs)
            if not finished:
                add_docs(vector_docs)
        else:
            finished = add_docs(vector_docs)
            if not finished:
                add_docs(prefix_docs)
        return out[: self.k]

# instantiate retrievers
prefix_retriever = PrefixRetriever(docs=all_docs, k=6, max_lines=8)
fused_retriever = FusedRetriever(prefix_retriever=prefix_retriever, vector_retriever=vector_retriever, k=6, prefix_first=True)

# Helper to pretty-print retrieved docs
def inspect_docs(docs: List[Document], title: str, preview_chars: int = 400):
    print("\n" + "="*80)
    print(title)
    print("="*80)
    for i, d in enumerate(docs, 1):
        print(f"\n--- DOC {i} ---")
        print("HEADER       :", d.metadata.get("section_header"))
        print("SECTION LABEL:", d.metadata.get("section_label"))
        print("SOURCE       :", d.metadata.get("source"))
        print("CONTENT LEN  :", len(d.page_content))
        print("CONTENT PREV :")
        print(d.page_content[:preview_chars].strip())
        print("...")

# 5) Run a few queries and inspect results
queries = ["tell me about your work experience"]

for q in queries:
    print("\n\n" + "#"*20 + f" QUERY: {q}" + "#"*20)
    try:
        pdocs = prefix_retriever.get_relevant_documents(q)
        inspect_docs(pdocs, "PREFIX RETRIEVER RESULTS")
    except Exception as e:
        print("Prefix retriever error:", e, traceback.format_exc())
    try:
        vdocs = []
        if vector_retriever is not None:
            vdocs = vector_retriever.get_relevant_documents(q)
            inspect_docs(vdocs, "VECTOR RETRIEVER RESULTS")
        else:
            print("Vector retriever not available (vectorstore.as_retriever failed earlier).")
    except Exception as e:
        print("Vector retriever error:", e, traceback.format_exc())
    try:
        fdocs = fused_retriever.get_relevant_documents(q)
        inspect_docs(fdocs, "FUSED RETRIEVER RESULTS")
    except Exception as e:
        print("Fused retriever error:", e, traceback.format_exc())

print("\nDone — inspect the HEADER and CONTENT LEN in FUSED RETRIEVER RESULTS for 'work experience'. If it looks like fragments (short content), we'll adjust chunking / merging next.")


Using rag_core.index_builder.load_vectorstore()
Loading vectorstore from disk (this may take a moment)...


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open /Users/ritamupadhyay/Documents/Ritam_QA/data/vectorstore/career_faiss_index/index.faiss for reading: No such file or directory

In [6]:
# === Notebook cell: Map query -> section labels with BART zero-shot, then fetch docs ===
import importlib
from typing import List, Dict, Tuple
from collections import defaultdict
import math
import json
import pprint

# HF pipeline
from transformers import pipeline

# LangChain Document
from langchain.schema import Document

# ---- helper: load your docs / vectorstore (adapt if needed) ----
# Try to import your loader; adjust module path if different
try:
    idx_mod = importlib.import_module("rag_core.index_builder")
    load_vectorstore = getattr(idx_mod, "load_vectorstore")
    vs = load_vectorstore()
    all_docs = list(vs.docstore._dict.values())
    vector_retriever = vs.as_retriever(search_kwargs={"k": 6})
    print(f"Loaded {len(all_docs)} docs from FAISS vectorstore")
except Exception as e:
    print("Could not load vectorstore via rag_core.index_builder.load_vectorstore(); try loading serialized docs instead:", e)
    # fallback: try a docs pickle/json in data/ (you can plug your loader here)
    raise

# ---- 1) build a label vocabulary from indexed docs ----
def build_label_vocab(docs: List[Document]) -> List[str]:
    labels = []
    seen = set()
    for d in docs:
        header = (d.metadata.get("section_header") or "").strip()
        s_label = (d.metadata.get("section_label") or "").strip()
        # prefer a short header label if available
        candidates = [header, s_label]
        for c in candidates:
            if not c:
                continue
            # Normalization: remove newlines, collapse whitespace, limit length
            normalized = " ".join(c.split())
            if len(normalized) > 120:
                normalized = normalized[:120] + "..."
            if normalized not in seen:
                seen.add(normalized)
                labels.append(normalized)
    return labels

labels = build_label_vocab(all_docs)
print(f"Found {len(labels)} candidate labels (examples):", labels[:20])

# ---- 2) zero-shot classifier using BART-MNLI ----
# This uses HuggingFace pipeline ("zero-shot-classification") with a NLI model.
# It scores each label as how well the query fits that label.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def map_query_to_labels_zero_shot(query: str, candidate_labels: List[str], top_k:int=5, score_threshold:float=0.40) -> List[Tuple[str,float]]:
    """
    Returns a list of (label, score) pairs selected for this query.
    - top_k: number of highest scoring labels to consider
    - score_threshold: min probability for accepting a label
    """
    if not candidate_labels:
        return []

    # The pipeline accepts a list; we pass all labels and request top_k
    out = classifier(query, candidate_labels, multi_label=True)
    # out structure: {'sequence':..., 'labels':[...], 'scores':[...]}
    labels_out = out["labels"]
    scores_out = out["scores"]
    selected = []
    for lbl, score in zip(labels_out[:top_k], scores_out[:top_k]):
        if score >= score_threshold:
            selected.append((lbl, float(score)))
    # If nothing passed threshold, return the top label (best guess) to avoid empty result
    if not selected and labels_out:
        selected = [(labels_out[0], float(scores_out[0]))]
    return selected

# ---- 3) fetch docs that match chosen labels ----
def fetch_docs_by_labels(selected_labels: List[str], docs: List[Document]) -> List[Document]:
    """
    Return all docs whose section_header or section_label matches any selected_label.
    Matching is fuzzy: case-insensitive substring match against metadata fields.
    """
    if not selected_labels:
        return []
    out = []
    sel_norm = [s.lower() for s in selected_labels]
    for d in docs:
        header = (d.metadata.get("section_header") or "").lower()
        s_label = (d.metadata.get("section_label") or "").lower()
        combined = header + " " + s_label
        for s in sel_norm:
            if not s:
                continue
            if s in combined:
                out.append(d)
                break
    return out

# ---- 4) end-to-end helper: map query -> docs (with vector fallback) ----
def retrieve_by_label_mapping(query: str, docs: List[Document], classifier_pipeline=None, top_k_labels=5, score_threshold=0.40, vector_retriever=None):
    if classifier_pipeline is None:
        # use the prebuilt classifier if not passed
        classifier_pipeline = classifier

    candidate_labels = build_label_vocab(docs)
    mapped = map_query_to_labels_zero_shot(query, candidate_labels, top_k=top_k_labels, score_threshold=score_threshold)
    print("Mapped labels (label,score):", mapped)

    chosen_labels = [lbl for lbl, _ in mapped]
    fetched = fetch_docs_by_labels(chosen_labels, docs)
    print(f"Found {len(fetched)} docs matching labels")

    if not fetched and vector_retriever is not None:
        # fallback: return semantic vector search results
        print("Label mapping returned nothing; falling back to vector retriever.")
        fetched = vector_retriever.get_relevant_documents(query)
    return mapped, fetched

# ---- 5) try it ----
queries = [
    #"show me work experience",
    "where did you work and what did you do at Juniper?",
    #"education details",
    #"list publications",
]

for q in queries:
    print("\n" + "="*80)
    print("QUERY:", q)
    mapped, docs_fetched = retrieve_by_label_mapping(q, all_docs, classifier_pipeline=classifier, top_k_labels=5, score_threshold=0, vector_retriever=vector_retriever)
    for i, d in enumerate(docs_fetched, 1):
        print(f"\n--- FETCHED DOC {i} ---")
        print("HEADER:", d.metadata.get("section_header"))
        print("LABEL:", d.metadata.get("section_label"))
        print("SOURCE:", d.metadata.get("source"))
        print("CONTENT LEN:", len(d.page_content))
        print("PREVIEW:", d.page_content[:600].strip()[:600])
    print("\n" + "="*80)

# ---- end cell ----


Loaded 69 docs from FAISS vectorstore
Found 34 candidate labels (examples): ['Start of bodyStart', 'RAG-PDF QA Method -', 'Project Overview', 'Key Highlights', 'Rag Pdf Qa Method', 'Account Statement Verification -', 'Account Statement Verification', 'Python', 'Other stack', "Let's Transform Ideas into Reality", 'My Latest Works', 'Stack', 'Streamlit', 'Next Word - Next Word Prediction', 'Next Word', 'About', 'Contact', 'Education', 'Work experience', 'Projects']


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



QUERY: where did you work and what did you do at Juniper?
Mapped labels (label,score): [('Work experience', 0.9035468697547913), ('Low Cost Vision Based Gripper', 0.4155164361000061), ('Project Overview', 0.29346197843551636), ('Low Cost Vision-Based Gripper - Camera based gripper to locate, localize and grasp object', 0.24844245612621307), ('Key Highlights', 0.24241366982460022)]
Found 19 docs matching labels

--- FETCHED DOC 1 ---
HEADER: Project Overview
LABEL: None
SOURCE: https://fearless-writers-028990.framer.app/project/rag-pdf-qa-method
CONTENT LEN: 372
PREVIEW: Project Overview 
 
 This project aimed to build an interface so that user could provide a PDF and ask specific questions to the LLM based on the pdf content. The LLM also indicated the paragraphs and texts that led to the answer that it provided. This reduced the chances of hallucinations by the model. 
 $ https://github.com/ritam3/RAG-PDF-HuggingFace-Groq/tree/main /$

--- FETCHED DOC 2 ---
HEADER: Key Highlights
LAB

In [10]:
# === Notebook cell: Map query -> section labels with BART zero-shot, then fetch docs ===
import importlib
from typing import List, Dict, Tuple
from collections import defaultdict
import math
import json
import pprint

# HF pipeline
from transformers import pipeline

# LangChain Document
from langchain.schema import Document

# ---- helper: load your docs / vectorstore (adapt if needed) ----
# Try to import your loader; adjust module path if different
try:
    idx_mod = importlib.import_module("rag_core.index_builder")
    load_vectorstore = getattr(idx_mod, "load_vectorstore")
    vs = load_vectorstore()
    all_docs = list(vs.docstore._dict.values())
    vector_retriever = vs.as_retriever(search_kwargs={"k": 6})
    print(f"Loaded {len(all_docs)} docs from FAISS vectorstore")
except Exception as e:
    print("Could not load vectorstore via rag_core.index_builder.load_vectorstore(); try loading serialized docs instead:", e)
    # fallback: try a docs pickle/json in data/ (you can plug your loader here)
    raise

# ---- 1) build a label vocabulary from indexed docs ----
def build_label_vocab(docs: List[Document]) -> List[str]:
    labels = []
    seen = set()
    for d in docs:
        header = (d.metadata.get("section_header") or "").strip()
        s_label = (d.metadata.get("section_label") or "").strip()
        # prefer a short header label if available
        candidates = [header, s_label]
        for c in candidates:
            if not c:
                continue
            # Normalization: remove newlines, collapse whitespace, limit length
            normalized = " ".join(c.split())
            if len(normalized) > 120:
                normalized = normalized[:120] + "..."
            if normalized not in seen:
                seen.add(normalized)
                labels.append(normalized)
    return labels

labels = build_label_vocab(all_docs)
print(f"Found {len(labels)} candidate labels (examples):", labels[:20])

# ---- 2) zero-shot classifier using BART-MNLI ----
# This uses HuggingFace pipeline ("zero-shot-classification") with a NLI model.
# It scores each label as how well the query fits that label.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def map_query_to_labels_zero_shot(query: str, candidate_labels: List[str], top_k:int=5, score_threshold:float=0.40) -> List[Tuple[str,float]]:
    """
    Returns a list of (label, score) pairs selected for this query.
    - top_k: number of highest scoring labels to consider
    - score_threshold: min probability for accepting a label
    """
    if not candidate_labels:
        return []

    # The pipeline accepts a list; we pass all labels and request top_k
    out = classifier(query, candidate_labels, multi_label=True)
    # out structure: {'sequence':..., 'labels':[...], 'scores':[...]}
    labels_out = out["labels"]
    scores_out = out["scores"]
    selected = []
    for lbl, score in zip(labels_out[:top_k], scores_out[:top_k]):
        if score >= score_threshold:
            selected.append((lbl, float(score)))
    # If nothing passed threshold, return the top label (best guess) to avoid empty result
    if not selected and labels_out:
        selected = [(labels_out[0], float(scores_out[0]))]
    return selected

from typing import List, Tuple, Dict

def fetch_docs_by_labels_with_scores(selected_labels: List[Tuple[str, float]], docs: List[Document]) -> List[Tuple[Document, float, List[str]]]:
    """
    For each doc, determine which of the selected_labels it matches (substring match
    against section_header or section_label). Return list of tuples:
      (Document, score, matched_label_list)
    where score is the max label score among matched labels.
    """
    if not selected_labels:
        return []

    # map label text -> score
    label_score: Dict[str, float] = {lbl.lower(): sc for lbl, sc in selected_labels}

    out: List[Tuple[Document, float, List[str]]] = []
    for d in docs:
        header = (d.metadata.get("section_header") or "").lower()
        s_label = (d.metadata.get("section_label") or "").lower()
        combined = header + " " + s_label

        matched_labels = []
        matched_scores = []
        for lbl_lower, sc in label_score.items():
            if lbl_lower and lbl_lower in combined:
                matched_labels.append(lbl_lower)
                matched_scores.append(sc)

        if matched_labels:
            # score for this doc is the max score among matched labels
            doc_score = max(matched_scores)
            out.append((d, doc_score, matched_labels))

    return out


def retrieve_by_label_mapping_ranked(query: str, docs: List[Document], classifier_pipeline=None, top_k_labels=5, score_threshold=0.40, vector_retriever=None):
    """
    Map query -> labels (with scores), fetch docs that match those labels, and
    return (mapped_labels, ranked_docs) where ranked_docs is a list of tuples:
      (Document, score, matched_labels)
    sorted by score desc.
    Falls back to vector retriever if no docs found via labels.
    """
    if classifier_pipeline is None:
        classifier_pipeline = classifier

    candidate_labels = build_label_vocab(docs)
    mapped = map_query_to_labels_zero_shot(query, candidate_labels, top_k=top_k_labels, score_threshold=score_threshold)
    print("Mapped labels (label,score):", mapped)

    # fetch docs with associated label scores
    docs_with_scores = fetch_docs_by_labels_with_scores(mapped, docs)
    print(f"Found {len(docs_with_scores)} docs matching labels")

    if not docs_with_scores and vector_retriever is not None:
        # fallback: return semantic vector search results (assign them a default score based on similarity rank)
        print("Label mapping returned nothing; falling back to vector retriever.")
        vec_docs = vector_retriever.get_relevant_documents(query)
        # assign decreasing confidence scores starting from 0.6 downwards to indicate fallback
        ranked = []
        base = 0.6
        step = 0.02
        for i, d in enumerate(vec_docs):
            score = max(0.0, base - i * step)
            ranked.append((d, score, ["vector_fallback"]))
        return mapped, ranked

    # If a doc matched multiple labels, we already collapsed to max score in fetch helper.
    # Now sort descending by score, and dedupe by (source + content[:200]) keeping highest-scored occurrence.
    seen_keys = set()
    deduped_sorted = []
    for d, sc, matched in sorted(docs_with_scores, key=lambda x: x[1], reverse=True):
        key = (d.metadata.get("source"), d.page_content[:200])
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped_sorted.append((d, sc, matched))

    return mapped, deduped_sorted
 

# ---- 5) try it ----
queries = [
    # "Tell me about your work in python and LLMs", 
    # "What are the companies you have worked at ?",
    # "Tell me the companies you have visited so far ?",
    # "show me work experience and projects",
    # "where did you work and what did you do at Juniper?",
    # "education details",
    # "list publications",
    # "Tell me about the project regarding account statement verification",
    # "Tell me about the improper face uploads",
    # "Tell me something about yourself",
    # "Describe your biggest struggle",
    # "Explain anything where you have used python",
    "DO you have experience with NLP if so then describe where you have used it"
    
]

for q in queries:
    print("\n" + "="*80)
    print("QUERY:", q)
    mapped, ranked_docs = retrieve_by_label_mapping_ranked(q, all_docs, classifier_pipeline=classifier, top_k_labels=5, score_threshold=0.35, vector_retriever=vector_retriever)
    for i, (d, score, matched_labels) in enumerate(ranked_docs, 1):
        print(f"\n--- FETCHED DOC {i} ---")
        print("SCORE:", score)
        print("MATCHED LABELS:", matched_labels)
        print("HEADER:", d.metadata.get("section_header"))
        print("LABEL:", d.metadata.get("section_label"))
        print("SOURCE:", d.metadata.get("source"))
        print("CONTENT LEN:", len(d.page_content))
        print("PREVIEW:", d.page_content[:600].strip())
    print("\n" + "="*80)

# ---- end cell ----


Loaded 57 docs from FAISS vectorstore
Found 29 candidate labels (examples): ['Start of bodyStart', 'Account Statement Verification -', 'Project Overview', 'Key Highlights', 'Account Statement Verification', 'About', 'Contact', 'Education', 'Work experience', 'Projects', 'Stack', 'Certifications', 'Research Papers', 'RAG-PDF QA Method -', 'Rag Pdf Qa Method', 'AI based Name Matcher - Matching 2 Indian Names to match the similarity', 'Ai Based Name Matcher', 'My Latest Works', 'Improper Face Detection at Frontend - Finding faces that do not meet requirements', 'Improper Face Detection At Frontend']


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



QUERY: DO you have experience with NLP if so then describe where you have used it
Mapped labels (label,score): [('About', 0.5211944580078125), ('Contact', 0.48030656576156616), ('Next Word - Next Word Prediction', 0.35503268241882324)]
Found 3 docs matching labels

--- FETCHED DOC 1 ---
SCORE: 0.5211944580078125
MATCHED LABELS: ['about']
HEADER: About
LABEL: None
SOURCE: https://fearless-writers-028990.framer.app/
CONTENT LEN: 459
PREVIEW: About 
 Welcome to my portfolio! I am an ASU Master's student ('26) with experience at Juniper Networks and Paytm Money. I leverage ML/AI to solve complex business challenges, specializing in NLP, computer vision, and predictive analytics. Seeking roles in Data Science, AI/ML, and NLP to drive impact at scale. 
 
 
 $ 
 $ My projects 
 /$ 
 
 
 $ My projects 
 /$ 
 
 
 $ My projects 
 /$ 
 
 /$ $ 
 $ Contact /$ 
 
 
 $ Contact /$ 
 
 
 $ Contact /$ 
 
 /$

--- FETCHED DOC 2 ---
SCORE: 0.48030656576156616
MATCHED LABELS: ['contact']
HEADER: Contact
L

In [11]:
all_docs

[Document(metadata={'Header 1': '#TITLE#', 'source': 'https://fearless-writers-028990.framer.app/project/account-statement-verification', 'section_header': 'Start of bodyStart', 'section_type': 'remote_html'}, page_content='Start of bodyStart  \n  End of bodyStart  \n \n $ \n \n \n \n $ \n \n Ritam Upadhyay \n Data Scientist \n /$ \n \n \n $ \n $ \n Home \n /$ \n \n /$ $ \n $ \n About \n /$ \n \n /$ $ \n $ \n Projects \n /$ \n \n /$ $ \n $ \n Stacks \n /$ \n \n /$ \n \n \n \n \n \n \n \n \n \n Search \n \n \n \n \n $ /$ \n \n \n $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ $ \n $ /$ \n \n /$ \n \n \n \n \n \n \n \n \n \n \n \n $ /$ \n \n Menu \n \n \n \n \n \n \n \n \n \n \n Available for work'),
 Document(metadata={'Header 1': 'Account Statement Verification -', 'source': 'https://fearless-writers-028990.framer.app/project/account-statement-verification', 'section_header': 'Account Statement Verification -', 'section_type': 'remote_html'}, page_content='Account Statement V