## Data Ingestion

In [2]:
from langchain_core.documents import Document

In [3]:
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders import TextLoader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
loader_text =TextLoader("../data/ssa_faq.txt", encoding = "utf-8")
#print(loader_text)
document = loader_text.load()
print(document)

[Document(metadata={'source': '../data/ssa_faq.txt'}, page_content='Title: Will I get my payments during a Federal Government shutdown?\nWill I get my payments during a Federal Government shutdown?\nOctober 1, 2025\n\xa0·\xa0\nEn español\n\xa0·\xa0\nShare\nIn the event of a Federal Government shutdown due to a lapse in funding, payments to Social Security beneficiaries and SSI recipients will continue with no change in payment dates.\n\n---\n\nTitle: Will Social Security operate normally during the Federal Government shutdown?\nWill Social Security operate normally during the Federal Government shutdown?\nOctober 1, 2025\n\xa0·\xa0\nEn español\n\xa0·\xa0\nShare\nPlease visit our \nFederal Government Shutdown\n webpage to learn which services we will provide and which services we cannot provide during the shutdown.\n\n---\n\nTitle: If I am not a U.S. citizen, can I get a Social Security number?\nIf I am not a U.S. citizen, can I get a Social Security number?\nSeptember 11, 2025\n\xa0·\x

In [5]:
import glob

In [6]:
file_paths= glob.glob("../data/*.json")

In [7]:
print(file_paths)

['../data\\ssa_disability_structured.json', '../data\\ssa_family_structured.json', '../data\\ssa_retirement_structured.json', '../data\\ssa_ssi_structured.json', '../data\\ssa_survivor_structured.json']


In [8]:
from langchain_community.document_loaders import JSONLoader

all_docs = []

for file_path in file_paths:
    loader_json = JSONLoader(
        file_path=file_path,
        jq_schema="""
        (
          .sections[] | {heading: .heading, paragraphs: .paragraphs, list_items: .list_items, link: .link}
        ),
        (
          .subpages[]?.sections[] | {heading: .heading, paragraphs: .paragraphs, list_items: .list_items, link: .link}
        )
        """,
        text_content=False
    )
    docs = loader_json.load()
    all_docs.extend(docs)

print(f"✅ Loaded {len(all_docs)} documents from {len(file_paths)} files.")


✅ Loaded 50 documents from 5 files.


In [9]:
all_docs

[Document(metadata={'source': 'F:\\Utah_State\\AI_Deploy\\rag_langchain\\data\\ssa_disability_structured.json', 'seq_num': 1}, page_content='{"heading": "", "paragraphs": [], "list_items": [], "link": "https://www.ssa.gov/apply?benefits=disability&age=adult"}'),
 Document(metadata={'source': 'F:\\Utah_State\\AI_Deploy\\rag_langchain\\data\\ssa_disability_structured.json', 'seq_num': 2}, page_content='{"heading": "Who can get Disability", "paragraphs": ["Individuals may be eligible for Disability if they have:", "See if you might be eligible for Disability"], "list_items": ["A disability or blindness, and", "Enough work history."], "link": "https://www.ssa.gov/disability/eligibility"}'),
 Document(metadata={'source': 'F:\\Utah_State\\AI_Deploy\\rag_langchain\\data\\ssa_disability_structured.json', 'seq_num': 3}, page_content='{"heading": "What you could get", "paragraphs": ["Your benefits may include a monthly payment and Medicare. Your payment amount is based on your work history befor

# For Pdf loader

In [10]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import DirectoryLoader

In [11]:
import tqdm

In [12]:
pdf_loader = DirectoryLoader(
    "../data/ssn_pdf",
    glob = "**/*.pdf",
    loader_cls= PyMuPDFLoader,
    show_progress= True
)
pdf_documents = pdf_loader.load()
pdf_documents

100%|██████████| 7/7 [00:01<00:00,  4.31it/s]


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 16.2 (Windows)', 'creationdate': '2021-10-01T10:38:38-04:00', 'source': '..\\data\\ssn_pdf\\EN-05-10009.pdf', 'file_path': '..\\data\\ssn_pdf\\EN-05-10009.pdf', 'total_pages': 1, 'format': 'PDF 1.6', 'title': 'Social Security Number and Card — Deferred Action For Childhood Arrivals', 'author': 'Social Security Administration', 'subject': '', 'keywords': '', 'moddate': '2021-10-01T10:44:58-04:00', 'trapped': '', 'modDate': "D:20211001104458-04'00'", 'creationDate': "D:20211001103838-04'00'", 'page': 0}, page_content='SSA.gov\nSocial Security Administration\nPublication No. 05-10009\nSeptember 2021 (Recycle prior editions)\nSocial Security Number and Card — Deferred Action For Childhood Arrivals\nProduced and published at U.S. taxpayer expense\nSocial Security Number and Card — \nDeferred Action For Childhood Arrivals\nHow do I apply for a Social Security \nnumber (SSN)?\nDeferred Action for Childhood Ar

In [13]:
type(pdf_documents[0])

langchain_core.documents.base.Document

## Chunking

In [14]:
all_documents = document + all_docs + pdf_documents

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # each chunk ~1000 characters
    chunk_overlap=200,   # overlap for context continuity
    length_function=len, # how to measure length (you can replace with a token counter later)
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)

chunks = text_splitter.split_documents(all_documents)

print(f"✅ Created {len(chunks)} chunks from {len(all_documents)} documents.")


✅ Created 167 chunks from 74 documents.


In [16]:
len(chunks)

167

## Embedding and VectorDB

In [17]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
class EmbeddingManager:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embedding(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        embeddings = np.array(embeddings, dtype=np.float32)
        print(f"✅ Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self)->int:

        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()


In [19]:
# initiation of embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x25dc502b410>

# Vector DB

In [20]:
import os
import uuid
import numpy as np
import chromadb
from typing import List, Any


In [21]:
class VectorStore:
    def __init__(self, collection_name: str = "custom_database", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)

            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF and text document embeddings for RAG"}
            )

            print(f"✅ Vector store initialized. Collection: {self.collection_name}")
            print(f"📦 Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"✅ Successfully added {len(documents)} documents to vector store.")
            print(f"📈 Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error adding documents to vector store: {e}")
            raise


In [22]:
len(chunks)

167

In [23]:
texts = [doc.page_content for doc in chunks]
texts

['Title: Will I get my payments during a Federal Government shutdown?\nWill I get my payments during a Federal Government shutdown?\nOctober 1, 2025\n\xa0·\xa0\nEn español\n\xa0·\xa0\nShare\nIn the event of a Federal Government shutdown due to a lapse in funding, payments to Social Security beneficiaries and SSI recipients will continue with no change in payment dates.\n\n---\n\nTitle: Will Social Security operate normally during the Federal Government shutdown?\nWill Social Security operate normally during the Federal Government shutdown?\nOctober 1, 2025\n\xa0·\xa0\nEn español\n\xa0·\xa0\nShare\nPlease visit our \nFederal Government Shutdown\n webpage to learn which services we will provide and which services we cannot provide during the shutdown.\n\n---',
 '---\n\nTitle: If I am not a U.S. citizen, can I get a Social Security number?\nIf I am not a U.S. citizen, can I get a Social Security number?\nSeptember 11, 2025\n\xa0·\xa0\nEn español\n\xa0·\xa0\nShare\nIn general, only nonciti

In [24]:
# ✅ Generate embeddings
embeddings_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embeddings = embeddings_manager.generate_embedding(texts)

# ✅ Add to vector store
vectorstore = VectorStore()
vectorstore.add_documents(chunks, embeddings)

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384
Generating embeddings for 167 texts...


Batches: 100%|██████████| 6/6 [00:07<00:00,  1.17s/it]


✅ Generated embeddings with shape: (167, 384)
✅ Vector store initialized. Collection: custom_database
📦 Existing documents in collection: 167
Adding 167 documents to vector store...
✅ Successfully added 167 documents to vector store.
📈 Total documents in collection: 334


# Rag Retrieval Pipeline

In [25]:
from typing import List, Dict, Any

class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"🔍 Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embedding([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []

            if results.get("documents") and results["documents"][0]:
                documents = results["documents"][0]
                metadatas = results["metadatas"][0]
                distances = results["distances"][0]
                ids = results["ids"][0]

                for i, (doc_id, doc_text, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert cosine distance to similarity score
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": doc_text,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            "distance": distance,
                            "rank": i + 1
                        })

                print(f"✅ Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("⚠️ No documents found")

            return retrieved_docs

        except Exception as e:
            print(f"❌ Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(vectorstore, embedding_manager)

In [26]:
rag_retriever

<__main__.RAGRetriever at 0x25dc6423450>

In [27]:
rag_retriever.retrieve("What is SSA benefits")

🔍 Retrieving documents for query: 'What is SSA benefits'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.40it/s]

✅ Generated embeddings with shape: (1, 384)
✅ Retrieved 5 documents (after filtering)





[{'id': 'doc_d580b8fb_75',
  'content': '{"heading": "How income affects your payment", "paragraphs": ["For every $2 you earn from work, we reduce your SSI payment by about $1. Work includes:", "For every $1 you get from non-work sources, we reduce your SSI payment by about $1. Non-work sources includes:", "If you live with a spouse, their income may affect your payment.", "Children on SSI who live with their parents may have their payments lowered based on their income or their parents\' income.", "Learn more about how income affects your payment"], "list_items": ["A job", "Self-employment", "Any activity that earns money", "Disability benefits", "Unemployment payments", "Pensions"], "link": "https://www.ssa.gov/ssi/text-income-ussi.htm"}',
  'metadata': {'seq_num': 6,
   'source': 'F:\\Utah_State\\AI_Deploy\\rag_langchain\\data\\ssa_ssi_structured.json',
   'content_length': 707,
   'doc_index': 75},
  'similarity_score': 0.09693664312362671,
  'distance': 0.9030633568763733,
  'rank

In [28]:
rag_retriever.retrieve("How can students make SSN")

🔍 Retrieving documents for query: 'How can students make SSN'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 44.50it/s]

✅ Generated embeddings with shape: (1, 384)
✅ Retrieved 5 documents (after filtering)





[{'id': 'doc_d928747c_145',
  'content': '(over)\nInternational Students and Social Security Numbers\nInternational Students and Social \nSecurity Numbers\nAre you temporarily in the United States to \nattend a college, or a language, vocational, \nor nonacademic school? Do you have a \nnonimmigrant F-1, M-1, or J-1 student \nclassification? Your school may ask you for your \nSocial Security number (SSN). Some colleges \nand schools use SSNs as student identification \nnumbers. If you don’t have an SSN, your \ncollege or school should be able to give you \nanother identification number. \nSSNs generally are assigned to people who \nare authorized to work in the United States. \nThey are also used to report your wages to \nthe government and to determine eligibility \nfor Social Security benefits. An SSN will not \nbe issued just for the purpose of enrolling in a \ncollege or school.\nIf you want to get a job on campus, you should \ncontact your designated school official for \ninternat

In [29]:
# rag_retriever = RAGRetriever(vectorstore, embeddings_manager)

# query = "How can I restart my disability benefits?"
# results = rag_retriever.retrieve(query, top_k=5, score_threshold=0.7)

# for doc in results:
#     print(f"\nRank: {doc['rank']}, Score: {doc['similarity_score']:.4f}")
#     print(f"Content: {doc['content'][:300]}...")


## Simple RAG pipeline with Grok LLM


In [31]:
import google.genai as genai

In [32]:
import os

from dotenv import load_dotenv

In [40]:
import os
import google.genai as genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()  # loads .env file variables into environment

# Initialize Gemini client
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def rag_simple(query, retriever, top_k=3):
    """
    RAG pipeline:
    1. Retrieve top-k relevant context chunks.
    2. Construct a context-aware prompt.
    3. Send to Gemini for grounded generation.
    """
    # Step 1: Retrieve top-k relevant chunks
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc["content"] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    # Step 2: Build prompt
    prompt = f"""
Use the following context to answer the question concisely and accurately.

Context:
{context}

Question: {query}

Answer:
"""

    # Step 3: Generate response using Gemini
    response = client.models.generate_content(
        model="gemini-2.5-pro",  # or gemini-2.5-pro if you have access
        contents=prompt
    )

    # Step 4: Return model output
    return response.text.strip() if hasattr(response, "text") else "No answer generated."


# ✅ Example usage
answer = rag_simple("What are SSA disability benefits?", rag_retriever, top_k=3)
print("🧠 Gemini RAG Answer:\n", answer)

# # ✅ Close the client gracefully to avoid cleanup warnings
# client.close()


🔍 Retrieving documents for query: 'What are SSA disability benefits?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.65it/s]

✅ Generated embeddings with shape: (1, 384)
✅ Retrieved 3 documents (after filtering)





🧠 Gemini RAG Answer:
 Based on the context provided, disability benefits are a type of non-work income, similar to unemployment payments and pensions, that can reduce your SSI payment. For every $1 received from disability benefits, your SSI payment is reduced by about $1.


# Enhanced RAG pipeline Features

In [None]:
def rag_advanced(query, retriever, client, top_k=5, min_score=0.2, return_context=False):
    # Retrieve relevant documents
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)

    if not results:
        return {
            "answer": "No relevant context found.",
            "sources": [],
            "confidence": 0.0,
            "context": ""
        }

    # Merge the top documents as context
    context = "\n\n".join([doc["content"] for doc in results])

    # Build source summaries
    sources = [{
        "source": doc["metadata"].get("source_file", doc["metadata"].get("source", "unknown")),
        "page": doc["metadata"].get("page", "unknown"),
        "score": doc["similarity_score"],
        "preview": doc["content"][:120] + "..."
    } for doc in results]

    # Compute a basic confidence score
    confidence = max([doc["similarity_score"] for doc in results])

    # Create the LLM prompt
    prompt = f"""
Use the following context to answer the question concisely and accurately.

Context:
{context}

Question: {query}

Answer:
"""

    # Generate response using Gemini
    response = client.models.generate_content(
        model="gemini-2.5-pro",   # or gemini-2.5-pro if you have access
        contents=prompt
    )

    output = {
        "answer": response.text.strip() if hasattr(response, "text") else "No answer generated.",
        "sources": sources,
        "confidence": confidence
    }

    if return_context:
        output["context"] = context

    return output


# ✅ Example usage
result = rag_advanced("Documents required for international students to make ssn?", rag_retriever, client, top_k=3, min_score=0.1, return_context=True)

print("🧠 Answer:", result["answer"])
print("\n📚 Sources:", result["sources"])
print("\n🔢 Confidence:", result["confidence"])
print("\n📄 Context Preview:", result["context"][:300])


🔍 Retrieving documents for query: 'Documents required for international students to make ssn?'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.78it/s]

✅ Generated embeddings with shape: (1, 384)
✅ Retrieved 3 documents (after filtering)





🧠 Answer: The document doesn't specify all documents required to make an SSN, but it mentions that your employer may use your immigration documents as proof of your authorization to work in the United States. It also notes that all documents must be originals or copies certified by the issuing agency and that a receipt showing you applied for the document is not acceptable.

📚 Sources: [{'source': '..\\data\\ssn_pdf\\EN-05-10181.pdf', 'page': 0, 'score': 0.315851092338562, 'preview': '(over)\nInternational Students and Social Security Numbers\nInternational Students and Social \nSecurity Numbers\nAre you te...'}, {'source': '..\\data\\ssn_pdf\\EN-05-10181.pdf', 'page': 0, 'score': 0.315851092338562, 'preview': '(over)\nInternational Students and Social Security Numbers\nInternational Students and Social \nSecurity Numbers\nAre you te...'}, {'source': '..\\data\\ssn_pdf\\EN-05-10181.pdf', 'page': 1, 'score': 0.29055362939834595, 'preview': 'immigration documents.\nAll documents must be 