In [2]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pathlib import Path
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def process_all_jsons(json_directory):
    all_documents = []
    json_dir = Path(json_directory)
    
    json_files = list(json_dir.glob("**/*.json"))
    print(f"Found {len(json_files)} JSON files to process")
    
    for json_file in json_files:
        print(f"\nProcessing: {json_file.name}")
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            
            # If the JSON file contains a list of items
            if isinstance(data, list):
                for item in data:
                    doc = {
                        "content": item,  # store the JSON item as content
                        "metadata": {
                            "source_file": json_file.name,
                            "file_type": "json"
                        }
                    }
                    all_documents.append(doc)
            else:
                # If the JSON is a single dict
                doc = {
                    "content": data,
                    "metadata": {
                        "source_file": json_file.name,
                        "file_type": "json"
                    }
                }
                all_documents.append(doc)
            
            print(f"Loaded 1 JSON document from {json_file.name}" if not isinstance(data, list) else f"Loaded {len(data)} items from {json_file.name}")
        
        except Exception as e:
            print(f"Error: {e}")
    
    print(f"All JSON documents loaded: {len(all_documents)}")
    return all_documents

# Example usage
all_json_documents = process_all_jsons("../data")


Found 2 JSON files to process

Processing: amazon_jobs.json
Loaded 159 items from amazon_jobs.json

Processing: google_jobs.json
Loaded 271 items from google_jobs.json
All JSON documents loaded: 430


In [4]:
all_json_documents

[{'content': {'Title': 'Data Center Technician (DCO)',
   'URL': 'https://www.amazon.jobs/en/jobs/3111205/data-center-technician-dco',
   'Location': 'Mumbai, MH, IND',
   'JobID': '3111205',
   'Posted': 'Posted October 17, 2025',
   'Description': 'Basic qualifications:\n\nKnowledge of server hardware and components\nKnowledge of fiber optic and copper cabling standards, testing equipment and troubleshooting methodologies\n2+ years of data center experience'},
  'metadata': {'source_file': 'amazon_jobs.json', 'file_type': 'json'}},
 {'content': {'Title': 'Data Center Engineering Operations Engineer, HYD-Infinity - DCEO',
   'URL': 'https://www.amazon.jobs/en/jobs/3120322/data-center-engineering-operations-engineer-hyd-infinity-dceo',
   'Location': 'Hyderabad, TS, IND',
   'JobID': '3120322',
   'Posted': 'Posted November 4, 2025',
   'Description': "Basic qualifications:\n\nHigh school or equivalent diploma\nBachelor's degree in engineering or equivalent, or experience from a techni

In [37]:
def convert_json_to_documents(all_json_documents):
    documents = []
    for item in all_json_documents:
        data = item["content"]

        # Convert the JSON dictionary into clean readable text
        text = ""
        for key, value in data.items():
            text += f"{key}: {value}\n\n"

        documents.append(
            Document(
                page_content=text.strip(),
                metadata=item["metadata"]
            )
        )

    print(f"Converted {len(documents)} JSON entries into Document objects")
    return documents


In [38]:
jobs_docs = convert_json_to_documents(all_json_documents)

Converted 430 JSON entries into Document objects


In [39]:
### Text Splitting into Chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
        
    return split_docs

In [40]:
chunks = split_documents(jobs_docs)
chunks

Split 430 documents into 442 chunks

Example chunk:
Content: Title: Data Center Technician (DCO)

URL: https://www.amazon.jobs/en/jobs/3111205/data-center-technician-dco

Location: Mumbai, MH, IND

JobID: 3111205

Posted: Posted October 17, 2025

Description: B...
Metadata: {'source_file': 'amazon_jobs.json', 'file_type': 'json'}


[Document(metadata={'source_file': 'amazon_jobs.json', 'file_type': 'json'}, page_content='Title: Data Center Technician (DCO)\n\nURL: https://www.amazon.jobs/en/jobs/3111205/data-center-technician-dco\n\nLocation: Mumbai, MH, IND\n\nJobID: 3111205\n\nPosted: Posted October 17, 2025\n\nDescription: Basic qualifications:\n\nKnowledge of server hardware and components\nKnowledge of fiber optic and copper cabling standards, testing equipment and troubleshooting methodologies\n2+ years of data center experience'),
 Document(metadata={'source_file': 'amazon_jobs.json', 'file_type': 'json'}, page_content="Title: Data Center Engineering Operations Engineer, HYD-Infinity - DCEO\n\nURL: https://www.amazon.jobs/en/jobs/3120322/data-center-engineering-operations-engineer-hyd-infinity-dceo\n\nLocation: Hyderabad, TS, IND\n\nJobID: 3120322\n\nPosted: Posted November 4, 2025\n\nDescription: Basic qualifications:\n\nHigh school or equivalent diploma\nBachelor's degree in engineering or equivalent, or

Embedding and Vector Store DB

In [41]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

# embedding_manager=EmbeddingManager(model_name="intfloat/e5-base-v2")
embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x20f1a389e50>

VectorStore

In [43]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 454


<__main__.VectorStore at 0x20f380729d0>

In [44]:
### Convert the text to embddings
texts = [doc.page_content for doc in chunks]

embeddings = embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(documents=chunks, embeddings=embeddings)

Generating embeddings for 442 texts...


Batches: 100%|██████████| 14/14 [00:23<00:00,  1.66s/it]


Generated embeddings with shape: (442, 384)
Adding 442 documents to vector store...
Successfully added 442 documents to vector store
Total documents in collection: 896


### RAGRetriver Pipeline From VectorStore

In [54]:
class RAGRetriever :
    
    def __init__(self, vectorstore: VectorStore, embedding_manager : EmbeddingManager) :
        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]] :
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, score threshold: {score_threshold}")
        
        
        query_embedding=self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            results=self.vectorstore.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            retrieved_docs=[]
            
            if results['documents'] and results['documents'][0] :
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)) :
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold :
                        retrieved_docs.append({
                            'id' : doc_id,
                            'content' : document,
                            'metadata' : metadata,
                            'similarity_score' : similarity_score,
                            'distance' : distance,
                            'rank' : i + 1
                        })
                    print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else :
                print("No documents found")
                
            return retrieved_docs
                
        except Exception as e:
            print(f"Error during retrieval : {e}")
            return []
        
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

In [55]:
rag_retriever

<__main__.RAGRetriever at 0x20f1a385490>

In [49]:
rag_retriever.retrieve("Data Engineer jobs from Amazon for freshers")

Retrieving documents for query: 'Data Engineer jobs from Amazon for freshers'
Top K: 5, score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 38.48it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Retrieved 2 documents (after filtering)
Retrieved 3 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 5 documents (after filtering)





[{'id': 'doc_3abe987e_100',
  'content': 'Title: Data Engineer , Retail Business Service\n\nURL: https://www.amazon.jobs/en/jobs/3127394/data-engineer-retail-business-service\n\nLocation: Bengaluru, KA, IND\n\nJobID: 3127394\n\nPosted: Posted November 17, 2025\n\nDescription: Basic qualifications:\n\n1+ years of data engineering experience\nExperience with SQL\nExperience with data modeling, warehousing and building ETL pipelines\nExperience with one or more query language (e.g., SQL, PL/SQL, DDL, MDX, HiveQL, SparkSQL, Scala)\nExperience with one or more scripting language (e.g., Python, KornShell)',
  'metadata': {'doc_index': 100,
   'source_file': 'amazon_jobs.json',
   'content_length': 548,
   'file_type': 'json'},
  'similarity_score': 0.41706395149230957,
  'distance': 0.5829360485076904,
  'rank': 1},
 {'id': 'doc_0cf075f5_101',
  'content': '{\n  "Title": "Data Engineer , Retail Business Service",\n  "URL": "https://www.amazon.jobs/en/jobs/3127394/data-engineer-retail-busines

### Integration VectorDB Context pipeline With LLM output

In [123]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv


load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")


llm = ChatGroq(groq_api_key=groq_api_key, model_name="openai/gpt-oss-20b", temperature=0.1, max_tokens=2048)


def rag_simple(query: str, retriever: RAGRetriever, llm: ChatGroq, top_k: int = 3):
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc["content"] for doc in results]) if results else ""

    if not context:
        return "No relevant context found."

    prompt = f"""
    You are an expert job extraction assistant.

    Use ONLY the following retrieved job descriptions to answer the question.

    Extract and fill in each field. 
    If a field does not appear in the context, respond with "Not specified".

    <context>
    {context}
    </context>

    Question: {query}

    Return output in the following JSON format:

    [{{
    "Title": "",
    "Company Name": "",
    "Location": "",
    "URL": "",
    "Experience Level": "",
    "Job Description": ""
    }}]
    """

    response = llm.invoke(
        prompt,
        response_format={"type": "json_object"}
    )
    return response.content

In [129]:
answer = rag_simple("Software Dev Engineer jobs for 2 years Experienced", rag_retriever, llm=llm)
print(answer)

Retrieving documents for query: 'Software Dev Engineer jobs for 2 years Experienced'
Top K: 3, score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.01it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Retrieved 2 documents (after filtering)
Retrieved 3 documents (after filtering)





[
{"Title":"Software Engineer III, Core","Company Name":"Google","Location":"Bengaluru, Karnataka, India","URL":"https://www.google.com/about/careers/applications/jobs/results/138354336852779718-software-engineer-iii-core?location=India&page=2","Experience Level":"Mid","Job Description":"Minimum qualifications\nBachelor’s degree or equivalent practical experience.\n2 years of experience with software development in one or more programming languages, or 1 year of experience with an advanced degree.\n2 years of experience with data structures or algorithms.\nExperience in Rust or C++ Programming."},{"Title":"Software Support Engineer","Company Name":"Amazon","Location":"Chennai, TN, IND","URL":"https://www.amazon.jobs/en/jobs/2928982/software-support-engineer","Experience Level":"Not specified","Job Description":"Basic qualifications:\n\nBachelor’s degree in Computer Science or a related field.\n2-4 years overall development/technical support experience.\n2-4 years of experience to troub

In [130]:
json_answer = json.loads(answer)
print(json.dumps(json_answer, indent=2))

[
  {
    "Title": "Software Engineer III, Core",
    "Company Name": "Google",
    "Location": "Bengaluru, Karnataka, India",
    "URL": "https://www.google.com/about/careers/applications/jobs/results/138354336852779718-software-engineer-iii-core?location=India&page=2",
    "Experience Level": "Mid",
    "Job Description": "Minimum qualifications\nBachelor\u2019s degree or equivalent practical experience.\n2 years of experience with software development in one or more programming languages, or 1 year of experience with an advanced degree.\n2 years of experience with data structures or algorithms.\nExperience in Rust or C++ Programming."
  },
  {
    "Title": "Software Support Engineer",
    "Company Name": "Amazon",
    "Location": "Chennai, TN, IND",
    "URL": "https://www.amazon.jobs/en/jobs/2928982/software-support-engineer",
    "Experience Level": "Not specified",
    "Job Description": "Basic qualifications:\n\nBachelor\u2019s degree in Computer Science or a related field.\n2-4

In [131]:
for job in json_answer :
    print(job['Title'])
    print(job['Company Name'])
    print(job['URL'])
    print(job['Location'])
    print(job['Job Description'])
    print('\n')

Software Engineer III, Core
Google
https://www.google.com/about/careers/applications/jobs/results/138354336852779718-software-engineer-iii-core?location=India&page=2
Bengaluru, Karnataka, India
Minimum qualifications
Bachelor’s degree or equivalent practical experience.
2 years of experience with software development in one or more programming languages, or 1 year of experience with an advanced degree.
2 years of experience with data structures or algorithms.
Experience in Rust or C++ Programming.


Software Support Engineer
Amazon
https://www.amazon.jobs/en/jobs/2928982/software-support-engineer
Chennai, TN, IND
Basic qualifications:

Bachelor’s degree in Computer Science or a related field.
2-4 years overall development/technical support experience.
2-4 years of experience to troubleshoot and identify the root cause of issues in complex Enterprise Level applications.
2-4 years of experience working on networking, hardware performance and audio/video technologies within Consumer Elect