### Data Loading

In [19]:
from pathlib import Path
from typing import List, Any
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [24]:
from pathlib import Path
from langchain_community.document_loaders import CSVLoader

data_dir = "../data"
data_path = Path(data_dir).resolve()
print(f"[INFO] Scanning data directory: {data_path}")

documents = []

csv_files = list(data_path.rglob("*.csv"))
print(f"[INFO] Found {len(csv_files)} CSV files.")

for csv_fp in csv_files:
    print(f"[LOAD] CSV → {csv_fp.name}")
    loader = CSVLoader(
        file_path=str(csv_fp),
        csv_args={"delimiter": ",", "quotechar": '"'}
    )
    docs = loader.load()

    for idx, doc in enumerate(docs):
        doc.metadata["source"] = csv_fp.name

        # Convert structured row text back into key:value dict
        row_dict = {}
        for line in doc.page_content.split("\n"):
            if ":" in line:
                key, value = line.split(":", 1)
                row_dict[key.strip()] = value.strip()

        # Extract clean date (YYYY-MM-DD)
        date_str = row_dict.get("Date", "").split(" ")[0]

        # Optional: Round numeric columns for cleaner text
        open_price = float(row_dict.get("Open", 0))
        high_price = float(row_dict.get("High", 0))
        low_price = float(row_dict.get("Low", 0))
        close_price = float(row_dict.get("Close", 0))
        volume = int(float(row_dict.get("Volume", 0)))

        # Format page_content as single-line string
        doc.page_content = (
            f"Date: {row_dict.get("Date", "")} | "
            f"Open: {open_price:.2f} | "
            f"High: {high_price:.2f} | "
            f"Low: {low_price:.2f} | "
            f"Close: {close_price:.2f} | "
            f"Volume: {volume}"
        )

        # Update metadata with clean date
        doc.metadata["Date"] = date_str

    documents.extend(docs)

# Example outputs
documents[0]


[INFO] Scanning data directory: D:\My GitHub\rachit404\ChronoSense\data
[INFO] Found 1 CSV files.
[LOAD] CSV → goog_stock_data.csv


Document(metadata={'source': 'goog_stock_data.csv', 'row': 0, 'Date': '2020-10-26'}, page_content='Date: 2020-10-26 00:00:00-04:00 | Open: 80.70 | High: 81.36 | Low: 78.29 | Close: 78.98 | Volume: 37066000')

In [25]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
class EmbeddingManager:
    """Handles document embedding generation using Sentence Transformers."""
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the embedding manager
        
        Args:
            model_name: Hugging Face model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):        
        """Load the sentence transformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
        
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        
        Args:
            texts: List of text strings to embed
        
        Returns:
            Numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    

# Initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x174ffd69df0>

In [28]:
import os
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store."""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store.
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        """Initialize ChromaDB client and collection."""
        try:
            # Create persistent ChromaDB Client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or Create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
                )
            print(f"Vector store intialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        
        Args:
            documents: List of LangChain Document objects
            embeddings: corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings.")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)            
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
            
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection now: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        
# Initialize vector store
vector_store = VectorStore()
vector_store                   

Vector store intialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x1749261d130>

In [31]:
# Convert the text to embeddings
texts = [doc.page_content for doc in documents]

# Generate embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in vector database
vector_store.add_documents(documents, embeddings)

Generating embeddings for 1256 texts...


Batches: 100%|██████████| 40/40 [00:05<00:00,  7.31it/s]


Generated embeddings with shape: (1256, 384)
Adding 1256 documents to vector store...
Successfully added 1256 documents to vector store.
Total documents in collection now: 2512


In [30]:
class RAGRetriver:
    """Handles query based retrieval from the vector store."""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever.
        
        Args:
            vector_store: VectorStore containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query.
        
        Args:
            query: User search query string
            top_k: Number of top results to return
            score_threshold: Minimum similarity score to threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1,                            
                        })
                        
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering).")
            else:
                print("No documents retrieved.")
                
            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        
        
# Initialize retriever
rag_retriever = RAGRetriver(vector_store, embedding_manager)
rag_retriever 

<__main__.RAGRetriver at 0x17480199a00>

In [32]:
rag_retriever.retrieve("What was the closing price trend in October 2022?")

Retrieving documents for query: 'What was the closing price trend in October 2022?'
Top K: 5, Score Threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.50it/s]

Generated embeddings with shape: (1, 384)
Retrieved 0 documents (after filtering).





[]