In [7]:
import os
import time
from typing import List, Dict, Any
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np


In [2]:

def process_user_query(query: str, model: Any = None, model_name: str = 'all-MiniLM-L6-v2'):
    """
    Clean a user query and convert it to an embedding for vector search.

    The query is lower-cased and stripped of surrounding whitespace, then
    passed as a one-element batch to ``model.encode``. Progress details are
    printed along the way.

    Args:
        query: Raw user question.
        model: Optional, already-loaded embedding model exposing ``encode``
            and ``get_sentence_embedding_dimension``. Pass the model returned
            by an earlier call to avoid loading it again.
        model_name: Name handed to ``SentenceTransformer`` when ``model`` is
            not supplied.

    Returns:
        A ``(model, embedding)`` tuple: the model that was used and the first
        (only) row of the array returned by ``model.encode``.

    Raises:
        ValueError: If ``query`` is empty or contains only whitespace.
    """
    # Validate before the model load so that bad input fails fast and cheaply.
    cleaned_query = query.lower().strip()
    if not cleaned_query:
        raise ValueError("query must contain at least one non-whitespace character")

    print("\nüîç SECTION 3: QUERY PROCESSING")
    print("=" * 50)

    # Only load a model when the caller did not hand one in.
    if model is None:
        model = SentenceTransformer(model_name)

    print(f"ü§ñ Using model: {model}")
    print(f"üìê Embedding dimensions: {model.get_sentence_embedding_dimension()}")

    print(f"üìù Original query: '{query}'")
    print(f"üßπ Cleaned query: '{cleaned_query}'")

    # encode() receives a one-element batch, so row 0 is the query's vector.
    query_embedding = model.encode([cleaned_query])
    print(f"üî¢ Query embedding shape: {query_embedding.shape}")
    print(f"üìä Embedding sample: {query_embedding[0][:5]}...")

    return model, query_embedding[0]

In [8]:
   # Step 3: Embed a sample user query; keeps both the model and the query's embedding vector
model, query_embedding = process_user_query("What is the capital of France?")


üîç SECTION 3: QUERY PROCESSING
ü§ñ Using model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
üìê Embedding dimensions: 384
üìù Original query: 'What is the capital of France?'
üßπ Cleaned query: 'what is the capital of france?'
üî¢ Query embedding shape: (1, 384)
üìä Embedding sample: [ 0.08204811  0.03605553 -0.00389289 -0.00488105  0.02565114]...


In [9]:
# Confirm completion, then show the model summary and the full query vector, one item per line
print("\n‚úÖ Query processing complete.", model, query_embedding, sep="\n")


‚úÖ Query processing complete.
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
[ 8.20481107e-02  3.60555314e-02 -3.89288529e-03 -4.88104578e-03
  2.56511364e-02 -5.71434870e-02  1.21916067e-02  4.67890408e-03
  3.49498726e-02 -2.24219412e-02 -8.00523721e-03 -1.09353542e-01
  2.27247849e-02 -2.93208789e-02 -4.35220562e-02 -1.20241232e-01
 -8.48641328e-04 -1.81501228e-02  5.61295375e-02  3.08522978e-03
  2.33634724e-03 -1.68392397e-02  6.36246949e-02 -2.36602146e-02
  3.14935632e-02 -3.47979218e-02 -2.05488633e-02 -2.79095117e-03
 -1.10379755e-02 -3.61267254e-02  5.41410930e-02 -3.66171338e-02
 -2.50086486e-

In [2]:

import os
import time
from typing import List, Dict, Any
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np

# ========================================
# SECTION 1: DOCUMENT LOADING & CHUNKING
# ========================================

def _sample_policy_documents() -> List[Dict[str, Any]]:
    """Return the built-in sample policy documents used when the caller supplies none."""
    return [
        {
            "id": "policy_001",
            "title": "Home Office Equipment Reimbursement",
            "content": "Employees working from home may claim up to $500 per year for office equipment including desks, chairs, monitors, and computer accessories. Receipts must be submitted within 30 days of purchase. This policy applies to full-time remote workers only. The equipment must be used primarily for work purposes and should be ergonomic and suitable for a professional home office environment.",
            "category": "reimbursement"
        },
        {
            "id": "policy_002",
            "title": "Travel Expense Guidelines",
            "content": "Business travel expenses are reimbursable when pre-approved by your manager. Meals are covered up to $50 per day, hotel stays up to $200 per night. All receipts must be submitted within 14 days of return. International travel requires additional approval from the department head. Travel insurance is mandatory for all business trips exceeding 7 days.",
            "category": "travel"
        },
        {
            "id": "policy_003",
            "title": "Remote Work Furniture Policy",
            "content": "Remote employees may purchase ergonomic furniture for their home office setup. This includes standing desks, ergonomic chairs, and monitor arms. Maximum reimbursement is $300 per item with manager approval required. All furniture must meet ergonomic standards and be purchased from approved vendors. Receipts must be submitted within 45 days of purchase.",
            "category": "reimbursement"
        },
        {
            "id": "policy_004",
            "title": "Equipment and Supplies Reimbursement",
            "content": "Work-related equipment and supplies purchased for home office use are eligible for reimbursement. This covers laptops, monitors, keyboards, mice, and other computer peripherals. Submit expense reports with receipts for approval. Equipment must be used for work purposes and should be compatible with company systems. Annual limit is $1000 per employee.",
            "category": "reimbursement"
        },
        {
            "id": "policy_005",
            "title": "Vacation and PTO Policy",
            "content": "Full-time employees accrue 15 days of paid time off per year. Vacation requests must be submitted at least 2 weeks in advance. Unused PTO does not roll over to the next year. Emergency leave can be taken with manager approval. Sick leave is separate from vacation time and does not count against PTO balance.",
            "category": "benefits"
        }
    ]


def load_and_chunk_documents(
    documents: List[Dict[str, Any]] = None,
    chunk_size: int = 200,
    chunk_overlap: int = 50,
) -> List[Dict[str, Any]]:
    """
    Load policy documents and split their text into chunks for retrieval.

    Each document's ``content`` is split with a
    ``RecursiveCharacterTextSplitter``; every resulting piece becomes one
    chunk record that carries the parent document's metadata. Progress
    details are printed along the way.

    Args:
        documents: Documents to chunk. Each must be a dict with the keys
            ``id``, ``title``, ``content`` and ``category``. When ``None``
            (the default) the built-in sample policy documents are used.
        chunk_size: Value passed to the splitter as ``chunk_size``; lengths
            are measured with ``len``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        A list of chunk dicts with the keys ``id`` (``"<doc id>_chunk_<n>"``),
        ``title``, ``content``, ``category`` and ``source_doc``. The list is
        empty when no documents are given or no chunks are produced.

    Raises:
        KeyError: If a supplied document lacks one of the required keys.
    """
    print("üìö SECTION 1: DOCUMENT LOADING & CHUNKING")
    print("=" * 50)

    # An explicit empty list is honoured; only None selects the sample data.
    policy_documents = _sample_policy_documents() if documents is None else documents

    print(f"üìÑ Loaded {len(policy_documents)} policy documents")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    # One record per chunk, numbered per document so the ids stay unique.
    all_chunks = [
        {
            "id": f"{doc['id']}_chunk_{i}",
            "title": doc["title"],
            "content": chunk,
            "category": doc["category"],
            "source_doc": doc["id"]
        }
        for doc in policy_documents
        for i, chunk in enumerate(text_splitter.split_text(doc["content"]))
    ]

    print(f"‚úÇÔ∏è Created {len(all_chunks)} chunks from {len(policy_documents)} documents")

    # Guard the average: with no chunks there is nothing to divide by.
    total_chars = sum(len(chunk["content"]) for chunk in all_chunks)
    average_chars = total_chars // len(all_chunks) if all_chunks else 0
    print(f"üìè Average chunk size: {average_chars} characters")

    return all_chunks

In [3]:

    # Step 1: Build the chunk records (one dict per chunk) from the sample policy documents
chunks = load_and_chunk_documents()

üìö SECTION 1: DOCUMENT LOADING & CHUNKING
üìÑ Loaded 5 policy documents
‚úÇÔ∏è Created 13 chunks from 5 documents
üìè Average chunk size: 162 characters


In [4]:
# Bare expression: the notebook renders the list of chunk dicts as this cell's output
chunks

[{'id': 'policy_001_chunk_0',
  'title': 'Home Office Equipment Reimbursement',
  'content': 'Employees working from home may claim up to $500 per year for office equipment including desks, chairs, monitors, and computer accessories. Receipts must be submitted within 30 days of purchase. This',
  'category': 'reimbursement',
  'source_doc': 'policy_001'},
 {'id': 'policy_001_chunk_1',
  'title': 'Home Office Equipment Reimbursement',
  'content': 'be submitted within 30 days of purchase. This policy applies to full-time remote workers only. The equipment must be used primarily for work purposes and should be ergonomic and suitable for a',
  'category': 'reimbursement',
  'source_doc': 'policy_001'},
 {'id': 'policy_001_chunk_2',
  'title': 'Home Office Equipment Reimbursement',
  'content': 'and should be ergonomic and suitable for a professional home office environment.',
  'category': 'reimbursement',
  'source_doc': 'policy_001'},
 {'id': 'policy_002_chunk_0',
  'title': 'Travel Exp