Assignment

In [1]:
# ============================================================================
# INSTALLATION
# ============================================================================
print("Installing required packages...")
# Quietly (-q) upgrade (-U) every runtime dependency of the pipeline through a shell escape.
# NOTE(review): no versions are pinned, so each fresh run may resolve different releases — consider pinning.
!pip install -q -U bitsandbytes transformers accelerate chromadb sentence-transformers pandas torch
print("‚úì Installation complete\n")

Installing required packages...
‚úì Installation complete



In [3]:
"""
================================================================================
EVENT INTELLIGENCE RAG SYSTEM - FIXED VERSION
================================================================================
Assignment: Build a RAG system for operational event intelligence

- Enhanced retrieval with metadata filtering for exact ID matches
- Hybrid search (exact + semantic)
- Better event ID extraction and matching
- Data validation checks

================================================================================
"""
# ============================================================================
# INSTALLATION
# ============================================================================
print("Installing required packages...")
# Same unpinned upgrade command as the notebook's first cell; it is repeated here so this
# cell can be run on its own.
# NOTE(review): if the first cell has already run, this second install is redundant.
!pip install -q -U bitsandbytes transformers accelerate chromadb sentence-transformers pandas torch
print("‚úì Installation complete\n")


import pandas as pd
import sqlite3
import json
import re
import uuid
from datetime import datetime
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
from google.colab import files

print("‚úì All packages imported successfully\n")

# ============================================================================
# STEP 1: DATA INGESTION & SQL SETUP
# ============================================================================

def upload_csv_file():
    """Ask for a file via the Colab upload helper and read it as CSV.

    Returns:
        The DataFrame parsed from the first uploaded file.

    Raises:
        ValueError: if the upload helper returns nothing.
    """
    banner = "="*70
    print(banner)
    print("[STEP 1: DATA INGESTION & SQL SETUP]")
    print(banner)
    print("\nPlease upload your CSV file...")

    uploaded_files = files.upload()
    if not uploaded_files:
        raise ValueError("‚ùå No file uploaded!")

    # Only the first uploaded name is used; any further uploads are ignored.
    csv_name = next(iter(uploaded_files))
    print(f"‚úì File uploaded: {csv_name}")

    events = pd.read_csv(csv_name, low_memory=False)
    row_count, column_count = events.shape
    print(f"‚úì Loaded {row_count} records with {column_count} columns")

    return events

def create_sqlite_database(df, db_path="event_details.db"):
    """Write ``df`` into an ``event_details`` table of a SQLite database.

    Args:
        df: DataFrame whose rows become the table contents.
        db_path: Target database file (or ``":memory:"``). Defaults to the
            original hard-coded ``"event_details.db"``.

    Returns:
        The open ``sqlite3.Connection``; the caller is responsible for closing it.

    Raises:
        Whatever ``DataFrame.to_sql`` or sqlite3 raises. The connection is
        closed before the exception propagates so it is not leaked.
    """
    print("\n" + "-"*70)
    print("Creating SQLite database...")

    conn = sqlite3.connect(db_path)
    try:
        # if_exists='replace' already drops a pre-existing table, so no explicit DROP is needed.
        df.to_sql('event_details', conn, if_exists='replace', index=False)
        count = conn.execute("SELECT COUNT(*) FROM event_details").fetchone()[0]
    except Exception:
        conn.close()
        raise

    print(f"‚úì Database created with {count} rows")
    return conn

# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================

def parse_json_safely(json_str):
    """Decode a JSON document stored as text, returning ``{}`` when that is not possible.

    Args:
        json_str: A cell value. Only ``str`` values are decoded; anything else
            (None, NaN, numbers, lists, ...) yields ``{}``.

    Returns:
        The decoded JSON value, or ``{}`` for missing, blank, non-string or
        malformed input.
    """
    # Checking the type first avoids calling pd.isna() on list-like values, whose
    # element-wise result cannot be used as a single truth value.
    if not isinstance(json_str, str) or not json_str.strip():
        return {}
    try:
        return json.loads(json_str)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly deep nesting.
        return {}

# (column, prefix) pairs emitted before the coordinates, in narrative order.
_NARRATIVE_HEAD_FIELDS = (
    ('EVENT_ID', 'Incident ID '),
    ('ALARM_ID', 'Alarm '),
    ('ALARM_NAME', 'Type: '),
    ('CATEGORY_NAME', 'Category: '),
    ('PRIORITY', 'Priority Level: '),
    ('SEVERITY', 'Severity: '),
    ('URGENCY', 'Urgency: '),
    ('LOCATION', 'Location: '),
    ('SITE_NAME', 'Site: '),
    ('JURISDICTION_NAME', 'Jurisdiction: '),
)

# (column, prefix) pairs emitted between the coordinates and the SOP description.
_NARRATIVE_STATUS_FIELDS = (
    ('ALARM_GENERATED_TIME', 'Generated at: '),
    ('EVENT_CREATED_TIME', 'Created at: '),
    ('EVENT_STATUS', 'Event Status: '),
    ('ALARM_STATUS', 'Alarm Status: '),
    ('PRIMARY_AGENCY', 'Primary Agency: '),
    ('SECONDARY_AGENCY', 'Secondary Agencies: '),
    ('USER_NAME', 'Personnel: '),
    ('STATION_NAME', 'Station: '),
    ('SOP_NAME', 'SOP: '),
)

# (JSON key, prefix) pairs read from the ADDITIONAL_DETAILS payload.
_NARRATIVE_DETAIL_KEYS = (
    ('incidentNarrative', 'Narrative: '),
    ('BuildingName', 'Building: '),
    ('BuildingFloor', 'Floor: '),
    ('BuildingNumber', 'Building Number: '),
    ('numberOfPeopleInEmergency', 'People Affected: '),
    ('incidentInjuries', 'Injuries: '),
    ('roadCondition', 'Road Condition: '),
    ('nearbyLandmarks', 'Nearby: '),
    ('callerName', 'Reported by: '),
)

# (column, prefix) pairs emitted at the very end of the narrative.
_NARRATIVE_TAIL_FIELDS = (
    ('PHONE', 'Contact: '),
    ('NAME_CONTACT', 'Contact Person: '),
    ('DEVICE_NAME', 'Device: '),
    ('COMPONENT_ID', 'Component ID: '),
)


def _labelled_columns(row, fields):
    """Return ``prefix + value`` for every listed column that is present and not null."""
    return [f"{prefix}{row[column]}" for column, prefix in fields if pd.notna(row.get(column))]


def _eta_minutes(seconds):
    """Convert a seconds value to whole minutes, falling back to 0 when it is not numeric."""
    try:
        return int(float(seconds)) // 60 if seconds else 0
    except (TypeError, ValueError, OverflowError):
        return 0


def _station_parts(rec_data):
    """Describe the first station listed under ``rec_data['stationRec']['data']``, if any."""
    if not isinstance(rec_data, dict):
        return []
    station_info = rec_data.get('stationRec')
    if not isinstance(station_info, dict):
        return []
    stations = station_info.get('data')
    if not isinstance(stations, list) or not stations or not isinstance(stations[0], dict):
        return []

    station = stations[0]
    parts = [
        f"Nearest Station: {station.get('name', 'N/A')}",
        f"Distance: {station.get('distance_km', 'N/A')} km",
        f"Estimated Time: {_eta_minutes(station.get('time_seconds', 0))} minutes",
    ]
    equipment = station.get('equipment')
    # A null or scalar 'equipment' entry has no length, so it is skipped instead of crashing.
    if 'equipment' in station and hasattr(equipment, '__len__'):
        parts.append(f"Available Equipment: {len(equipment)} units")
    return parts


def create_event_narrative(row):
    """Build one readable paragraph describing an event row.

    Every non-null column of interest contributes a ``"Label: value"`` fragment.
    The fragments are joined with ``". "`` and terminated with a final period.
    JSON stored in ``ADDITIONAL_DETAILS`` and ``REC_DATA`` is decoded with
    ``parse_json_safely``; payloads that are not JSON objects, or that have an
    unexpected shape, are skipped rather than raising.

    Args:
        row: A mapping-like row (e.g. a pandas Series) supporting ``.get`` and ``[]``.

    Returns:
        The narrative string, or ``""`` when no field is populated.
    """
    narrative_parts = _labelled_columns(row, _NARRATIVE_HEAD_FIELDS)

    if pd.notna(row.get('LATITUDE')) and pd.notna(row.get('LONGITUDE')):
        narrative_parts.append(f"Coordinates: ({row['LATITUDE']}, {row['LONGITUDE']})")

    narrative_parts.extend(_labelled_columns(row, _NARRATIVE_STATUS_FIELDS))

    if pd.notna(row.get('SOP_DESCRIPTION')):
        # Keep only the first 200 characters of the SOP description.
        sop_desc = str(row['SOP_DESCRIPTION'])[:200]
        narrative_parts.append(f"SOP Description: {sop_desc}")
    if pd.notna(row.get('SOP_DOCUMENT_URL')):
        narrative_parts.append(f"SOP Document: {row['SOP_DOCUMENT_URL']}")

    if pd.notna(row.get('ADDITIONAL_DETAILS')):
        details = parse_json_safely(row['ADDITIONAL_DETAILS'])
        # Valid JSON is not necessarily an object (it may be a list, number or string).
        if isinstance(details, dict):
            narrative_parts.extend(
                f"{prefix}{details[key]}" for key, prefix in _NARRATIVE_DETAIL_KEYS if key in details
            )

    if pd.notna(row.get('REC_DATA')):
        narrative_parts.extend(_station_parts(parse_json_safely(row['REC_DATA'])))

    narrative_parts.extend(_labelled_columns(row, _NARRATIVE_TAIL_FIELDS))

    narrative = ". ".join(narrative_parts)
    if narrative:
        narrative += "."

    return narrative

def engineer_features(df, test_events=None):
    """Add the narrative text and normalised metadata columns to a copy of ``df``.

    Added columns: ``event_text`` (from ``create_event_narrative``), the string
    columns ``priority``, ``component_id``, ``severity`` and ``urgency`` (missing
    values become ``'Unknown'``), and the integer ``month`` parsed from
    ``ALARM_GENERATED_TIME`` (0 when absent or unparseable).

    A source column that does not exist is handled like the month column:
    the derived column gets its default value instead of raising ``KeyError``.

    Args:
        df: Raw event DataFrame; it is not modified.
        test_events: Event IDs whose presence is reported. Defaults to the
            three IDs the notebook originally checked.

    Returns:
        The enhanced copy of ``df``.
    """
    print("\n" + "="*70)
    print("[STEP 2: FEATURE ENGINEERING]")
    print("="*70)

    df_enhanced = df.copy()
    df_enhanced['event_text'] = df_enhanced.apply(create_event_narrative, axis=1)
    print(f"‚úì Created event_text for {len(df_enhanced)} records")

    for source_col, target_col in (
        ('PRIORITY', 'priority'),
        ('COMPONENT_ID', 'component_id'),
        ('SEVERITY', 'severity'),
        ('URGENCY', 'urgency'),
    ):
        if source_col in df_enhanced.columns:
            df_enhanced[target_col] = df_enhanced[source_col].fillna('Unknown').astype(str)
        else:
            df_enhanced[target_col] = 'Unknown'

    if 'ALARM_GENERATED_TIME' in df_enhanced.columns:
        generated_at = pd.to_datetime(df_enhanced['ALARM_GENERATED_TIME'], errors='coerce')
        df_enhanced['month'] = generated_at.dt.month.fillna(0).astype(int)
    else:
        df_enhanced['month'] = 0

    if test_events is None:
        test_events = ['INC001572', 'INC001573', 'INC001574']

    print("\nüîç Checking for test events...")
    if 'EVENT_ID' in df_enhanced.columns:
        event_ids = df_enhanced['EVENT_ID']
        # One vectorised membership pass instead of scanning the column once per ID.
        present = set(event_ids[event_ids.isin(test_events)])
    else:
        event_ids = None
        present = set()

    for event_id in test_events:
        if event_id in present:
            print(f"‚úì {event_id} found")
        else:
            print(f"‚ùå {event_id} NOT found")

    # When none of the expected IDs exist, show a few real ones to help diagnose the mismatch.
    if not present and event_ids is not None:
        print(f"\n‚ö†Ô∏è  Sample IDs: {event_ids.head(10).tolist()}")

    return df_enhanced

# ============================================================================
# STEP 3: TEXT CHUNKING
# ============================================================================

def chunk_text_with_overlap(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

    Each chunk that is not the last one is cut just after the last ``.``, ``!``
    or ``?`` found inside its window when there is one; otherwise it is cut at
    the window edge. The next chunk starts ``overlap`` characters before the
    previous cut. If stepping back would not move past the current start, the
    next chunk starts exactly at the cut, so the scan always advances.

    Args:
        text: The text to split. ``None`` or ``""`` yields ``[]``.
        chunk_size: Maximum chunk length in characters; must be >= 1.
        overlap: Characters shared between consecutive chunks; must be >= 0.

    Returns:
        A list of stripped, non-empty chunk strings. Text no longer than
        ``chunk_size`` is returned unchanged as a single chunk.

    Raises:
        ValueError: if ``chunk_size`` < 1 or ``overlap`` < 0.
    """
    if not text:
        return []
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if overlap < 0:
        raise ValueError("overlap must not be negative")
    if len(text) <= chunk_size:
        return [text]

    text_length = len(text)
    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            # Prefer to end the chunk on the last sentence terminator inside the window.
            window = text[start:end]
            boundary = max(window.rfind(mark) for mark in '.!?')
            if boundary != -1:
                end = start + boundary + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            # The tail has been emitted. Stepping back by `overlap` here would only
            # produce a chunk already contained in the one just appended.
            break

        next_start = end - overlap
        # Guard against standing still or moving backwards, which previously looped
        # forever when a sentence ended within `overlap` characters of `start`.
        start = next_start if next_start > start else end

    return chunks

def _metadata_text(value):
    """Render a cell as metadata text; missing values (None/NaN) become ``''`` rather than ``'nan'``."""
    return '' if pd.isna(value) else str(value)


def _month_number(value):
    """Coerce a month cell to ``int``, using 0 when it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def create_chunks_with_metadata(df, chunk_size=500, overlap=50):
    """Chunk every row's ``event_text`` and attach the row's metadata to each chunk.

    Rows whose ``event_text`` is empty or not a string (e.g. NaN) are skipped.
    Missing metadata cells are stored as ``''`` and a missing month as 0.

    Args:
        df: DataFrame holding ``event_text`` plus the metadata columns read below.
        chunk_size: Passed through to ``chunk_text_with_overlap``.
        overlap: Passed through to ``chunk_text_with_overlap``.

    Returns:
        A list of dicts with the keys ``chunk_id``, ``text``, ``alarm_id``,
        ``event_id``, ``priority``, ``component_id``, ``severity``, ``urgency``,
        ``location``, ``category``, ``status``, ``jurisdiction`` and ``month``.
    """
    print("\n" + "="*70)
    print("[STEP 3: TEXT CHUNKING]")
    print("="*70)

    all_chunks = []

    for idx, row in df.iterrows():
        text = row.get('event_text', '')
        # NaN is truthy, so a plain `not text` check would let it through to len().
        if not isinstance(text, str) or not text:
            continue

        event_key = row.get('EVENT_ID', idx)
        if pd.isna(event_key):
            # Fall back to the row index so the id never starts with 'nan'.
            event_key = idx

        text_chunks = chunk_text_with_overlap(text, chunk_size, overlap)

        for chunk_idx, chunk_text in enumerate(text_chunks):
            # The random suffix keeps ids unique even if an EVENT_ID repeats across rows.
            unique_id = f"{event_key}_{chunk_idx}_{uuid.uuid4().hex[:8]}"

            chunk_data = {
                'chunk_id': unique_id,
                'text': chunk_text,
                'alarm_id': _metadata_text(row.get('ALARM_ID', '')),
                'event_id': _metadata_text(row.get('EVENT_ID', '')),
                'priority': _metadata_text(row.get('priority', '')),
                'component_id': _metadata_text(row.get('component_id', '')),
                'severity': _metadata_text(row.get('severity', '')),
                'urgency': _metadata_text(row.get('urgency', '')),
                'location': _metadata_text(row.get('LOCATION', '')),
                'category': _metadata_text(row.get('CATEGORY_NAME', '')),
                'status': _metadata_text(row.get('EVENT_STATUS', '')),
                'jurisdiction': _metadata_text(row.get('JURISDICTION_NAME', '')),
                'month': _month_number(row.get('month', 0))
            }

            all_chunks.append(chunk_data)

    print(f"‚úì Created {len(all_chunks)} chunks from {len(df)} events")
    return all_chunks

# ============================================================================
# STEP 4: EMBEDDINGS & VECTOR DATABASE
# ============================================================================

def load_embedding_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Print the step-4 banner and construct the sentence-embedding model.

    Args:
        model_name: Identifier handed to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    banner = "="*70
    print("\n" + banner)
    print("[STEP 4: EMBEDDINGS & VECTOR DATABASE]")
    print(banner)

    encoder = SentenceTransformer(model_name)
    print("‚úì Embedding model loaded")
    return encoder

def setup_chromadb():
    """Create a ChromaDB client and a fresh ``event_intelligence`` collection.

    Any existing collection with that name is deleted first so reruns start
    from an empty collection. The collection is configured for cosine distance.

    Returns:
        Tuple ``(client, collection)``.
    """
    client = chromadb.Client(Settings(anonymized_telemetry=False, allow_reset=True))

    try:
        client.delete_collection("event_intelligence")
    except Exception:
        # Best effort: the collection normally does not exist on a first run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        pass

    collection = client.create_collection(
        name="event_intelligence",
        metadata={"hnsw:space": "cosine"}
    )

    print("‚úì ChromaDB initialized")
    return client, collection

def generate_and_store_embeddings(chunks, embedding_model, collection):
    """Embed every chunk's text and add it to ``collection`` in batches of 1000.

    Args:
        chunks: Chunk dicts carrying ``text``, ``chunk_id`` and the metadata keys listed below.
        embedding_model: Object whose ``encode`` returns a sliceable result with ``.tolist()``.
        collection: Object exposing ``add(embeddings=, documents=, metadatas=, ids=)``.
    """
    metadata_keys = (
        'alarm_id', 'event_id', 'priority', 'component_id', 'severity', 'urgency',
        'location', 'category', 'status', 'jurisdiction', 'month',
    )

    documents = [entry['text'] for entry in chunks]
    vectors = embedding_model.encode(documents, show_progress_bar=True, batch_size=32)

    chunk_ids = [entry['chunk_id'] for entry in chunks]
    # chunk_id and text are passed separately, so they are left out of the metadata.
    metadatas = [{key: entry[key] for key in metadata_keys} for entry in chunks]

    step = 1000
    total = len(chunk_ids)
    for lo in range(0, total, step):
        hi = min(lo + step, total)
        collection.add(
            embeddings=vectors[lo:hi].tolist(),
            documents=documents[lo:hi],
            metadatas=metadatas[lo:hi],
            ids=chunk_ids[lo:hi]
        )

    print(f"‚úì Stored {len(chunks)} embeddings")

# ============================================================================
# STEP 5: ENHANCED RETRIEVAL
# ============================================================================

def retrieve_by_exact_id(event_id, collection, max_results=20):
    """Fetch stored chunks whose ``event_id`` metadata equals the upper-cased ``event_id``.

    Args:
        event_id: Event identifier; it is upper-cased before filtering.
        collection: Object exposing ``get(where=, limit=)``.
        max_results: Passed to ``get`` as ``limit``.

    Returns:
        A list of dicts with ``text``, ``metadata``, ``distance`` (always 0.0) and
        ``relevance_score`` (always 1.0). Returns ``[]`` when nothing matches or
        when any exception is raised (the error is printed).
    """
    try:
        hits = collection.get(
            where={"event_id": event_id.upper()},
            limit=max_results
        )
        docs = hits['documents']
        if not docs:
            return []

        print(f"  ‚úì Found {len(docs)} chunks for {event_id}")
        # Exact matches carry a fixed perfect score instead of a vector distance.
        return [
            {
                'text': doc,
                'metadata': hits['metadatas'][pos],
                'distance': 0.0,
                'relevance_score': 1.0
            }
            for pos, doc in enumerate(docs)
        ]
    except Exception as e:
        print(f"  ‚ùå Error: {e}")
        return []

def retrieve_semantic(query, collection, embedding_model, top_k=5, where_filter=None):
    """Run a nearest-neighbour query for ``query`` with an optional metadata filter.

    Args:
        query: Text to embed and search for.
        collection: Object exposing ``query(query_embeddings=, n_results=[, where=])``.
        embedding_model: Object whose ``encode([query])[0]`` has ``.tolist()``.
        top_k: Number of results requested.
        where_filter: Metadata filter; only forwarded when truthy.

    Returns:
        A list of dicts with ``text``, ``metadata``, ``distance`` and
        ``relevance_score`` (``1 - distance``). Returns ``[]`` when there are no
        hits or when the query raises (the error is printed).
    """
    query_vector = embedding_model.encode([query])[0]

    try:
        search_args = {
            'query_embeddings': [query_vector.tolist()],
            'n_results': top_k,
        }
        if where_filter:
            search_args['where'] = where_filter
        hits = collection.query(**search_args)

        if not (hits['documents'] and hits['documents'][0]):
            return []

        # Results are nested one list per query; only one query is sent, so index 0 is used.
        docs = hits['documents'][0]
        return [
            {
                'text': docs[pos],
                'metadata': hits['metadatas'][0][pos],
                'distance': hits['distances'][0][pos],
                'relevance_score': 1 - hits['distances'][0][pos]
            }
            for pos in range(len(docs))
        ]
    except Exception as e:
        print(f"  ‚ùå Error: {e}")
        return []

def hybrid_retrieve(query, collection, embedding_model, top_k=5):
    """Use an exact-ID lookup when the query names an event, otherwise semantic search.

    An event ID is any case-insensitive ``INC`` followed by digits. When one is
    found, up to 20 chunks are fetched by exact metadata match and the first
    ``top_k`` are returned. If no ID is present, or the lookup returns nothing,
    the query falls through to ``retrieve_semantic``.

    Returns:
        The list of retrieved chunk dicts.
    """
    id_match = re.search(r'INC\d+', query, re.IGNORECASE)

    if id_match is not None:
        detected_id = id_match.group(0).upper()
        print(f"  üéØ Detected Event ID: {detected_id}")

        exact_hits = retrieve_by_exact_id(detected_id, collection, max_results=20)
        if exact_hits:
            return exact_hits[:top_k]

        # Nothing is stored under that ID; continue with the semantic path below.
        print(f"  ‚ö† No exact match, trying semantic search")

    print(f"  üîç Semantic search (top_k={top_k})")
    return retrieve_semantic(query, collection, embedding_model, top_k)

# ============================================================================
# STEP 6: RAG PROMPT & GENERATION
# ============================================================================

def create_rag_prompt(query, retrieved_chunks):
    """Assemble the LLM prompt from the retrieved chunks and the user's question.

    Each chunk becomes a numbered ``[Context N]`` section listing its event id,
    priority and category (``'N/A'`` when the metadata key is absent) followed
    by its text. The sections are placed between fixed instructions and the question.

    Args:
        query: The user's question.
        retrieved_chunks: Dicts with ``text`` and ``metadata`` keys.

    Returns:
        The full prompt string, ending with ``"ANSWER: "``.
    """
    def _section(number, chunk):
        meta = chunk['metadata']
        # The trailing newline leaves one blank line after every section.
        return (
            f"[Context {number}]\n"
            f"Event: {meta.get('event_id', 'N/A')}\n"
            f"Priority: {meta.get('priority', 'N/A')}\n"
            f"Category: {meta.get('category', 'N/A')}\n"
            f"Content: {chunk['text']}\n"
        )

    context_text = "\n".join(
        _section(number, chunk) for number, chunk in enumerate(retrieved_chunks, 1)
    )

    prompt = f"""You are an intelligent assistant for an operational event intelligence system. Answer questions about incidents based ONLY on the provided context.

INSTRUCTIONS:
1. Use ONLY information from the context
2. If answer not in context, say "I cannot find this information in the available data"
3. Be specific and cite event IDs
4. List all matching events if multiple

CONTEXT:
{context_text}

QUESTION: {query}

ANSWER: """

    return prompt

def load_llm_model(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Load a causal LM with 4-bit NF4 quantisation and wrap it in a text-generation pipeline.

    The pipeline samples (``do_sample=True``) with temperature 0.7 and top-p 0.95
    and produces at most 512 new tokens per call.

    Args:
        model_name: Hub identifier of the model and tokenizer to load.

    Returns:
        Tuple ``(tokenizer, text_generator)``.
    """
    banner = "="*70
    print("\n" + banner)
    print("[LOADING LLM FOR RAG]")
    print(banner)

    # 4-bit NF4 weights with double quantisation; computation runs in float16.
    four_bit_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=four_bit_config,
        trust_remote_code=True
    )

    sampling_options = dict(
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
    )
    text_generator = pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=tokenizer,
        **sampling_options
    )

    print("‚úì LLM ready")
    return tokenizer, text_generator

def generate_answer(prompt, text_generator):
    """Run the generator on ``prompt`` and return only the newly generated answer text.

    The prompt is removed by prefix when the generator echoes it, which keeps
    the whole continuation even if the model itself writes another
    ``"ANSWER:"`` marker. If the output does not start with the prompt, the text
    after the last ``"ANSWER:"`` is used; failing that, the whole output.

    Args:
        prompt: The full prompt text.
        text_generator: Callable returning ``[{'generated_text': ...}]`` and
            exposing ``tokenizer.eos_token_id``.

    Returns:
        The stripped answer, or ``"Error: <message>"`` if generation raised.
    """
    try:
        response = text_generator(
            prompt,
            max_new_tokens=512,
            num_return_sequences=1,
            pad_token_id=text_generator.tokenizer.eos_token_id
        )

        generated_text = response[0]['generated_text']

        if generated_text.startswith(prompt):
            # Splitting on the last "ANSWER:" would drop the real answer whenever the
            # model goes on to invent a further QUESTION/ANSWER pair.
            return generated_text[len(prompt):].strip()
        if "ANSWER:" in generated_text:
            return generated_text.split("ANSWER:")[-1].strip()
        # The output holds neither the prompt nor the marker, so slicing by
        # len(prompt) would cut into the answer itself.
        return generated_text.strip()
    except Exception as e:
        return f"Error: {str(e)}"

def query_rag_system(question, collection, embedding_model, text_generator, top_k=5):
    """Answer ``question`` end to end: retrieve chunks, build the prompt, generate, and log each stage.

    Args:
        question: The user's question.
        collection: Vector collection passed to ``hybrid_retrieve``.
        embedding_model: Embedding model passed to ``hybrid_retrieve``.
        text_generator: Generator passed to ``generate_answer``.
        top_k: Number of chunks requested from retrieval.

    Returns:
        Dict with ``question``, ``answer``, ``retrieved_chunks`` and ``num_chunks``.
    """
    heavy_rule = "="*70
    light_rule = "-"*70

    print("\n" + heavy_rule)
    print(f"QUERY: {question}")
    print(heavy_rule)

    print("\n[RETRIEVAL]")
    hits = hybrid_retrieve(question, collection, embedding_model, top_k)
    hit_count = len(hits)

    print(f"‚úì Retrieved {hit_count} chunks")
    for position, hit in enumerate(hits, 1):
        meta = hit['metadata']
        print(f"\n  Chunk {position}:")
        print(f"    Event: {meta.get('event_id', 'N/A')}")
        print(f"    Priority: {meta.get('priority', 'N/A')}")
        print(f"    Score: {hit.get('relevance_score', 0):.3f}")
        # Only a 150-character preview of each chunk is logged.
        print(f"    Text: {hit['text'][:150]}...")

    print("\n[GENERATING ANSWER]")
    answer = generate_answer(create_rag_prompt(question, hits), text_generator)

    print("\n[ANSWER]")
    print(light_rule)
    print(answer)
    print(light_rule)

    return {
        'question': question,
        'answer': answer,
        'retrieved_chunks': hits,
        'num_chunks': hit_count
    }

# ============================================================================
# VALIDATION TESTS
# ============================================================================

def run_validation_tests(collection, embedding_model, text_generator):
    """Run both fixed validation question sets through ``query_rag_system``.

    Scenario 1 asks about the single event INC001572; scenario 2 asks broader
    questions about water-leakage events. Every question uses ``top_k=5``.

    Returns:
        Tuple ``(scenario1_results, scenario2_results)``, each a list of the
        dicts returned by ``query_rag_system`` in question order.
    """
    print("\n" + "="*70)
    print("VALIDATION SCENARIOS")
    print("="*70)

    def _run_scenario(number, questions):
        # Ask each question in order, printing a progress header before it.
        total = len(questions)
        answers = []
        for position, question in enumerate(questions, 1):
            print(f"\n### Scenario {number} - Question {position}/{total} ###")
            answers.append(
                query_rag_system(question, collection, embedding_model, text_generator, top_k=5)
            )
        return answers

    scenario1_results = _run_scenario(1, [
        "Give me complete details on the Event INC001572",
        "What are the SOP steps recommended for INC001572?",
        "What actions were taken for the incident INC001572?",
        "Who was the workforce dispatched for the event INC001572 and what is their current status?",
        "What are the buildings affected for the water leakage event INC001572?",
        "When and where did incident INC001572 happen and what is the current status?",
        "List the contact numbers of all building in-charges notified regarding event INC001572"
    ])

    scenario2_results = _run_scenario(2, [
        "How many pending water leakage events are there?",
        "List recent water leakage events that happened in Bangalore at night time",
        "How many water leakage events happened in the last week?",
        "What are the Standard Operating Procedures for water leakage events?",
        "Who was the resource sent for the water leakage event that happened today in Bangalore?",
        "Show me the response plan being executed for the water leak incident at Zone 5",
        "Show workforce assigned to all incidents between Oct 25-29, 2025"
    ])

    return scenario1_results, scenario2_results

# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main():
    """Run the whole pipeline once and return every component it built.

    Stages, in order: CSV upload, SQLite load, feature engineering, chunking
    (500 characters with 50 overlap), embedding model load, ChromaDB setup,
    embedding storage, LLM load, and the validation question sets.

    Returns:
        Dict with the keys ``database_connection``, ``dataframe``, ``chunks``,
        ``embedding_model``, ``chromadb_client``, ``collection``, ``tokenizer``,
        ``text_generator``, ``scenario1_results`` and ``scenario2_results``.
    """
    banner = "="*70
    print("\n" + banner)
    print("EVENT INTELLIGENCE RAG SYSTEM - COMPLETE PIPELINE")
    print(banner)

    raw_events = upload_csv_file()
    components = {'database_connection': create_sqlite_database(raw_events)}

    components['dataframe'] = engineer_features(raw_events)
    components['chunks'] = create_chunks_with_metadata(
        components['dataframe'], chunk_size=500, overlap=50
    )

    components['embedding_model'] = load_embedding_model()
    components['chromadb_client'], components['collection'] = setup_chromadb()
    generate_and_store_embeddings(
        components['chunks'], components['embedding_model'], components['collection']
    )

    components['tokenizer'], components['text_generator'] = load_llm_model()

    # The validation questions run as part of the pipeline, before "SYSTEM READY" is printed.
    components['scenario1_results'], components['scenario2_results'] = run_validation_tests(
        components['collection'], components['embedding_model'], components['text_generator']
    )

    print("\n" + banner)
    print("‚úì SYSTEM READY!")
    print(banner)

    return components

if __name__ == "__main__":
    # Build the pipeline once, then expose its pieces as globals for interactive follow-up queries.
    system_components = main()

    (conn, df, chunks, embedding_model,
     client, collection, tokenizer, text_generator) = (
        system_components[key]
        for key in (
            'database_connection', 'dataframe', 'chunks', 'embedding_model',
            'chromadb_client', 'collection', 'tokenizer', 'text_generator',
        )
    )

    print("\nTo ask custom questions:")
    print(">>> result = query_rag_system('Your question', collection, embedding_model, text_generator)")

Installing required packages...
‚úì Installation complete

‚úì All packages imported successfully


EVENT INTELLIGENCE RAG SYSTEM - COMPLETE PIPELINE
[STEP 1: DATA INGESTION & SQL SETUP]

Please upload your CSV file...


Saving V_EVENT_DETAILS_202512311554.csv to V_EVENT_DETAILS_202512311554 (1).csv
‚úì File uploaded: V_EVENT_DETAILS_202512311554 (1).csv
‚úì Loaded 3921 records with 104 columns

----------------------------------------------------------------------
Creating SQLite database...
‚úì Database created with 3921 rows

[STEP 2: FEATURE ENGINEERING]
‚úì Created event_text for 3921 records

üîç Checking for test events...
‚úì INC001572 found
‚úì INC001573 found
‚úì INC001574 found

[STEP 3: TEXT CHUNKING]
‚úì Created 4298 chunks from 3921 events

[STEP 4: EMBEDDINGS & VECTOR DATABASE]
‚úì Embedding model loaded
‚úì ChromaDB initialized


Batches:   0%|          | 0/135 [00:00<?, ?it/s]

‚úì Stored 4298 embeddings

[LOADING LLM FOR RAG]


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


‚úì LLM ready

VALIDATION SCENARIOS

### Scenario 1 - Question 1/7 ###

QUERY: Give me complete details on the Event INC001572

[RETRIEVAL]
  üéØ Detected Event ID: INC001572
  ‚úì Found 1 chunks for INC001572
‚úì Retrieved 1 chunks

  Chunk 1:
    Event: INC001572
    Priority: Critical
    Score: 1.000
    Text: Incident ID INC001572. Alarm 20574. Type: Driver Identified ADAS. Priority Level: Critical. Location: Abu Dhabi - Zone 1 - Abu Dhabi - United Arab Emi...

[GENERATING ANSWER]

[ANSWER]
----------------------------------------------------------------------
Answer: The primary agency for the incident is the Police.
----------------------------------------------------------------------

### Scenario 1 - Question 2/7 ###

QUERY: What are the SOP steps recommended for INC001572?

[RETRIEVAL]
  üéØ Detected Event ID: INC001572
  ‚úì Found 1 chunks for INC001572
‚úì Retrieved 1 chunks

  Chunk 1:
    Event: INC001572
    Priority: Critical
    Score: 1.000
    Text: Incident ID IN

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



[ANSWER]
----------------------------------------------------------------------

----------------------------------------------------------------------

### Scenario 2 - Question 4/7 ###

QUERY: What are the Standard Operating Procedures for water leakage events?

[RETRIEVAL]
  üîç Semantic search (top_k=5)
‚úì Retrieved 5 chunks

  Chunk 1:
    Event: INC003019
    Priority: High
    Score: 0.416
    Text: Incident ID INC003019. Alarm 36653. Type: Water Leakage Alert. Category: Smart Water. Priority Level: High. Location: Banglore. Site: Banglore. Jurisd...

  Chunk 2:
    Event: INC003908
    Priority: Critical
    Score: 0.415
    Text: Incident ID INC003908. Alarm 38370. Type: Water Leakage Alert. Category: Smart Water. Priority Level: Critical. Location: Banglore. Site: Banglore. Ju...

  Chunk 3:
    Event: INC003951
    Priority: High
    Score: 0.415
    Text: Incident ID INC003951. Alarm 38481. Type: Water Leakage Alert. Category: Smart Water. Priority Level: High. Location: