In [None]:
!pip install --quiet sentence-transformers langchain langchain-core langchain-chroma chromadb aiosqlite # RUN ONCE -> RESTART SESSION

In [1]:
import os
import sys
from google.colab import drive
from google.colab import userdata

drive.mount("/content/drive")

HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("\nHugging Face Token successfully set.")

OPENAI_API_KEY = userdata.get("OPENAI_API")
os.environ["OPENAI_API"] = OPENAI_API_KEY

print("\nOpenAI API successfully set.\n")

sys.path.append("/content/drive/MyDrive/ES-CSA/src")

%cd /content/drive/MyDrive/ES-CSA/data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Hugging Face Token successfully set.

OpenAI API successfully set.

/content/drive/MyDrive/ES-CSA/data


In [14]:
import re
import numpy
import sqlite3
import json
import ast
from datetime import datetime
from typing import Optional, List, Dict
from collections import defaultdict
from langchain_chroma import Chroma
from langchain_core.documents import Document
from embedding_utils import LocalEmbedding
from intent_classifier import BGEIntentClassifier, INTENT_LABELS, INTENT_TO_METADATA
from insight_extraction import compute_consumer_insights, compute_general_insights

In [3]:
# Embedding backend: the project's LocalEmbedding wrapper (reported below as all-MiniLM-L6-v2)

embedding_model = LocalEmbedding()

print("\nModel: all-MiniLM-L6-v2")

# Vector store: Chroma collection "consumer_db" persisted under embeddings/chromadb_embeddings

vectorstore = Chroma(
    collection_name="consumer_db",
    embedding_function=embedding_model,
    persist_directory="embeddings/chromadb_embeddings",
)

print("\nVector Store: ChromaDB")


Model: all-MiniLM-L6-v2

Vector Store: ChromaDB


In [4]:
# Smoke test: unfiltered similarity search against the consumer_db collection.

query = "Which city has the most users?"

results = vectorstore.similarity_search(query, k=3)

# Show every hit with its 1-based rank, its text, and its metadata
for rank, doc in enumerate(results, start=1):
    print(f"\nResult {rank}")
    print("-" * 40)
    print(doc.page_content)
    print("Metadata:", doc.metadata)


Result 1
----------------------------------------
Regional Popularity:
The city of Quetta has 75 active users.
The city of Lahore has 73 active users.
The city of Peshawar has 62 active users.
The city of Faisalabad has 61 active users.
The city of Rawalpindi has 60 active users.
The city of Islamabad has 59 active users.
The city of Multan has 58 active users.
The city of Karachi has 52 active users.
Metadata: {'category': 'general_insights', 'subcategory': 'Regional Popularity', 'type': 'General Insights'}

Result 2
----------------------------------------
User Type Distribution:
There are 263 Postpaid users in the network.
There are 237 Prepaid users in the network.
Metadata: {'category': 'general_insights', 'subcategory': 'User Type Distribution', 'type': 'General Insights'}

Result 3
----------------------------------------
Purchases:
On 2023-06-24T08:44:14, user spent 305 PKR purchasing 4246 MB of browsing data, 2638 MB for social media, 586 SMS, 492 on-net voice minutes, and 27

In [5]:
# Extracting MSISDN

def parse_msisdn(query: str) -> Optional[str]:
    """
    Extract the first valid MSISDN (10-15 digits) from free-form query text.

    A candidate is a run of 10 to 15 digits that is not attached to other
    word characters (letters, digits, underscore). A single leading "+" is
    tolerated and dropped. Digit runs longer than 15 are skipped instead of
    ending the search, so a valid number later in the text is still found.

    Args:
        query: Raw user text. An empty or None value yields None.

    Returns:
        The digits of the first valid MSISDN, or None when there is none.
    """
    if not query:
        return None

    # The length bounds live in the pattern itself, so an over-long digit run
    # can never become the first match and hide a valid number that follows.
    match = re.search(r"(?<!\w)\+?(\d{10,15})(?!\w)", query)
    return match.group(1) if match else None

def get_msisdn_from_context(query: Optional[str] = None, session_context: Optional[dict] = None) -> Optional[str]:
    """
    Resolve the caller's MSISDN, preferring the session over the query text.

    Args:
        query: Optional user text that is searched for a number when the
            session does not provide a usable one.
        session_context: Optional session dict; its "msisdn" entry, if
            present and non-blank, wins. Non-string values are converted
            with str() before being normalised.

    Returns:
        The MSISDN with surrounding whitespace and leading "+" removed, or
        None when neither source yields one.
    """
    if session_context:
        session_value = session_context.get("msisdn")
        if session_value is not None:
            # str() so a number stored as an int does not break .strip().
            msisdn = str(session_value).strip().lstrip("+")
            # A blank session value is treated as "not provided" so the
            # query text still gets a chance below.
            if msisdn:
                return msisdn

    if query:
        return parse_msisdn(query)

    return None


# Resolving User Identity

def resolve_user_identity(msisdn: str, db_path: str = "msisdn_mapping.db") -> Optional[dict]:
    """
    Look up the user mapped to an MSISDN in the SQLite mapping database.

    Args:
        msisdn: Number to look up; bound as a query parameter.
        db_path: Path of the SQLite file containing the msisdn_mapping table.

    Returns:
        {"user_index", "user_name", "msisdn"} for a known number, or None
        when the number is unknown or the query fails. SQLite failures are
        printed, not raised.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        row = conn.execute(
            "SELECT user_index, user_name FROM msisdn_mapping WHERE msisdn = ?",
            (msisdn,),
        ).fetchone()
    except sqlite3.Error as e:
        print(f"[ERROR] SQLite query failed: {e}")
        return None
    finally:
        # Close on every path; previously a failing execute() skipped close()
        # and left the connection open.
        if conn is not None:
            conn.close()

    if not row:
        return None

    return {
        "user_index": row[0],
        "user_name": row[1],
        "msisdn": msisdn,
    }

In [6]:
# Test: Resolving user identity based on MSISDN mapping database.

query = "Hi, I need help with my account."

# The query text carries no number, so the MSISDN has to come from the session.
session = {"msisdn": "9230610000463"}

print("Query:", query)

msisdn = get_msisdn_from_context(query=query, session_context=session)
print("\nExtracted MSISDN:", msisdn)

user_info = resolve_user_identity(msisdn)
print("\nResolved User Identity:", user_info)

Query: Hi, I need help with my account.

Extracted MSISDN: 9230610000463

Resolved User Indentity: {'user_index': 0, 'user_name': 'User 1', 'msisdn': '9230610000463'}


In [7]:
# Intent classifier (BAAI/bge-m3), configured with the project's intent labels and intent metadata map

classifier = BGEIntentClassifier(
    intent_metadata_map=INTENT_TO_METADATA,
    intent_labels=INTENT_LABELS,
)

print("\nIntent Classifier: BAAI/bge-m3")


Intent Classifier: BAAI/bge-m3


In [8]:
# Test: Resolving the intent behind provided queries.

# Plain strings: the parentheses around each entry never created tuples.
test_queries = [
    "What type of plan am I on?",
    "List my recent purchase history.",
    "What is the status of my latest complaint?",
    "How much data have I consumed?",
    "How long does it take to resolve billing issues?",
    "May I please speak to a human representative?",
]

for query in test_queries:
    result = classifier.classify(query)
    predicted, confidence, metadata = (
        result["intent"],
        result["score"],
        result["metadata_filter"],
    )

    # One report per query: separator, then query, intent with score, and filter.
    print("-" * 65)
    print(f"\nQuery:             {query}\n")
    print(f"Predicted Intent:  {predicted} (Confidence: {confidence})\n")
    print(f"Metadata Filter:   {metadata}\n")

-----------------------------------------------------------------

Query:             What type of plan am I on?

Predicted Intent:  user_profile (Confidence: 0.7343536019325256)

Metadata Filter:   {'category': 'consumer_data', 'type': 'User Data', 'section': 'user_profile'}

-----------------------------------------------------------------

Query:             List my recent purchase history.

Predicted Intent:  purchases_info (Confidence: 0.9043279886245728)

Metadata Filter:   {'category': 'consumer_data', 'type': 'User Data', 'section': 'purchases'}

-----------------------------------------------------------------

Query:             What is the status of my latest complaint?

Predicted Intent:  tickets_info (Confidence: 0.8533746004104614)

Metadata Filter:   {'category': 'consumer_data', 'type': 'User Data', 'section': 'tickets'}

-----------------------------------------------------------------

Query:             How much data have I consumed?

Predicted Intent:  cdrs_info (Co

In [9]:
# Retrieving Relevant Documents

def retrieve_relevant_docs(
    query: str,
    classifier,
    user_info: dict,
    vectorstore,
    k: int = 4,
    fallback_threshold: float = 0.65
) -> dict:
    """
    Classify the query, build a metadata filter, and retrieve matching context.

    Args:
        query: User question; used for classification and similarity search.
        classifier: Object whose classify(query) returns a dict with
            "intent", "score" and (optionally) "metadata_filter".
        user_info: Resolved identity; "user_index" is added to the filter
            when the filter category is "consumer_data".
        vectorstore: Object exposing similarity_search(query, k=..., filter=...).
        k: Number of documents to request.
        fallback_threshold: Scores below this value trigger the fallback.

    Returns:
        Dict with "query", "intent", "filters" (the filter as sent to the
        store, or None), "user_info", "confidence", "results" and
        "fallback_triggered". On fallback (intent "other" or low score) no
        search is performed and "results" is empty. Otherwise results are
        post-processed by compute_consumer_insights / compute_general_insights
        for those two categories, or returned as raw page contents.
    """
    intent_result = classifier.classify(query)
    intent_label = intent_result["intent"]
    confidence = intent_result["score"]

    # Work on a copy: the classifier may hand back a dict it reuses across
    # calls, and writing user_index into it would carry one user's scope
    # over into later classifications.
    filters = dict(intent_result.get("metadata_filter") or {})
    category = filters.get("category")

    if category == "consumer_data":
        filters["user_index"] = user_info["user_index"]

    fallback_triggered = intent_label == "other" or confidence < fallback_threshold

    # Chroma rejects "$and" with fewer than two expressions, so a single
    # condition is sent as a plain equality filter.
    if len(filters) > 1:
        wrapped_filter = {"$and": [{key: val} for key, val in filters.items()]}
    elif filters:
        wrapped_filter = dict(filters)
    else:
        wrapped_filter = None

    if fallback_triggered:
        raw_results = []
    elif wrapped_filter:
        raw_results = vectorstore.similarity_search(query, k=k, filter=wrapped_filter)
    else:
        raw_results = vectorstore.similarity_search(query, k=k)

    if category == "consumer_data" and not fallback_triggered:
        processed_results = compute_consumer_insights(raw_results)
    elif category == "general_insights" and not fallback_triggered:
        processed_results = compute_general_insights(raw_results)
    else:
        processed_results = [doc.page_content for doc in raw_results]

    return {
        "query": query,
        "intent": intent_label,
        "filters": wrapped_filter,
        "user_info": user_info,
        "confidence": confidence,
        "results": processed_results,
        "fallback_triggered": fallback_triggered
    }

In [10]:
# Test: Retrieving relevant user-specific documents based on query.

# One loop replaces three copy-pasted report blocks; the printed output is unchanged.
retrieval_queries = [
    "What type of plan am I on?",
    "What is the status of my recent complaint?",
    "How long does it take to resolve billing related issues?",
]

for position, query in enumerate(retrieval_queries):
    results = retrieve_relevant_docs(
        query=query,
        classifier=classifier,
        user_info=user_info,
        vectorstore=vectorstore,
        k=1
    )

    # Blank separator between consecutive reports, but not before the first one.
    if position > 0:
        print("\n")

    print("-" * 65)
    print(f"Query: {results['query']}")
    print("-" * 65)
    print(f"\nResolved User Identity:   {results['user_info']}")
    print(f"Resolved Metadata Filters:  {results['filters']}")
    print(f"Resolved Intent Label:      {results['intent']}")
    print(f"\nConfidence Score:         {results['confidence']:.4f}")
    print(f"Fallback Triggered:         {results['fallback_triggered']}")

    if results["fallback_triggered"] or not results["results"]:
        print("\nNo relevant documents found.")
    else:
        print("\nRetrieved Context:\n" + "-" * 18)
        print("\n".join(results["results"]))

-----------------------------------------------------------------
Query: What type of plan am I on?
-----------------------------------------------------------------

Resolved User Identity:   {'user_index': 0, 'user_name': 'User 1', 'msisdn': '9230610000463'}
Resolved Metadata Filters:  {'$and': [{'category': 'consumer_data'}, {'type': 'User Data'}, {'section': 'user_profile'}, {'user_index': 0}]}
Resolved Intent Label:      user_profile

Confidence Score:         0.7344
Fallback Triggered:         False

Retrieved Context:
------------------
The user's name is 1.
The user is located in Lahore.
The user has a Prepaid account.


-----------------------------------------------------------------
Query: What is the status of my recent complaint?
-----------------------------------------------------------------

Resolved User Identity:   {'user_index': 0, 'user_name': 'User 1', 'msisdn': '9230610000463'}
Resolved Metadata Filters:  {'$and': [{'category': 'consumer_data'}, {'type': 'User Da

In [11]:
# Format Retrieved Context into Structured LLM Input

def format_context(intent: str, docs: List[str], user_info: dict) -> str:
    """
    Render retrieved facts as the structured text block handed to the LLM.

    Args:
        intent: Intent label; selects the section title, with
            "Account Information" used for unknown labels.
        docs: Retrieved text snippets. Blank entries are ignored.
        user_info: Identity dict; "user_name" and "msisdn" are shown in the
            header and default to "Unknown" when missing.

    Returns:
        "[User: ... | MSISDN: ...]", a blank line, "<title>:" and the
        snippets one per line; or a fixed "no context" sentence when there
        is nothing to show.
    """
    # Clean first so an all-whitespace result is reported as "no context"
    # instead of producing a header with an empty body.
    cleaned_docs = [doc.strip() for doc in (docs or []) if doc and doc.strip()]
    if not cleaned_docs:
        return "No relevant context found for this request."

    user_info = user_info or {}
    user_name = user_info.get("user_name", "Unknown")
    msisdn = user_info.get("msisdn", "Unknown")
    user_identity = f"[User: {user_name} | MSISDN: {msisdn}]"

    title_map = {
        "user_profile": "User Profile",
        "cdrs_info": "CDRs",
        "purchases_info": "Purchase History",
        "tickets_info": "Support History",
        "regional_popularity": "Regional Popularity",
        "user_type_distribution": "User Type Distribution",
        "regional_user_type_distribution": "Regional User Type Distribution",
        "ticket_statistics": "Ticket Statistics",
        "resolution_times": "Average Ticket Resolution Times",
    }
    section_title = title_map.get(intent, "Account Information")

    raw_context = "\n".join(cleaned_docs)

    return (
        f"{user_identity}\n\n"
        f"{section_title}:\n"
        f"{raw_context}"
    )

In [12]:
# Complete RAG Pipeline Wrapper

def run_rag_pipeline(
    query: str,
    session_context: dict,
    classifier,
    vectorstore,
    db_path: str = "msisdn_mapping.db",
    k: int = 1,
    fallback_threshold: float = 0.65,
):
    """
    End-to-end pre-LLM step: identify the caller, retrieve the context that
    matches the query, and render it as the text passed to the LLM.

    Always returns a dict with the keys "intent", "formatted" and
    "user_info". "user_info" is an empty string when the caller could not be
    identified ("auth_required" / "user_not_found"), otherwise the resolved
    identity dict.
    """

    def _reply(intent, formatted, user_info):
        # Single place that fixes the shape (and key order) of every response.
        return {"intent": intent, "formatted": formatted, "user_info": user_info}

    # 1) An MSISDN is required before anything else can happen.
    msisdn = get_msisdn_from_context(query=query, session_context=session_context)
    if not msisdn:
        return _reply(
            "auth_required",
            "Unable to resolve user identity. Please provide your number for authentication.",
            "",
        )

    # 2) Map the MSISDN onto a known user.
    user_info = resolve_user_identity(msisdn, db_path=db_path)
    if not user_info:
        return _reply(
            "user_not_found",
            f"The SIM: {msisdn} was not found in the system.",
            "",
        )

    # 3) Intent classification + filtered retrieval.
    retrieval = retrieve_relevant_docs(
        query=query,
        classifier=classifier,
        user_info=user_info,
        vectorstore=vectorstore,
        k=k,
        fallback_threshold=fallback_threshold,
    )

    # 4) Nothing usable retrieved: tell the LLM to answer without context.
    has_context = bool(retrieval["results"]) and not retrieval["fallback_triggered"]
    if not has_context:
        return _reply(
            retrieval.get("intent", "fallback"),
            f"Directly Respond (No Context Available) – Query: {query}",
            user_info,
        )

    # 5) Render the retrieved facts into the structured LLM input.
    llm_input = format_context(
        intent=retrieval["intent"],
        docs=retrieval["results"],
        user_info=retrieval["user_info"],
    )
    return _reply(retrieval["intent"], llm_input, retrieval["user_info"])

In [13]:
# Test: Final Formatted and Structured Input for LLM

session = {"msisdn": "9230610000463"}

# One loop replaces five copy-pasted report blocks; the printed output is unchanged.
pipeline_queries = [
    "Where is my account registered?",
    "Give me the status of my latest support ticket.",
    "Summarize my overall consumption across all services.",
    "How long will it take to resolve my network issue complaint?",
    "Hello? Can I get some help?",
]

for query in pipeline_queries:
    results = run_rag_pipeline(
        query=query,
        session_context=session,
        classifier=classifier,
        vectorstore=vectorstore,
        k=1
    )

    print("=" * 65)
    print(f"Query: {query}")
    print("=" * 65)
    print("\nPre-LLM Structured Input:\n\n", results)
    print("\n")

Query: Where is my account registered?

Pre-LLM Structured Input:

 {'intent': 'user_profile', 'formatted': "[User: User 1 | MSISDN: 9230610000463]\n\nUser Profile:\nThe user's name is 1.\nThe user is located in Lahore.\nThe user has a Prepaid account.", 'user_info': {'user_index': 0, 'user_name': 'User 1', 'msisdn': '9230610000463'}}


Query: Give me the status of my latest support ticket.

Pre-LLM Structured Input:

 {'intent': 'tickets_info', 'formatted': '[User: User 1 | MSISDN: 9230610000463]\n\nSupport History:\nRecent Support History:\n- T44316: Ticket T44316 was logged on September 15, 2024 at 03:19 AM under Billing category. Description: Issue reported under Billing category. The ticket was resolved on September 16, 2024 at 10:19 PM with the following resolution: Resolved with detailed explanation for Billing category..\n- T74113: Ticket T74113 was logged on September 30, 2024 at 11:34 AM under Network Issue category. Description: Issue reported under Network Issue category. T