In [1]:
import os
from google.colab import drive
from google.colab import userdata

drive.mount("/content/drive")

HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("\nHugging Face Token successfully set.")

OPENAI_API_KEY = userdata.get("OPENAI_API")
os.environ["OPENAI_API"] = OPENAI_API_KEY

print("\nOpenAI API successfully set.\n")

%cd /content/drive/MyDrive/ES-CSA/data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Hugging Face Token successfully set.

OpenAI API successfully set.

/content/drive/MyDrive/ES-CSA/data


In [None]:
!pip install -q langchain langchain-community langchain-huggingface chromadb sentence-transformers

In [2]:
import json
import sqlite3
from chromadb import PersistentClient
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [3]:
# Initializing Embedding Model (all-MiniLM-L6-v2)
#
# NOTE(review): none of the storage cells visible in this notebook pass this
# object to the collection — confirm where it is meant to be used.

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

print(
    "\nModel: all-MiniLM-L6-v2",
    "\nReady for embedding generation.",
    sep="\n",
)


Model: all-MiniLM-L6-v2

Ready for embedding generation.


In [4]:
# Initializing Vector Store (ChromaDB)

# Relative path: resolved against the current working directory.
persist_dir = "embeddings/chromadb_embeddings"

# On-disk client, plus the single collection this notebook writes to
# (created on first use, reopened afterwards).
client = PersistentClient(path=persist_dir)
collection = client.get_or_create_collection(name="consumer_db")

print(
    "\nVector Store: ChromaDB",
    "\nReady for embedding storage.",
    sep="\n",
)


Vector Store: ChromaDB

Ready for embedding storage.


In [5]:
# Load Pre-processed Data
#
# Both inputs are UTF-8 JSON files under processed/ (relative to the current
# working directory).

with open("processed/consumer_data_formatted.json", "r", encoding="utf-8") as consumer_file:
    consumer_data = json.load(consumer_file)

with open("processed/general_insights_formatted.json", "r", encoding="utf-8") as insights_file:
    general_insights = json.load(insights_file)

print(
    "Successfully loaded pre-processed consumer data.",
    "\nSuccessfully loaded pre-processed general insights.",
    sep="\n",
)

Successfully loaded pre-processed consumer data.

Successfully loaded pre-processed general insights.


In [6]:
# Mapping MSISDN -> User
#
# Builds msisdn_to_user: {msisdn: {"user_index": ..., "user_name": ...}} from
# the msisdn_mapping table of the local SQLite file.

conn = sqlite3.connect("msisdn_mapping.db")
try:
    cursor = conn.cursor()
    cursor.execute("SELECT msisdn, user_index, user_name FROM msisdn_mapping")
    msisdn_to_user = {
        msisdn: {"user_index": user_index, "user_name": user_name}
        for msisdn, user_index, user_name in cursor.fetchall()
    }
finally:
    # Always release the connection, even when the query or the row mapping
    # fails (e.g. the table is missing); otherwise the handle stays open for
    # the lifetime of the kernel.
    conn.close()

# Text Cleaning Function

def clean_text(text: str) -> str:
    """Normalise newlines in ``text`` and trim surrounding whitespace.

    Literal backslash-n sequences (the two characters ``\\`` and ``n``) are
    turned into real newlines, every run of consecutive newlines is collapsed
    to a single newline, and leading/trailing whitespace is removed.

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned text with no blank lines.
    """
    # Unescape first, so newlines produced here also take part in the
    # collapsing step below.
    normalized = text.replace("\\n", "\n")

    # A single replace() pass halves a run of newlines at best; repeat until
    # no blank line is left.
    while "\n\n" in normalized:
        normalized = normalized.replace("\n\n", "\n")

    return normalized.strip()

# Preview the first five entries of the mapping.
# NOTE(review): this prints full MSISDNs into the saved notebook output —
# confirm the numbers are synthetic, or clear the output before sharing.
print("MSISDN-User Mapping:\n")
for msisdn in list(msisdn_to_user)[:5]:
    print(f"MSISDN: {msisdn} -> {msisdn_to_user[msisdn]}")

MSISDN-User Mapping:

MSISDN: 9230610000463 -> {'user_index': 0, 'user_name': 'User 1'}
MSISDN: 9230347659110 -> {'user_index': 1, 'user_name': 'User 2'}
MSISDN: 9230141002657 -> {'user_index': 2, 'user_name': 'User 3'}
MSISDN: 9230162731400 -> {'user_index': 3, 'user_name': 'User 4'}
MSISDN: 9230108284824 -> {'user_index': 4, 'user_name': 'User 5'}


## Embedding: **Consumer Data**

In [7]:
# Preprocessed Consumer Data
# Pretty-print the first user record to show the input structure.

first_record_json = json.dumps(consumer_data[0], indent=4)
print(first_record_json)

{
    "User Profile": "User 1 from Lahore is on a Prepaid plan.",
    "Call Detail Records": [
        "On 2024-10-26T22:44:08, user used 211 voice resources and was charged 10 PKR.",
        "On 2023-05-23T07:54:33, user used 34 data resources and was charged 0 PKR.",
        "On 2023-10-12T09:06:27, user used 36 voice resources and was charged 0 PKR.",
        "On 2024-01-21T13:20:44, user used 25 data resources and was charged 0 PKR.",
        "On 2023-07-31T16:27:19, user used 18 data resources and was charged 11 PKR.",
        "On 2023-12-24T01:07:33, user used 229 voice resources and was charged 5 PKR.",
        "On 2024-11-09T17:08:02, user used 12 sms resources and was charged 0 PKR."
    ],
    "Purchases": [
        "On 2024-11-06T15:50:55, user spent 912 PKR purchasing 4572 MB of browsing data, 836 MB for social media, 564 SMS, 416 on-net voice minutes, and 168 off-net voice minutes.",
        "On 2024-01-13T08:17:43, user spent 639 PKR purchasing 4545 MB of browsing data, 2

In [8]:
# Consumer Data Embedding: Structure-Aware Chunking
#
# Each user record is split into up to four Documents, one per section, so
# that every section can be stored and retrieved on its own. A section whose
# text is empty after stripping produces no Document.

# (JSON key, metadata "section" value, fallback used when the key is absent).
# The JSON key is also used as the heading line of the chunk text.
CONSUMER_SECTIONS = (
    ("User Profile", "user_profile", "No profile available"),
    ("Call Detail Records", "cdrs", []),
    ("Purchases", "purchases", []),
    ("Tickets", "tickets", []),
)

documents = []

for idx, user_item in enumerate(consumer_data):
    for heading, section, fallback in CONSUMER_SECTIONS:
        raw_value = user_item.get(heading, fallback)
        # The profile is a single string; the other sections are lists of
        # sentences that are joined one per line.
        if isinstance(raw_value, str):
            section_text = raw_value.strip()
        else:
            section_text = "\n".join(raw_value).strip()

        if not section_text:
            continue

        documents.append(
            Document(
                page_content=clean_text(f"{heading}:\n{section_text}"),
                metadata={
                    "type": "User Data",
                    "category": "consumer_data",
                    # Position of the record in consumer_data.
                    "user_index": idx,
                    "section": section,
                },
            )
        )

print("Sample:")
print("\nMetadata:", documents[0].metadata)
print("\n", documents[0].page_content)

Sample:

Metadata: {'type': 'User Data', 'category': 'consumer_data', 'user_index': 0, 'section': 'user_profile'}

 User Profile:
User 1 from Lahore is on a Prepaid plan.


In [9]:
# Storing Consumer Data Embeddings in ChromaDB
#
# Documents are written in batches rather than one call per document, and with
# upsert() rather than add(), so re-running this cell refreshes existing
# records instead of leaving stale ones under the same IDs.
#
# NOTE(review): no embeddings are passed here, so the collection's own
# embedding function computes them; `embedding_model` is not involved —
# confirm this is intended.

UPSERT_BATCH_SIZE = 500  # documents per upsert() call

for batch_start in range(0, len(documents), UPSERT_BATCH_SIZE):
    batch = documents[batch_start:batch_start + UPSERT_BATCH_SIZE]
    collection.upsert(
        # IDs are unique per (user_index, section) pair.
        ids=[
            f"consumer_data_{doc.metadata['user_index']}_{doc.metadata['section']}"
            for doc in batch
        ],
        documents=[doc.page_content for doc in batch],
        metadatas=[doc.metadata for doc in batch],
    )

print("\nEmbedded Consumer Data successfully stored in consumer_db.")


Embedded Consumer Data successfully stored in consumer_db.


In [10]:
# Sample Document and Embedding (Consumer Data)

print("Number of stored documents:", collection.count())

# Fetch a single consumer-data record together with its stored vector.
sample = collection.get(
    where={"category": "consumer_data"},
    limit=1,
    include=["documents", "embeddings"],
)

if not sample["documents"]:
    print("No documents found in the collection.")
else:
    print("\nSample Document:\n\n", sample["documents"][0])
    # Show only the first five components of the vector.
    print("\nSample Embedding:\n\n", sample["embeddings"][0][:5])

Number of stored documents: 2000

Sample Document:

 User Profile:
User 1 from Lahore is on a Prepaid plan.

Sample Embedding:

 [-0.01824937  0.0377876  -0.02205833  0.00484487 -0.07456089]


## Embedding: **General Insights**

In [11]:
# Preprocessed General Insights
# Print every insight category followed by its pretty-printed content.

for key, value in general_insights.items():
    print(f"\n{key}:\n", json.dumps(value, indent=4), sep="\n")


Regional Popularity:

[
    "The city of Quetta has 75 active users.",
    "The city of Lahore has 73 active users.",
    "The city of Peshawar has 62 active users.",
    "The city of Faisalabad has 61 active users.",
    "The city of Rawalpindi has 60 active users.",
    "The city of Islamabad has 59 active users.",
    "The city of Multan has 58 active users.",
    "The city of Karachi has 52 active users."
]

User Type Distribution:

[
    "There are 263 Postpaid users in the network.",
    "There are 237 Prepaid users in the network."
]

Regional User Type Distribution:

[
    "In Quetta, there are 33 postpaid users and 28 prepaid users.",
    "In Lahore, there are 32 postpaid users and 27 prepaid users.",
    "In Peshawar, there are 24 postpaid users and 28 prepaid users.",
    "In Faisalabad, there are 42 postpaid users and 31 prepaid users.",
    "In Rawalpindi, there are 27 postpaid users and 31 prepaid users.",
    "In Islamabad, there are 40 postpaid users and 22 prepaid use

In [12]:
# Processing/Chunking General Insights
#
# One Document per insight category: the category name followed by a colon is
# the heading line, and each insight sentence goes on its own line below it.

documents = [
    Document(
        page_content=clean_text("\n".join([f"{category}:", *insights_list]).strip()),
        metadata={
            "type": "General Insights",
            "category": "general_insights",
            "subcategory": category,
        },
    )
    for category, insights_list in general_insights.items()
]

print("Sample:")
print("\nMetadata:", documents[0].metadata)
print("\n" + documents[0].page_content)

Sample:

Metadata: {'type': 'General Insights', 'category': 'general_insights', 'subcategory': 'Regional Popularity'}

Regional Popularity:
The city of Quetta has 75 active users.
The city of Lahore has 73 active users.
The city of Peshawar has 62 active users.
The city of Faisalabad has 61 active users.
The city of Rawalpindi has 60 active users.
The city of Islamabad has 59 active users.
The city of Multan has 58 active users.
The city of Karachi has 52 active users.


In [13]:
# Storing General Insight Embeddings to ChromaDB
#
# All insight documents are written in a single upsert() call; using upsert()
# rather than add() means re-running this cell refreshes existing records
# instead of leaving stale ones under the same IDs.
#
# NOTE(review): no embeddings are passed here, so the collection's own
# embedding function computes them — confirm this is intended.

if documents:  # skip the call entirely when there is nothing to write
    collection.upsert(
        # One record per insight category, keyed by the category name.
        ids=[f"general_insights_{doc.metadata['subcategory']}" for doc in documents],
        documents=[doc.page_content for doc in documents],
        metadatas=[doc.metadata for doc in documents],
    )

print("\nEmbedded General Insights successfully stored in consumer_db.")


Embedded General Insights successfully stored in consumer_db.


In [14]:
# Sample Document and Embedding (General Insights)

print("Number of stored documents:", collection.count())

# Fetch a single general-insights record together with its stored vector.
sample = collection.get(
    where={"category": "general_insights"},
    limit=1,
    include=["documents", "embeddings"],
)

if not sample["documents"]:
    print("No documents found in the collection.")
else:
    print("\nSample Document:\n\n", sample["documents"][0])
    # Show only the first five components of the vector.
    print("\nSample Embedding:\n\n", sample["embeddings"][0][:5])

Number of stored documents: 2005

Sample Document:

 Regional Popularity:
The city of Quetta has 75 active users.
The city of Lahore has 73 active users.
The city of Peshawar has 62 active users.
The city of Faisalabad has 61 active users.
The city of Rawalpindi has 60 active users.
The city of Islamabad has 59 active users.
The city of Multan has 58 active users.
The city of Karachi has 52 active users.

Sample Embedding:

 [ 0.09470701 -0.05078129 -0.05272472  0.02903754 -0.07649574]
