In [None]:
import os
from google.colab import drive
from google.colab import userdata

# Mount Google Drive into the Colab runtime at /content/drive so the project
# data directory used below is reachable.
drive.mount("/content/drive")

# Fetch the Hugging Face token through google.colab.userdata and export it as an
# environment variable, so the secret itself never appears in the notebook.
HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("\nHugging Face Token successfully set.")

# Same pattern for the OpenAI key, stored under the name "OPENAI_API".
# NOTE(review): the key is exported as OPENAI_API, whereas the official OpenAI
# client reads OPENAI_API_KEY by default — confirm downstream code reads
# OPENAI_API explicitly.
OPENAI_API_KEY = userdata.get("OPENAI_API")
os.environ["OPENAI_API"] = OPENAI_API_KEY

print("\nOpenAI API successfully set.\n")

# Change the working directory to the project's data folder; the later cells
# open files through paths relative to it (e.g. "processed/...").
%cd /content/drive/MyDrive/ES-CSA/data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Hugging Face Token successfully set.

OpenAI API successfully set.

/content/drive/MyDrive/ES-CSA/data


In [None]:
import json
import pandas as pd

In [None]:
# Processing Consumer_Data_Summary for Embeddings

# Path is relative to the data directory selected with %cd in the setup cell.
consumer_data_path = "processed/consumer_data_summary.json"

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, matching how the general-insights summary is read in this notebook.
with open(consumer_data_path, "r", encoding="utf-8") as f:
    # The loop further down iterates over this object and indexes each element
    # by "User Profile", "CDRS", "Purchases" and "Tickets".
    consumer_data = json.load(f)

def format_user_profile(user):
    """Summarise a user's profile as a single sentence.

    Args:
        user: Mapping with a 'User Profile' entry that itself provides
            'Name', 'City' and 'Plan Type'.

    Returns:
        A string of the form "<Name> from <City> is on a <Plan Type> plan."
    """
    profile = user['User Profile']
    name = profile['Name']
    city = profile['City']
    plan = profile['Plan Type']
    return f"{name} from {city} is on a {plan} plan."

def format_cdrs(cdrs):
    """Turn call detail records into one descriptive sentence per record.

    Args:
        cdrs: Iterable of mappings providing 'Date', 'Resource Used' and
            'Charge (PKR)'.

    Returns:
        A list of sentences, in the same order as the input records.
    """
    sentences = []
    for cdr in cdrs:
        when = cdr['Date']
        usage = cdr['Resource Used']
        charge = cdr['Charge (PKR)']
        sentences.append(
            f"On {when}, user used {usage} resources and was charged {charge} PKR."
        )
    return sentences

def format_purchases(purchases):
    """Turn purchase history entries into one descriptive sentence each.

    Args:
        purchases: Iterable of mappings providing 'Date',
            'Amount Spent (PKR)', 'Data Browsing (MB)', 'Data Social (MB)',
            'SMS', 'Voice On-Net (min)' and 'Voice Off-Net (min)'.

    Returns:
        A list of sentences, in the same order as the input entries.
    """
    def describe(item):
        # Build the sentence clause by clause, reading the fields left to right.
        spend = f"On {item['Date']}, user spent {item['Amount Spent (PKR)']} PKR purchasing "
        data = (
            f"{item['Data Browsing (MB)']} MB of browsing data, "
            f"{item['Data Social (MB)']} MB for social media, "
        )
        texts_and_on_net = f"{item['SMS']} SMS, {item['Voice On-Net (min)']} on-net voice minutes, and "
        off_net = f"{item['Voice Off-Net (min)']} off-net voice minutes."
        return spend + data + texts_and_on_net + off_net

    return list(map(describe, purchases))

def format_tickets(tickets):
    """Turn customer support tickets into one descriptive passage each.

    Args:
        tickets: Iterable of mappings providing 'Ticket ID', 'Logged Time',
            'Category', 'Description', 'Resolved Time' and 'Resolution'.

    Returns:
        A list of passages, in the same order as the input tickets.
    """
    passages = []
    for item in tickets:
        opened = (
            f"Ticket {item['Ticket ID']} was logged on {item['Logged Time']} "
            f"under {item['Category']} category. "
        )
        # The description is inserted verbatim; no punctuation is added after it.
        body = f"Description: {item['Description']} The ticket was resolved on {item['Resolved Time']} "
        outcome = f"with the following resolution: {item['Resolution']}."
        passages.append(opened + body + outcome)
    return passages

# One structured entry per user: a profile sentence plus lists of sentences for
# call detail records, purchases and support tickets.
user_representations = [
    {
        "User Profile": format_user_profile(record),
        "Call Detail Records": format_cdrs(record["CDRS"]),
        "Purchases": format_purchases(record["Purchases"]),
        "Tickets": format_tickets(record["Tickets"])
    }
    for record in consumer_data
]

# Preview only the first entry as a sanity check.
separator = '-' * 80
for position, entry in enumerate(user_representations[:1], start=1):
    print(f"User {position} Details:\n{entry}\n{separator}")

User 1 Details:
{'User Profile': 'User 1 from Lahore is on a Prepaid plan.', 'Call Detail Records': ['On 2024-10-26T22:44:08, user used 211 voice resources and was charged 10 PKR.', 'On 2023-05-23T07:54:33, user used 34 data resources and was charged 0 PKR.', 'On 2023-10-12T09:06:27, user used 36 voice resources and was charged 0 PKR.', 'On 2024-01-21T13:20:44, user used 25 data resources and was charged 0 PKR.', 'On 2023-07-31T16:27:19, user used 18 data resources and was charged 11 PKR.', 'On 2023-12-24T01:07:33, user used 229 voice resources and was charged 5 PKR.', 'On 2024-11-09T17:08:02, user used 12 sms resources and was charged 0 PKR.'], 'Purchases': ['On 2024-11-06T15:50:55, user spent 912 PKR purchasing 4572 MB of browsing data, 836 MB for social media, 564 SMS, 416 on-net voice minutes, and 168 off-net voice minutes.', 'On 2024-01-13T08:17:43, user spent 639 PKR purchasing 4545 MB of browsing data, 2373 MB for social media, 488 SMS, 387 on-net voice minutes, and 175 off-net 

In [None]:
# Saving Formatted Consumer Data Summary

# Path is relative to the data directory selected with %cd in the setup cell.
consumer_data_formatted_path = "processed/consumer_data_formatted.json"

# Write with an explicit UTF-8 encoding, consistent with how the formatted
# general-insights file is written in this notebook.
with open(consumer_data_formatted_path, "w", encoding="utf-8") as f:
    json.dump(user_representations, f, indent=4)

# Plain string: the message has no placeholders, so no f-string is needed.
print("Saved formatted consumer data summary.")

Saved formatted consumer data summary.


In [None]:
# Processing General_Insights_Summary for Embeddings

general_insights_path = "processed/general_insights_summary.json"

# Parse the whole document first, then keep only its "General Insights" section.
with open(general_insights_path, "r", encoding="utf-8") as insights_file:
    insights_document = json.load(insights_file)

general_insights = insights_document["General Insights"]

def format_regional_popularity(region_data):
    """Describe the number of active users per city, one sentence per entry.

    Args:
        region_data: Iterable of mappings providing 'City' and 'User_Count'.

    Returns:
        A list of sentences, in the same order as the input entries.
    """
    sentences = []
    for row in region_data:
        city = row['City']
        users = row['User_Count']
        sentences.append(f"The city of {city} has {users} active users.")
    return sentences

def format_user_type_distribution(user_type_data):
    """Describe how many users belong to each user type.

    Args:
        user_type_data: Iterable of mappings providing 'Count' and
            'User_Type'.

    Returns:
        A list of sentences, in the same order as the input entries.
    """
    def describe(row):
        count = row['Count']
        kind = row['User_Type']
        return f"There are {count} {kind} users in the network."

    return [describe(row) for row in user_type_data]

def format_regional_user_type_distribution(region_data, city_data):
    """Describe the postpaid/prepaid split of each city, one sentence per row.

    Earlier, row ``i`` of ``region_data`` was labelled with
    ``city_data[i]['City']``. That is only right when both lists share the same
    ordering, and the notebook's recorded output shows they do not: the row
    printed as "Quetta" sums to 61 users while Quetta has 75. The city is now
    resolved per row instead of by position:

    1. If the row carries its own 'City' value, that value is used.
    2. Otherwise the row is matched to the single city whose 'User_Count'
       equals the row's ``Postpaid + Prepaid`` total.

    NOTE(review): step 2 assumes every user is either postpaid or prepaid, so
    that the two counts add up to 'User_Count'. The recorded output is
    consistent with this, but confirm against the code that produces the
    summary; emitting a city label in each row there would be more robust.

    Args:
        region_data: Iterable of mappings providing 'Postpaid' and 'Prepaid'
            counts, and optionally 'City'.
        city_data: Iterable of mappings providing 'City' and 'User_Count'.

    Returns:
        A list of sentences, in the same order as ``region_data``.

    Raises:
        ValueError: If a row has no 'City' and its total matches no city, or
            more than one city, so it cannot be labelled reliably.
    """
    # Index city names by user count; a list per count exposes ties.
    cities_by_total = {}
    for city_entry in city_data:
        cities_by_total.setdefault(city_entry["User_Count"], []).append(city_entry["City"])

    sentences = []
    for entry in region_data:
        city = entry.get("City")
        if city is None:
            total = entry["Postpaid"] + entry["Prepaid"]
            candidates = cities_by_total.get(total, [])
            if len(candidates) != 1:
                # Refuse to guess: a wrong label would silently publish false
                # statistics for a city.
                raise ValueError(
                    f"Cannot determine the city for a row with {total} users: "
                    f"{len(candidates)} cities have that user count. "
                    "Include a 'City' key in each regional user type entry."
                )
            city = candidates[0]
        sentences.append(
            f"In {city}, there are {entry['Postpaid']} postpaid users and {entry['Prepaid']} prepaid users."
        )
    return sentences

def format_most_common_tickets(ticket_data):
    """Describe the number of support tickets per category.

    Args:
        ticket_data: Iterable of mappings providing 'Category' and
            'Ticket_Count'.

    Returns:
        A list of sentences, in the same order as the input entries.
    """
    sentences = []
    for row in ticket_data:
        category = row['Category']
        count = row['Ticket_Count']
        sentences.append(f"The '{category}' category has {count} support tickets.")
    return sentences

def format_resolution_time(resolution_data):
    """Describe the average ticket resolution time per category.

    Args:
        resolution_data: Iterable of mappings providing 'Category' and
            'Avg_Resolution_Hours'.

    Returns:
        A list of sentences, in the same order as the input entries, with the
        hours rounded to two decimal places.
    """
    def describe(row):
        category = row['Category']
        hours = round(row['Avg_Resolution_Hours'], 2)
        return f"The average resolution time for '{category}' tickets is {hours} hours."

    return list(map(describe, resolution_data))

# Assemble the sentence lists section by section, keeping the same key order as
# the summary is meant to be printed and saved in.
formatted_general_insights = {}
formatted_general_insights["Regional Popularity"] = format_regional_popularity(
    general_insights["Regional Popularity"]
)
formatted_general_insights["User Type Distribution"] = format_user_type_distribution(
    general_insights["User Type Distribution"]
)
# This formatter also receives the regional popularity list, from which it
# takes the city names.
formatted_general_insights["Regional User Type Distribution"] = format_regional_user_type_distribution(
    general_insights["Regional User Type Distribution"],
    general_insights["Regional Popularity"]
)
formatted_general_insights["Most Common Ticket Categories"] = format_most_common_tickets(
    general_insights["Most Common Ticket Categories"]
)
formatted_general_insights["Average Resolution Time Per Ticket Category"] = format_resolution_time(
    general_insights["Average Resolution Time Per Ticket Category"]
)

print(json.dumps(formatted_general_insights, indent=4))

{
    "Regional Popularity": [
        "The city of Quetta has 75 active users.",
        "The city of Lahore has 73 active users.",
        "The city of Peshawar has 62 active users.",
        "The city of Faisalabad has 61 active users.",
        "The city of Rawalpindi has 60 active users.",
        "The city of Islamabad has 59 active users.",
        "The city of Multan has 58 active users.",
        "The city of Karachi has 52 active users."
    ],
    "User Type Distribution": [
        "There are 263 Postpaid users in the network.",
        "There are 237 Prepaid users in the network."
    ],
    "Regional User Type Distribution": [
        "In Quetta, there are 33 postpaid users and 28 prepaid users.",
        "In Lahore, there are 32 postpaid users and 27 prepaid users.",
        "In Peshawar, there are 24 postpaid users and 28 prepaid users.",
        "In Faisalabad, there are 42 postpaid users and 31 prepaid users.",
        "In Rawalpindi, there are 27 postpaid users and 3

In [None]:
# Saving Formatted General Insights Summary

# Path is relative to the data directory selected with %cd in the setup cell.
general_insights_formatted_path = "processed/general_insights_formatted.json"

with open(general_insights_formatted_path, "w", encoding="utf-8") as f:
    json.dump(formatted_general_insights, f, indent=4)

# Plain string: the message has no placeholders, so no f-string is needed.
print("Saved formatted general insights summary.")

Saved formatted general insights summary.
