In [1]:
import os
import sys
from google.colab import drive
from google.colab import userdata

drive.mount("/content/drive")

HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("Hugging Face Token successfully set.")

%cd /content/drive/MyDrive/ES-CSA/data/
sys.path.append('/content/drive/My Drive/ES-CSA/src')

Mounted at /content/drive
Hugging Face Token successfully set.
/content/drive/MyDrive/ES-CSA/data


In [2]:
!pip install numpy pandas



In [3]:
import ast
import json
import pandas as pd
import numpy as np

In [13]:
# Load Consumer Data

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("processed/consumer_data.json", "r", encoding="utf-8") as file:
    consumer_data = json.load(file)

# Flatten Nested JSON Fields
#
# Each user record carries optional nested lists ("CDRS", "Purchases",
# "Tickets"). Every nested entry becomes one flat row that also carries the
# user's identifying fields, so each table can be analysed on its own.

# NOTE(review): `users` is never populated; df_users is built straight from
# consumer_data below. Kept so any code that references the name still works.
users = []
cdrs = []
purchases = []
tickets = []

for user in consumer_data:
    # Identifying fields repeated on every flattened row of this user.
    base_info = {
        "MSISDN": user["MSISDN"],
        "Name": user["Name"],
        "City": user["City"],
        "User_Type": user["User_Type"]
    }

    # A missing nested key is treated as "no records of that kind".
    for cdr in user.get("CDRS", []):
        cdrs.append({**base_info, **cdr})

    for purchase in user.get("Purchases", []):
        purchases.append({**base_info, **purchase})

    for ticket in user.get("Tickets", []):
        tickets.append({**base_info, **ticket})

# df_users keeps the full user records (nested lists included); the other three
# frames hold one row per nested entry.
df_users = pd.DataFrame(consumer_data)
df_cdrs = pd.DataFrame(cdrs)
df_purchases = pd.DataFrame(purchases)
df_tickets = pd.DataFrame(tickets)

# Datetime Conversion

# Column(s) to parse per frame; a value is either one column name or a list of them.
datetime_columns = {
    "df_cdrs": "Datetime_Charged",
    "df_purchases": "Datetime",
    "df_tickets": ["Log_Time", "Resolution_Time"]
}

# Explicit name -> frame mapping, so the loop does not need to reach into globals().
frames_by_name = {
    "df_cdrs": df_cdrs,
    "df_purchases": df_purchases,
    "df_tickets": df_tickets,
}

for df_name, col in datetime_columns.items():
    frame = frames_by_name[df_name]
    for c in (col if isinstance(col, list) else [col]):
        # errors="coerce" turns unparseable values into NaT instead of raising.
        frame[c] = pd.to_datetime(frame[c], errors="coerce")

print('Data prepared for EDA.')

Data prepared for EDA.


In [17]:
# EDA on df_users: where the customers are and which plan type they hold.

# --- Customers per city, ordered by count (value_counts sorts descending) ---
city_counts = df_users["City"].value_counts()
regional_popularity = city_counts.rename_axis("City").reset_index(name="User_Count")
print("Regional Popularity Breakdown:\n", regional_popularity)

# --- Customers per user type ---
plan_counts = df_users["User_Type"].value_counts()
user_type_distribution = plan_counts.rename_axis("User_Type").reset_index(name="Count")
print("\nUser Type Distribution:\n", user_type_distribution)

# --- City x user type contingency table (rows: City, columns: User_Type) ---
regional_user_type_distribution = pd.crosstab(
    index=df_users["City"],
    columns=df_users["User_Type"],
)
print("\nRegional User Type Distribution:\n", regional_user_type_distribution)


Regional Popularity Breakdown:
          City  User_Count
0      Quetta          75
1      Lahore          73
2    Peshawar          62
3  Faisalabad          61
4  Rawalpindi          60
5   Islamabad          59
6      Multan          58
7     Karachi          52

User Type Distribution:
   User_Type  Count
0  Postpaid    263
1   Prepaid    237

Regional User Type Distribution:
 User_Type   Postpaid  Prepaid
City                         
Faisalabad        33       28
Islamabad         32       27
Karachi           24       28
Lahore            42       31
Multan            27       31
Peshawar          40       22
Quetta            35       40
Rawalpindi        30       30


In [20]:
# EDA on df_cdrs: per-user usage, spend and activity derived from the CDR rows.


def _sum_by_resource_type(cdr_frame, value_column):
    """Return one row per MSISDN with `value_column` summed per Resource_Type.

    Resource types with no rows for a user are filled with 0, and MSISDN is
    returned as an ordinary column rather than as the index.
    """
    wide = cdr_frame.pivot_table(
        values=value_column,
        index="MSISDN",
        columns="Resource_Type",
        aggfunc="sum",
        fill_value=0,
    )
    return wide.reset_index()


# --- Summed Resource_Value per resource type, per user ---
total_resources_per_user = _sum_by_resource_type(df_cdrs, "Resource_Value")
print("\nTotal Resources Consumed per Category by User:\n", total_resources_per_user)

# --- Summed Amount_Charged per resource type, per user ---
total_amount_spent_per_category = _sum_by_resource_type(df_cdrs, "Amount_Charged")
print("\nTotal Amount Spent per Resource Category by User:\n", total_amount_spent_per_category)

# --- Summed Amount_Charged across all resource types, per user ---
charged_by_user = df_cdrs.groupby("MSISDN")["Amount_Charged"].sum()
total_amount_spent = charged_by_user.reset_index(name="Total_Amount_Charged")
print("\nTotal Amount Spent on Resources by User:\n", total_amount_spent)

# --- Number of CDR rows per user ---
cdr_rows_by_user = df_cdrs.groupby("MSISDN").size()
user_activity = cdr_rows_by_user.rename("Transaction_Count").reset_index()
print("\nFrequency of Resource Consumption (User Activity):\n", user_activity)


Total Resources Consumed per Category by User:
 Resource_Type         MSISDN  Data  SMS  Voice
0              9230101039883   167   79    262
1              9230102123429   188  157      0
2              9230104488435   311  112    270
3              9230108284824     0   79    211
4              9230108807547   153  154    164
..                       ...   ...  ...    ...
495            9230996123791    65    0    464
496            9230996583212    35   65    476
497            9230996629579   182   60    333
498            9230998618451   370   68    298
499            9230999762562   278   93    216

[500 rows x 4 columns]

Total Amount Spent per Resource Category by User:
 Resource_Type         MSISDN  Data  SMS  Voice
0              9230101039883    24   24     33
1              9230102123429    19   19      0
2              9230104488435    20   11      8
3              9230108284824     0   65      0
4              9230108807547     6   47     16
..                       ... 

In [21]:
# EDA on df_purchases: how much and how often each user buys.

purchases_by_user = df_purchases.groupby("MSISDN")

# --- Summed purchase Amount per user ---
total_spent_per_user = purchases_by_user["Amount"].sum().reset_index(name="Total_Amount_Purchases")
print("Total Amount Spent per User:\n", total_spent_per_user)

# --- Number of purchase rows per user ---
total_purchases_per_user = (
    purchases_by_user.size()
    .rename("Total_Purchase_Transactions")
    .reset_index()
)
print("\nTotal Purchases by User:\n", total_purchases_per_user)

Total Amount Spent per User:
             MSISDN  Total_Amount_Purchases
0    9230101039883                    9287
1    9230102123429                    4195
2    9230104488435                    9423
3    9230108284824                    7354
4    9230108807547                    3167
..             ...                     ...
495  9230996123791                    5721
496  9230996583212                    3651
497  9230996629579                    9068
498  9230998618451                    5723
499  9230999762562                    5609

[500 rows x 2 columns]

Total Purchases by User:
             MSISDN  Total_Purchase_Transactions
0    9230101039883                           16
1    9230102123429                            6
2    9230104488435                           17
3    9230108284824                           14
4    9230108807547                            9
..             ...                          ...
495  9230996123791                           10
496  9230996583212 

In [23]:
# EDA on df_tickets: ticket volume per user, category mix and resolution time.
# This cell adds two columns to df_tickets itself: "Total_Tickets" and
# "Resolution_Duration_Hours".

# --- Tickets per user ---
# Write each user's non-null Ticket_ID count onto all of their rows, then keep
# one (MSISDN, Total_Tickets) pair per user.
ticket_ids_by_user = df_tickets.groupby("MSISDN")["Ticket_ID"]
df_tickets["Total_Tickets"] = ticket_ids_by_user.transform("count")
tickets_per_user = df_tickets.loc[:, ["MSISDN", "Total_Tickets"]].drop_duplicates()

print("\nTotal Tickets per User:\n", tickets_per_user)

# --- Ticket categories ranked by how often they occur ---
category_counts = df_tickets["Category"].value_counts()
ticket_category_counts = category_counts.rename_axis("Category").reset_index(name="Ticket_Count")

print("\nMost Frequent Ticket Categories:\n", ticket_category_counts)

# --- Mean time from logging to resolution, per category ---
# Make sure both timestamps are datetimes before subtracting them.
for time_col in ("Log_Time", "Resolution_Time"):
    df_tickets[time_col] = pd.to_datetime(df_tickets[time_col])

time_to_resolve = df_tickets["Resolution_Time"] - df_tickets["Log_Time"]
# Seconds -> hours.
df_tickets["Resolution_Duration_Hours"] = time_to_resolve.dt.total_seconds() / 3600

avg_resolution_time = (
    df_tickets.groupby("Category")["Resolution_Duration_Hours"]
    .mean()
    .reset_index(name="Avg_Resolution_Hours")
)

print("\nAverage Resolution Hours per Ticket Category:\n", avg_resolution_time)


Total Tickets per User:
              MSISDN  Total_Tickets
0     9230610000463             10
10    9230347659110             11
21    9230141002657              6
27    9230162731400              7
34    9230108284824              7
...             ...            ...
4937  9230521821189             12
4949  9230745590992             15
4964  9230678073159             15
4979  9230644375655              7
4986  9230620961607             14

[500 rows x 2 columns]

Most Frequent Ticket Categories:
             Category  Ticket_Count
0      Network Issue          1018
1    General Inquiry          1015
2  Technical Support          1013
3            Billing          1009
4          Complaint           945

Average Resolution Hours per Ticket Category:
             Category  Avg_Resolution_Hours
0            Billing             36.513380
1          Complaint             37.357672
2    General Inquiry             37.310345
3      Network Issue             36.795678
4  Technical Support  

In [27]:
# Storing EDA Insights
#
# Builds one dictionary holding
#   * a per-user entry keyed by MSISDN, merged from every MSISDN-indexed table, and
#   * a handful of dataset-wide summaries under fixed string keys,
# then writes it to processed/consumer_insights.json.

# total_resources_per_user and total_amount_spent_per_category share the column
# names Data / SMS / Voice. The per-user merge below is a plain dict.update, so
# without distinct names the amount-charged values (merged later) would replace
# the usage totals. Suffixing the usage totals keeps both in the output; the
# bare Data / SMS / Voice keys hold the amounts charged.
resources_consumed_per_user = total_resources_per_user.set_index("MSISDN").add_suffix("_Consumed")

eda_insights = [
    regional_popularity.set_index("City"),
    user_type_distribution.set_index("User_Type"),
    regional_user_type_distribution,
    resources_consumed_per_user,
    total_amount_spent_per_category.set_index("MSISDN"),
    total_amount_spent.set_index("MSISDN"),
    user_activity.set_index("MSISDN"),
    total_spent_per_user.set_index("MSISDN"),
    total_purchases_per_user.set_index("MSISDN"),
    tickets_per_user.set_index("MSISDN"),
    ticket_category_counts.set_index("Category"),
    avg_resolution_time.set_index("Category")
]

consumer_insights = {}

# Merge every per-user table into a single dict per MSISDN; tables indexed by
# something else (City, User_Type, Category) are handled separately below.
for insight_table in eda_insights:
    if "MSISDN" not in insight_table.index.names:
        continue
    for msisdn, row in insight_table.iterrows():
        consumer_insights.setdefault(msisdn, {}).update(row.to_dict())

# Dataset-wide summaries, stored next to the per-user entries.
consumer_insights["regional_popularity"] = regional_popularity.to_dict(orient="records")
consumer_insights["user_type_distribution"] = user_type_distribution.to_dict(orient="records")
consumer_insights["regional_user_type_distribution"] = regional_user_type_distribution.to_dict()
consumer_insights["ticket_category_counts"] = ticket_category_counts.to_dict(orient="records")
consumer_insights["avg_resolution_time_per_category"] = avg_resolution_time.to_dict(orient="records")

insights_file_path = "processed/consumer_insights.json"
with open(insights_file_path, "w", encoding="utf-8") as json_file:
    json.dump(consumer_insights, json_file, indent=4)

print("Consumer insights saved as JSON file.")

Consumer insights saved as JSON file.
