In [1]:
import os
import sys
from google.colab import drive
from google.colab import userdata

drive.mount("/content/drive")

HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("Hugging Face Token successfully set.")

%cd /content/drive/MyDrive/ES-CSA/
sys.path.append('/content/drive/My Drive/ES-CSA/src')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Hugging Face Token successfully set.
/content/drive/MyDrive/ES-CSA


In [2]:
!pip install numpy pandas



In [3]:
import json
import pandas as pd
import numpy as np
from dataintegrator import DataIntegrator

In [4]:
# Loading Data

def _load_table(path, columns):
    """Read a raw CSV and return the requested columns with normalized headers.

    Header whitespace is stripped *before* the columns are selected, so a
    header with stray leading/trailing spaces is still found. Spaces inside
    the kept column names are then replaced by underscores.

    Args:
        path: Location of the CSV file.
        columns: Column names to keep, written as in the file (with spaces).

    Returns:
        An independent DataFrame holding only ``columns``, in that order,
        renamed so that every space is an underscore.
    """
    table = pd.read_csv(path)
    table.columns = table.columns.str.strip()
    table = table[columns].copy()
    # Fixing Spaced Columns
    table.columns = [name.replace(" ", "_") for name in table.columns]
    return table


userprofile = _load_table(
    "data/raw/UserProfile.csv",
    ["MSISDN", "Name", "City", "User Type"],
)
cdrs = _load_table(
    "data/raw/CDRS.csv",
    ["MSISDN", "Amount Charged", "Resources Consumed", "Datetime Charged"],
)
purchases = _load_table(
    "data/raw/Purchases.csv",
    [
        "MSISDN",
        "Data Browsing Allowance",
        "SMS Allowance",
        "Voice On-Net Allowance",
        "Voice Off-Net Allowance",
        "Data Social Allowance",
        "Datetime",
        "Amount",
    ],
)
tickets = _load_table(
    "data/raw/Tickets.csv",
    ["MSISDN", "Ticket ID", "Log Time", "Resolution Time", "Category", "Description", "Resolutions"],
)

# Splitting "Resources_Consumed" from CDRS into "Resource_Value" and "Resource_Type"

def categorize_resources(resource):
    """Split a "Resources Consumed" entry into ``(value, resource_type)``.

    The type is detected case-insensitively, in this order: ``"sms"`` ->
    ``"SMS"``, ``"mb data"`` -> ``"Data"``, ``"seconds voice call"`` ->
    ``"Voice"``. The value is the first whitespace-separated token, with a
    comma accepted as the decimal separator.

    Args:
        resource: Raw cell content, e.g. ``"211 MB Data"``. Non-string input
            (such as the NaN pandas uses for an empty cell) is tolerated.

    Returns:
        A ``(value, resource_type)`` tuple. ``value`` is an ``int`` for whole
        numbers, a ``float`` for fractional ones, and ``None`` when it cannot
        be parsed. Unrecognized or non-string input gives ``(None, "Other")``.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(resource, str):
        return None, "Other"

    text = resource.strip().lower()

    if "sms" in text:
        resource_type = "SMS"
    elif "mb data" in text:
        resource_type = "Data"
    elif "seconds voice call" in text:
        resource_type = "Voice"
    else:
        return None, "Other"

    # `text` is non-empty here because it contains one of the keywords above.
    token = text.split()[0].replace(",", ".")

    # Try int first so whole numbers keep their exact integer value; fall back
    # to float because "1,5" becomes "1.5", which int() rejects.
    try:
        return int(token), resource_type
    except ValueError:
        pass
    try:
        return float(token), resource_type
    except ValueError:
        # Known resource type, but the leading token is not a number.
        return None, resource_type

# Expand each "Resources_Consumed" entry into its numeric value and its type.
cdrs[["Resource_Value", "Resource_Type"]] = cdrs["Resources_Consumed"].apply(
    lambda x: pd.Series(categorize_resources(x))
)

# Selecting the final column order also discards "Resources_Consumed", so no
# separate in-place drop is needed; .copy() keeps the result independent.
cdrs = cdrs[["MSISDN", "Resource_Value", "Resource_Type", "Amount_Charged", "Datetime_Charged"]].copy()

# Converting Date/Time Columns

cdrs["Datetime_Charged"] = pd.to_datetime(cdrs["Datetime_Charged"])
purchases["Datetime"] = pd.to_datetime(purchases["Datetime"])
for time_column in ("Log_Time", "Resolution_Time"):
    tickets[time_column] = pd.to_datetime(tickets[time_column])

# Converting Numeric Columns (values that cannot be parsed become NaN)

cdrs["Amount_Charged"] = pd.to_numeric(cdrs["Amount_Charged"], errors="coerce")

purchase_numeric_columns = [
    "Data_Browsing_Allowance",
    "SMS_Allowance",
    "Voice_On-Net_Allowance",
    "Voice_Off-Net_Allowance",
    "Data_Social_Allowance",
    "Amount",
]
for numeric_column in purchase_numeric_columns:
    purchases[numeric_column] = pd.to_numeric(purchases[numeric_column], errors="coerce")


# Summary of every table: column list followed by the dtype/non-null report.
summary_tables = [
    ("User Profile", userprofile),
    ("CDRS", cdrs),
    ("Purchases", purchases),
    ("Tickets", tickets),
]
for position, (label, table) in enumerate(summary_tables):
    separator = "" if position == 0 else "\n"
    print(f"{separator}{label}:", table.columns.tolist())
    print()
    # DataFrame.info() prints its report and returns None, so wrapping it in
    # print() would add a stray "None" line after every report.
    table.info()

User Profile: ['MSISDN', 'Name', 'City', 'User_Type']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MSISDN     500 non-null    int64 
 1   Name       500 non-null    object
 2   City       500 non-null    object
 3   User_Type  500 non-null    object
dtypes: int64(1), object(3)
memory usage: 15.8+ KB
None

CDRS: ['MSISDN', 'Resource_Value', 'Resource_Type', 'Amount_Charged', 'Datetime_Charged']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   MSISDN            5000 non-null   int64         
 1   Resource_Value    5000 non-null   int64         
 2   Resource_Type     5000 non-null   object        
 3   Amount_Charged    5000 non-null   int64         
 4   Datetime_Charged  5000 n

In [5]:
# Saving Processed Data

PROCESSED_DIR = "data/processed"

# to_csv does not create missing folders, so make sure the target exists
# (a no-op when the folder is already there).
os.makedirs(PROCESSED_DIR, exist_ok=True)

processed_tables = {
    "userprofile": userprofile,
    "cdrs": cdrs,
    "purchases": purchases,
    "tickets": tickets,
}
for table_name, table in processed_tables.items():
    table.to_csv(f"{PROCESSED_DIR}/{table_name}.csv", index=False)

print('Processed data has been successfully saved.')

Processed data has been successfully saved.


In [6]:
# Integrating consumer data into a nested/structured form with DataIntegrator.
# (The DataIntegrator class lives in the src folder of the GitHub repository.)

data_integrator = DataIntegrator(userprofile, cdrs, purchases, tickets)
nested_data = data_integrator.integrate_data()

# Record-oriented view of the same table: a list with one dict per row.
nested_dict = nested_data.to_dict(orient="records")

integrated_columns = nested_data.columns.tolist()
preview_rows = nested_data.head()

# end="\n\n" reproduces the blank separator line between the two outputs.
print("Integrated Consumer Data:\n", integrated_columns, end="\n\n")
print(preview_rows)

Integrated Consumer Data:
 ['MSISDN', 'Name', 'City', 'User_Type', 'CDRS', 'Purchases', 'Tickets']

          MSISDN    Name       City User_Type  \
0  9230610000463  User 1     Lahore   Prepaid   
1  9230347659110  User 2     Quetta  Postpaid   
2  9230141002657  User 3    Karachi   Prepaid   
3  9230162731400  User 4    Karachi  Postpaid   
4  9230108284824  User 5  Islamabad   Prepaid   

                                                CDRS  \
0  [{'Amount_Charged': 10, 'Resource_Value': 211,...   
1  [{'Amount_Charged': 0, 'Resource_Value': 295, ...   
2  [{'Amount_Charged': 18, 'Resource_Value': 45, ...   
3  [{'Amount_Charged': 0, 'Resource_Value': 31, '...   
4  [{'Amount_Charged': 0, 'Resource_Value': 53, '...   

                                           Purchases  \
0  [{'Datetime': 2024-11-06 15:50:55, 'Amount': 9...   
1  [{'Datetime': 2024-01-31 06:41:43, 'Amount': 2...   
2  [{'Datetime': 2024-06-11 02:16:48, 'Amount': 1...   
3  [{'Datetime': 2024-05-17 10:32:39, 'Amoun

In [17]:
# Write the integrated, nested consumer table to consumer_data.csv
# (row index omitted).

consumer_csv_path = "data/processed/consumer_data.csv"
nested_data.to_csv(consumer_csv_path, index=False)

print("Saved consumer data in .csv format.")

Saved consumer data in .csv format.


In [18]:
# Saving Hierarchical Consumer Data as consumer_data.JSON

def convert_timestamps(obj):
    """``default`` hook for ``json.dump``: convert values json cannot encode.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        * ``None`` for ``pd.NaT`` (a missing timestamp), written as ``null``.
        * An ISO 8601 string (YYYY-MM-DDTHH:MM:SS) for ``pd.Timestamp``.
        * The equivalent built-in Python value for NumPy scalars.

    Raises:
        TypeError: For any other type. Returning the object unchanged would
            make the encoder report a misleading "Circular reference
            detected" error instead.
    """
    # Checked first and by identity: NaT stands for "no timestamp".
    if obj is pd.NaT:
        return None
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()  # Convert to ISO 8601 string (YYYY-MM-DDTHH:MM:SS)
    if isinstance(obj, np.generic):
        # e.g. np.int64 -> int, np.bool_ -> bool
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Serialize the record list as indented JSON; values the encoder cannot
# handle natively are passed through convert_timestamps.
consumer_json_path = "data/processed/consumer_data.json"

with open(consumer_json_path, mode="w") as json_file:
    json.dump(
        nested_dict,
        json_file,
        indent=4,
        default=convert_timestamps,
    )

print('Saved heirarchical consumer data in .JSON format.')

Saved heirarchical consumer data in .JSON format.
