# Data Preparation & Wrangling
***

In [1]:
import pandas as pd
import re

In [2]:
import os
# NOTE: os.chdir changes the working directory for the whole kernel session, so the
# relative file names below (and any later relative paths) resolve against this folder.
os.chdir('/Users/qas/Desktop/evamp&saanga/CSA/ES-CSA/data/raw')

# Load Raw Data: read the four CSV files in order, one DataFrame per file.
cdrs, purchases, tickets, userprofile = map(
    pd.read_csv,
    ('CDRS.csv', 'Purchases.csv', 'Tickets.csv', 'UserProfile.csv'),
)

In [3]:
# CDRS

# Parse the 'Datetime Charged' values into datetimes; the explicit format tells
# to_datetime to expect the 'YYYY-MM-DD HH:MM:SS' layout.
cdrs['Datetime Charged'] = cdrs['Datetime Charged'].pipe(
    pd.to_datetime, format='%Y-%m-%d %H:%M:%S'
)

# Standardize Resource Type
def classify_resource(resource_str):
    """Map a free-text resource description to a standard resource type.

    Parameters
    ----------
    resource_str : str
        Text taken from the 'Resources Consumed' column. Non-string values
        (for example NaN or None from an empty cell) are accepted and
        classified as 'Unknown' instead of raising TypeError.

    Returns
    -------
    str
        'SMS', 'Voice Call', 'Data', or 'Unknown' when no marker matches.
        Matching is case-sensitive and checks the markers in that order.
    """
    # A missing cell arrives as float NaN / None; the `in` checks below only
    # work on strings, so bail out before they can raise.
    if not isinstance(resource_str, str):
        return 'Unknown'

    if 'SMS' in resource_str:
        return 'SMS'
    elif 'seconds voice call' in resource_str:
        return 'Voice Call'
    elif 'MB data' in resource_str:
        return 'Data'
    else:
        return 'Unknown'

# Label every row with the resource type derived from its 'Resources Consumed' text.
cdrs['Resource Type'] = cdrs['Resources Consumed'].map(classify_resource)

# Extract Resource Value
def extract_resource_value(resource_str):
    """Return the first run of digits in a resource description as an int.

    Parameters
    ----------
    resource_str : str
        Text taken from the 'Resources Consumed' column. Non-string values
        (for example NaN or None from an empty cell) are accepted and yield
        None instead of raising TypeError.

    Returns
    -------
    int or None
        The first contiguous digit sequence converted to int, or None when
        the input is not a string or contains no digits. Only the leading
        integer run is captured, so any fractional part after a decimal
        point is ignored.
    """
    # re.search requires a string; treat missing cells as "no value".
    if not isinstance(resource_str, str):
        return None

    match = re.search(r'\d+', resource_str)
    return int(match.group()) if match else None

# Pull the numeric amount out of each 'Resources Consumed' entry.
cdrs['Resource Value'] = cdrs['Resources Consumed'].map(extract_resource_value)

# The free-text column has now been split into 'Resource Type' and
# 'Resource Value', so remove it from the frame in place.
del cdrs['Resources Consumed']

cdrs.head()

Unnamed: 0,Amount Charged,Datetime Charged,MSISDN,Resource Type,Resource Value
0,6,2023-12-09 19:21:06,9230250373229,SMS,5
1,0,2023-08-08 09:11:57,9230164210176,SMS,1
2,0,2024-07-06 07:27:35,9230213698765,SMS,37
3,13,2024-04-23 06:25:25,9230854438776,SMS,45
4,16,2023-01-16 23:35:17,9230751286997,Voice Call,240


In [4]:
# Purchases

# Parse the purchase 'Datetime' values into datetimes using the fixed
# 'YYYY-MM-DD HH:MM:SS' layout.
purchases['Datetime'] = purchases['Datetime'].pipe(
    pd.to_datetime, format='%Y-%m-%d %H:%M:%S'
)

purchases.head()

Unnamed: 0,Offer Name,Offer ID,Data Browsing Allowance,SMS Allowance,Voice On-Net Allowance,Voice Off-Net Allowance,Data Social Allowance,Datetime,Amount,MSISDN
0,Offer 27,O031,3035,979,81,181,1236,2024-04-13 11:05:30,604,9230715340104
1,Offer 42,O020,4819,676,218,40,1226,2023-09-05 11:29:10,356,9230636066934
2,Offer 30,O017,7907,565,80,195,2691,2023-03-11 13:05:07,248,9230570122595
3,Offer 12,O001,4495,496,104,216,1418,2023-05-17 00:56:48,737,9230996629579
4,Offer 34,O047,3851,237,157,188,2975,2024-11-14 13:06:23,766,9230403240894


In [5]:
# Tickets

# Both ticket timestamp columns share the same 'YYYY-MM-DD HH:MM:SS' layout,
# so convert them to datetimes with one loop.
for ts_col in ('Log Time', 'Resolution Time'):
    tickets[ts_col] = pd.to_datetime(tickets[ts_col], format='%Y-%m-%d %H:%M:%S')

tickets.head()

Unnamed: 0,Ticket ID,Log Time,Resolution Time,Category,Description,Resolutions,MSISDN
0,T54509,2024-07-19 10:30:02,2024-07-19 16:30:02,Network Issue,Issue reported under Network Issue category.,Resolved with detailed explanation for Network...,9230323952441
1,T66986,2023-11-29 05:45:02,2023-12-02 01:45:02,Network Issue,Issue reported under Network Issue category.,Resolved with detailed explanation for Network...,9230367979227
2,T73025,2023-02-28 21:52:29,2023-03-03 16:52:29,General Inquiry,Issue reported under General Inquiry category.,Resolved with detailed explanation for General...,9230277146142
3,T15861,2023-02-09 08:34:25,2023-02-10 13:34:25,Network Issue,Issue reported under Network Issue category.,Resolved with detailed explanation for Network...,9230488909528
4,T90855,2023-04-07 06:07:08,2023-04-09 22:07:08,Network Issue,Issue reported under Network Issue category.,Resolved with detailed explanation for Network...,9230224556190


In [6]:
# UserProfile

# Remove the 'Age' and 'Gender' columns from the profile table in a single call.
userprofile.drop(columns=['Age', 'Gender'], inplace=True)

userprofile.head()

Unnamed: 0,Name,City,User Type,MSISDN
0,User 1,Lahore,Prepaid,9230610000463
1,User 2,Quetta,Postpaid,9230347659110
2,User 3,Karachi,Prepaid,9230141002657
3,User 4,Karachi,Postpaid,9230162731400
4,User 5,Islamabad,Prepaid,9230108284824


# Data Integration & Contextualization
***

In [7]:
# Merge all tables onto the user profile, keyed by MSISDN.
# Left joins keep every profile row even when a table has no match for it.
# NOTE(review): each join keeps every matching row, so a user with several
# purchases, CDRs and tickets produces one output row per combination of them.
# Confirm this row multiplication is intended before using the frame downstream.
combined_data = (
    userprofile
    .merge(purchases, on='MSISDN', how='left')
    .merge(cdrs, on='MSISDN', how='left')
    .merge(tickets, on='MSISDN', how='left')
)

combined_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501606 entries, 0 to 501605
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Name                     501606 non-null  object        
 1   City                     501606 non-null  object        
 2   User Type                501606 non-null  object        
 3   MSISDN                   501606 non-null  int64         
 4   Offer Name               501606 non-null  object        
 5   Offer ID                 501606 non-null  object        
 6   Data Browsing Allowance  501606 non-null  int64         
 7   SMS Allowance            501606 non-null  int64         
 8   Voice On-Net Allowance   501606 non-null  int64         
 9   Voice Off-Net Allowance  501606 non-null  int64         
 10  Data Social Allowance    501606 non-null  int64         
 11  Datetime                 501606 non-null  datetime64[ns]
 12  Amount          

### Combining Text Segments for Contextualization:

In [8]:
def _as_text(column):
    """Return the named combined_data column converted to strings via astype(str)."""
    return combined_data[column].astype(str)


# NOTE(review): columns concatenated without astype(str) propagate missing values
# through '+', so a NaN in e.g. 'Category' would turn that row's whole summary
# into NaN. Confirm the left-joined columns are fully populated.

# User Context & Plan Summary
plan_sentence = "User in " + combined_data['City'] + " with a " + combined_data['User Type'] + " plan. "
offer_sentence = (
    "Currently subscribed to offer: " + combined_data['Offer Name']
    + " (Offer ID: " + _as_text('Offer ID') + "). "
)
user_context = plan_sentence + offer_sentence

# Resource Consumption
data_part = (
    "Usage details: Data browsing allowance of " + _as_text('Data Browsing Allowance') + "MB, "
    + "Social data allowance of " + _as_text('Data Social Allowance') + "MB, "
)
sms_part = _as_text('SMS Allowance') + " SMS, "
voice_part = (
    _as_text('Voice On-Net Allowance') + " on-net minutes, and "
    + _as_text('Voice Off-Net Allowance') + " off-net minutes. "
)
consumption = data_part + sms_part + voice_part

# Billing Information
billing_info = (
    "Recent transaction on " + _as_text('Datetime Charged')
    + " with amount charged: " + _as_text('Amount Charged')
    + " units. Resource type: " + combined_data['Resource Type']
    + " (Value: " + _as_text('Resource Value') + "). "
)

# Support & Ticket Summary
ticket_part = (
    "Customer support ticket (ID: " + _as_text('Ticket ID') + ") logged on "
    + _as_text('Log Time') + " under category: " + combined_data['Category'] + ". "
)
issue_part = "Issue description: " + combined_data['Description'] + ". "
resolution_part = (
    "Resolution provided on " + _as_text('Resolution Time') + ": " + combined_data['Resolutions'] + "."
)
support_summary = ticket_part + issue_part + resolution_part

# Combined Text Segments + Text Normalization:
# lower-case everything, then collapse each literal '..' into a single '.'
# (regex=False, so the dots are matched as plain characters).
raw_summary = user_context + consumption + billing_info + support_summary
combined_data['Data_Summary'] = raw_summary.str.lower().str.replace('..', '.', regex=False)

# Combined Data Summary: lift the display limits so the full text is shown.
# These options are global and stay in effect for later cells.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

combined_data[['Data_Summary']].head(5)


Unnamed: 0,Data_Summary
0,"user in lahore with a prepaid plan. currently subscribed to offer: offer 17 (offer id: o037). usage details: data browsing allowance of 4572mb, social data allowance of 836mb, 564 sms, 416 on-net minutes, and 168 off-net minutes. recent transaction on 2024-10-26 22:44:08 with amount charged: 10 units. resource type: voice call (value: 211). customer support ticket (id: t44316) logged on 2024-09-15 03:19:13 under category: billing. issue description: issue reported under billing category. resolution provided on 2024-09-16 22:19:13: resolved with detailed explanation for billing category."
1,"user in lahore with a prepaid plan. currently subscribed to offer: offer 17 (offer id: o037). usage details: data browsing allowance of 4572mb, social data allowance of 836mb, 564 sms, 416 on-net minutes, and 168 off-net minutes. recent transaction on 2024-10-26 22:44:08 with amount charged: 10 units. resource type: voice call (value: 211). customer support ticket (id: t74113) logged on 2024-09-30 11:34:24 under category: network issue. issue description: issue reported under network issue category. resolution provided on 2024-09-30 20:34:24: resolved with detailed explanation for network issue category."
2,"user in lahore with a prepaid plan. currently subscribed to offer: offer 17 (offer id: o037). usage details: data browsing allowance of 4572mb, social data allowance of 836mb, 564 sms, 416 on-net minutes, and 168 off-net minutes. recent transaction on 2024-10-26 22:44:08 with amount charged: 10 units. resource type: voice call (value: 211). customer support ticket (id: t38012) logged on 2024-08-20 23:08:37 under category: complaint. issue description: issue reported under complaint category. resolution provided on 2024-08-22 13:08:37: resolved with detailed explanation for complaint category."
3,"user in lahore with a prepaid plan. currently subscribed to offer: offer 17 (offer id: o037). usage details: data browsing allowance of 4572mb, social data allowance of 836mb, 564 sms, 416 on-net minutes, and 168 off-net minutes. recent transaction on 2024-10-26 22:44:08 with amount charged: 10 units. resource type: voice call (value: 211). customer support ticket (id: t41319) logged on 2024-01-14 17:10:45 under category: complaint. issue description: issue reported under complaint category. resolution provided on 2024-01-16 16:10:45: resolved with detailed explanation for complaint category."
4,"user in lahore with a prepaid plan. currently subscribed to offer: offer 17 (offer id: o037). usage details: data browsing allowance of 4572mb, social data allowance of 836mb, 564 sms, 416 on-net minutes, and 168 off-net minutes. recent transaction on 2024-10-26 22:44:08 with amount charged: 10 units. resource type: voice call (value: 211). customer support ticket (id: t70600) logged on 2023-06-13 19:21:32 under category: billing. issue description: issue reported under billing category. resolution provided on 2023-06-16 19:21:32: resolved with detailed explanation for billing category."


In [9]:
# Saving Combined Data to CSV
# index=False leaves the DataFrame's row index out of the written file.
combined_csv_path = '/Users/qas/Desktop/evamp&saanga/CSA/ES-CSA/data/combined/combined_data.csv'
combined_data.to_csv(combined_csv_path, index=False)

### Incorporating & Storing Metadata:

In [10]:
# Incorporating Metadata

import json

# Metadata key -> combined_data column it is read from. The dict order fixes
# the key order of every record written to the JSON file.
METADATA_FIELDS = {
    "city": 'City',
    "user_type": 'User Type',
    "offer": 'Offer Name',
    "data_allowance": 'Data Browsing Allowance',
    "sms_allowance": 'SMS Allowance',
    "voice_on_net": 'Voice On-Net Allowance',
    "voice_off_net": 'Voice Off-Net Allowance',
    "data_social_allowance": 'Data Social Allowance',
    "amount": 'Amount',
    "resource_type": 'Resource Type',
    "category": 'Category',
}

# Build one dict per row with a single vectorized conversion. The previous
# DataFrame.iterrows() comprehension created a Series for every row, which is
# slow on a frame of this size; the resulting list of records is the same.
metadata = pd.DataFrame(
    {key: combined_data[source] for key, source in METADATA_FIELDS.items()}
).to_dict(orient='records')

metadata_path = '/Users/qas/Desktop/evamp&saanga/CSA/ES-CSA/data/metadata/metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f)

print(f"Metadata successfully saved with {len(metadata)} entries.")

Metadata successfully saved with 501606 entries.
