In [4]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine
os.chdir('..')
from Db_connection.connection import PostgresConnection
from src.utils import *
from sklearn.cluster import KMeans
import seaborn as sns



In [2]:
db = PostgresConnection(dbname='telecom', user='postgres', password='postgres')
db.connect()

# Query the table to verify the write
query = "SELECT * FROM xdr_data_cleaned"
result = db.execute_query(query)

# Convert result to a DataFrame and display the information
df_cleaned = pd.DataFrame(result, columns=[desc[0] for desc in db.cursor.description])
print(df_cleaned.head(5))

# Close the connection
db.close_connection()

Connected to PostgreSQL database!
      Bearer Id            Start  Start ms              End  End ms  \
0  1.311448e+19   4/4/2019 12:01     770.0  4/25/2019 14:35   662.0   
1  1.311448e+19   4/9/2019 13:04     235.0   4/25/2019 8:15   606.0   
2  1.311448e+19   4/9/2019 17:42       1.0  4/25/2019 11:58   652.0   
3  1.311448e+19   4/10/2019 0:31     486.0   4/25/2019 7:36   171.0   
4  1.311448e+19  4/12/2019 20:10     565.0  4/25/2019 10:40   954.0   

   Dur. (ms)          IMSI  MSISDN/Number          IMEI  \
0  1823652.0  2.082014e+14   3.366496e+10  3.552121e+13   
1  1365104.0  2.082019e+14   3.368185e+10  3.579401e+13   
2  1361762.0  2.082003e+14   3.376063e+10  3.528151e+13   
3  1321509.0  2.082014e+14   3.375034e+10  3.535661e+13   
4  1089009.0  2.082014e+14   3.369980e+10  3.540701e+13   

      Last Location Name  ...  Youtube DL (Bytes)  Youtube UL (Bytes)  \
0  9.16456699548519E+015  ...          15854611.0           2501332.0   
1                L77566A  ...         

## user experiance aggregate by metrics

In [5]:
# Handle outliers
for col in ['TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)', 'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)']:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_cleaned[col] = np.where(df_cleaned[col] < lower_bound, df_cleaned[col].mean(), df_cleaned[col])
    df_cleaned[col] = np.where(df_cleaned[col] > upper_bound, df_cleaned[col].mean(), df_cleaned[col])

# Aggregate per customer
agg_df = df_cleaned.groupby('IMSI').agg({
    'TCP DL Retrans. Vol (Bytes)': 'mean',
    'TCP UL Retrans. Vol (Bytes)': 'mean',
    'Avg RTT DL (ms)': 'mean',
    'Avg RTT UL (ms)': 'mean',
    'Avg Bearer TP DL (kbps)': 'mean',
    'Avg Bearer TP UL (kbps)': 'mean',
    'Handset Type': lambda x: x.mode()[0]
}).reset_index()

# Calculate total TCP retransmission, RTT, and throughput
agg_df['total_tcp'] = (agg_df['TCP DL Retrans. Vol (Bytes)'] + agg_df['TCP UL Retrans. Vol (Bytes)'])
agg_df['total_rtt'] = (agg_df['Avg RTT DL (ms)'] + agg_df['Avg RTT UL (ms)'])
agg_df['total_throughput'] = (agg_df['Avg Bearer TP DL (kbps)'] + agg_df['Avg Bearer TP UL (kbps)'])

# Drop the intermediary columns
agg_df = agg_df.drop(columns=[
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)',
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)'
])

## cluster discription

In [7]:
features = agg_df[['total_tcp', 'total_rtt', 'total_throughput']]

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(features)
agg_df['cluster'] = kmeans.labels_

numeric_cols = agg_df.select_dtypes(include='number').columns


# Describe each cluster with only numeric columns
cluster_description = agg_df.groupby('cluster')[numeric_cols].mean()
print("\nCluster Descriptions:")
print(cluster_description)



Cluster Descriptions:
                 IMSI     total_tcp  total_rtt  total_throughput  cluster
cluster                                                                  
0        2.082015e+14  2.169949e+07  74.668510       1849.479042      0.0
1        2.082016e+14  1.214642e+06  85.340488      17414.798193      1.0
2        2.082016e+14  1.146207e+07  76.231245      12724.769845      2.0


## Calculate Engagement and Experiance Score

In [8]:
from scipy.spatial.distance import euclidean

# Assume 'cluster_description' contains the centroids of each cluster
# Extract centroids
engagement_centroid = cluster_description.loc[0, ['total_tcp', 'total_rtt', 'total_throughput']]
experience_centroid = cluster_description.loc[2, ['total_tcp', 'total_rtt', 'total_throughput']]

# Calculate Engagement and Experience Scores
def calculate_scores(row, centroid):
    return euclidean(row[['total_tcp', 'total_rtt', 'total_throughput']], centroid)

agg_df['engagement_score'] = agg_df.apply(lambda row: calculate_scores(row, engagement_centroid), axis=1)
agg_df['experience_score'] = agg_df.apply(lambda row: calculate_scores(row, experience_centroid), axis=1)

# Calculate Satisfaction Score
agg_df['satisfaction_score'] = (agg_df['engagement_score'] + agg_df['experience_score']) / 2

# Top 10 Satisfied Customers
top_10_satisfied = agg_df.nlargest(10, 'satisfaction_score')
print(top_10_satisfied[['IMSI', 'engagement_score', 'experience_score', 'satisfaction_score']])


               IMSI  engagement_score  experience_score  satisfaction_score
58701  2.082017e+14      2.890536e+07      3.914278e+07        3.402407e+07
7045   2.082003e+14      2.861850e+07      3.885593e+07        3.373721e+07
81438  2.082018e+14      2.855992e+07      3.879734e+07        3.367863e+07
80988  2.082018e+14      2.847057e+07      3.870799e+07        3.358928e+07
99920  2.082021e+14      2.843652e+07      3.867395e+07        3.355524e+07
80950  2.082018e+14      2.843395e+07      3.867136e+07        3.355265e+07
46375  2.082015e+14      2.834193e+07      3.857936e+07        3.346065e+07
636    2.082003e+14      2.814339e+07      3.838079e+07        3.326209e+07
71054  2.082017e+14      2.813831e+07      3.837572e+07        3.325701e+07
33747  2.082014e+14      2.806153e+07      3.829895e+07        3.318024e+07


## K-means on Engagement and Experience Scores

In [10]:
from sklearn.cluster import KMeans

# K-means on Engagement and Experience Scores
kmeans_2 = KMeans(n_clusters=2, random_state=0).fit(agg_df[['engagement_score', 'experience_score']])
agg_df['engagement_experience_cluster'] = kmeans_2.labels_

# Aggregate average satisfaction & experience score per cluster
cluster_stats = agg_df.groupby('engagement_experience_cluster').agg({
    'satisfaction_score': 'mean',
    'experience_score': 'mean'
})

print(cluster_stats)


                               satisfaction_score  experience_score
engagement_experience_cluster                                      
0                                    5.228588e+06      8.459855e+06
1                                    1.491122e+07      9.970618e+06


## Build a regression model

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Prepare data
X = agg_df[['engagement_score', 'experience_score']]
y = agg_df['satisfaction_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Save model
import joblib
joblib.dump(model, 'satisfaction_model.pkl')


Mean Squared Error: 2.354605655269794e-16


['satisfaction_model.pkl']