In [1]:
import warnings 
import pandas as pd
import numpy as np
import sys
sys.path.append("..")
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")

In [2]:
from scripts.data_read import read_data
from scripts.normalization import normalize_data

In [8]:
# Load the experience dataset through the project's reader helper,
# then preview the first five rows.
experience_csv = "../data/experience_data.csv"
df = read_data(experience_csv)
df.head(5)

Unnamed: 0,MSISDN/Number,Handset Type,Average TCP retransmission,Average RTT,Average throughput
0,33664962239,Samsung Galaxy A5 Sm-A520F,21569.572935,47.0,67.0
1,33681854413,Samsung Galaxy J5 (Sm-J530),21569.572935,70.0,42.0
2,33760627129,Samsung Galaxy A8 (2018),21569.572935,127.458589,15.0
3,33750343200,undefined,21569.572935,127.458589,88.0
4,33699795932,Samsung Sm-G390F,21569.572935,127.458589,15.0


In [11]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

MSISDN/Number                 0
Handset Type                  0
Average TCP retransmission    0
Average RTT                   0
Average throughput            0
dtype: int64

In [12]:
# Print column dtypes, non-null counts and the memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 5 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   MSISDN/Number               150001 non-null  int64  
 1   Handset Type                150001 non-null  object 
 2   Average TCP retransmission  150001 non-null  float64
 3   Average RTT                 150001 non-null  float64
 4   Average throughput          150001 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 5.7+ MB


In [13]:
# Keep only the columns from position 2 onward (the three experience metrics
# shown in the preview above; the identifier and handset columns are left out).
# .copy() makes df_used an independent frame, so the cluster-label column that
# is written into it later does not target a slice of `df`.
df_used = df.iloc[:, 2:].copy()

###### normalizing data 

In [18]:
# Scale the metric columns with the project's helper.
# A later cell adds a 'clusters' column to df_used; dropping it here (when
# present) keeps this cell idempotent, so re-running it never feeds the
# cluster labels into the scaler. On a first run nothing is dropped.
# NOTE(review): the scaled values displayed further down lie between 0 and 1,
# which looks like min-max scaling rather than z-score standardization —
# confirm against scripts.normalization.
feature_frame = df_used.drop(columns=['clusters'], errors='ignore')
standarlized_data = normalize_data.standardizer(feature_frame)

###### clustering 

In [19]:
# Configure a 3-cluster k-means model with a fixed seed, then fit it on the
# scaled metrics. fit() returns the estimator itself, so `kmeans` ends up
# bound to the fitted model.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans = kmeans.fit(standarlized_data)

###### add clusters to dataframe 

In [20]:
# Attach the k-means label of every row as a 'clusters' column.
# assign() builds a new frame and rebinds df_used, instead of writing a column
# into what may still be a slice of `df` (the source of the
# SettingWithCopyWarning that the global warnings filter hides).
df_used = df_used.assign(clusters=kmeans.labels_)

In [21]:
# Show the metric columns together with the k-means label assigned to each row.
df_used

Unnamed: 0,Average TCP retransmission,Average RTT,Average throughput,clusters
0,21569.572935,47.000000,67.000000,2
1,21569.572935,70.000000,42.000000,2
2,21569.572935,127.458589,15.000000,0
3,21569.572935,127.458589,88.000000,0
4,21569.572935,127.458589,15.000000,0
...,...,...,...,...
149996,21569.572935,32.000000,117.000000,2
149997,21569.572935,29.000000,77.000000,2
149998,21569.572935,49.000000,90.000000,2
149999,21569.572935,42.000000,71.000000,2


###### aggregate per cluster to see less engaged cluster 

In [22]:
# Per-cluster mean of each experience metric, indexed by cluster label.
metric_columns = ['Average TCP retransmission', 'Average RTT', 'Average throughput']
df_agg_clusters = df_used.groupby('clusters')[metric_columns].mean()

In [23]:
# Show the per-cluster mean of each metric.
df_agg_clusters

Unnamed: 0_level_0,Average TCP retransmission,Average RTT,Average throughput
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,24744.482281,131.41772,2522.716211
1,25846.765661,80.810383,39972.360512
2,18527.42689,45.981731,2290.060516


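Cluster 1 is taken as the less engaged cluster: it has the highest average TCP retransmission and the highest average throughput, while its average RTT is lower than cluster 0's (though higher than cluster 2's).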

In [24]:
#slice cluster 1

In [25]:
# Keep only the rows whose k-means label is 1.
in_cluster_one = df_used['clusters'].eq(1)
df_cluster_less_engaged = df_used.loc[in_cluster_one]

In [26]:
# Show the rows that were assigned to cluster 1.
df_cluster_less_engaged

Unnamed: 0,Average TCP retransmission,Average RTT,Average throughput,clusters
11,6034.635000,221.0,34805.0,1
20,9865.591000,97.0,40058.0,1
30,13903.897665,76.0,33242.0,1
43,236.155000,72.0,26416.0,1
46,1166.993000,120.0,35276.0,1
...,...,...,...,...
149974,21569.572935,41.0,24023.0,1
149975,630.334000,82.0,33958.0,1
149979,3031.227000,76.0,59282.0,1
149980,8465.599000,90.0,48120.0,1


###### less engaged cluster on normalized data 

In [27]:
# Wrap the scaled values in a DataFrame that reuses the metric column names.
# The label column is excluded by name rather than by position, so this does
# not depend on 'clusters' happening to be the last column of df_used.
feature_names = [col for col in df_used.columns if col != 'clusters']
df_norm = pd.DataFrame(standarlized_data, columns=feature_names)

In [29]:
# Add the k-means labels to the scaled frame as a 'clusters' column.
df_norm = df_norm.assign(clusters=kmeans.labels_)

In [30]:
# Show the scaled metrics alongside the cluster labels.
df_norm

Unnamed: 0,Average TCP retransmission,Average RTT,Average throughput,clusters
0,0.004965,0.175373,0.000925,2
1,0.004965,0.261194,0.000580,2
2,0.004965,0.475592,0.000207,0
3,0.004965,0.475592,0.001214,0
4,0.004965,0.475592,0.000207,0
...,...,...,...,...
149996,0.004965,0.119403,0.001615,2
149997,0.004965,0.108209,0.001063,2
149998,0.004965,0.182836,0.001242,2
149999,0.004965,0.156716,0.000980,2


In [None]:
# Rows of the scaled frame that belong to cluster 1 — the same filter applied
# to the raw metrics in df_cluster_less_engaged. The original cell was an
# unfinished assignment (a SyntaxError); this completes it.
df_norm_less_engaged_clusters = df_norm[df_norm['clusters'] == 1]
df_norm_less_engaged_clusters.head()