# TASK 1: User Overview analysis

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Put the sibling ../scripts directory on the import path so the project
# helper modules below can be imported from this notebook.
import sys
sys.path.append('../scripts')
from Clean_data import clean_data
from Extract_data import extract_data

In [3]:
# Read the raw challenge CSV into `df`, then print its column / dtype / non-null summary
df = pd.read_csv(filepath_or_buffer="../data/Week1_challenge_data_source(CSV).csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

## Sub-tasks

>## Identifying the top 10 handsets used by the customers <br/>

The customers' handset types are stored in the column named `Handset Type`. Before extracting information from this column, we should identify each unique user/customer. A customer can be identified by his/her IMSI, MSISDN/Number or IMEI.

In [None]:
# Find which identifier column has the fewest missing values.
# NOTE(review): `clean_data` and its `missing_values` method come from
# ../scripts/Clean_data and are not visible here — presumably the first of the
# three returned items is a per-column missing-value summary; confirm there.
IdVariable = clean_data(df[['IMSI','MSISDN/Number', 'IMEI','Handset Type']])
missingCol,_,_ = IdVariable.missing_values(verbose=False)
missingCol

Thus, one can choose `IMSI` to identify each customer, because this field has fewer missing values than `IMEI` or `MSISDN/Number`. Nevertheless, one can also choose the `IMEI`. Since we are looking for the handset type, we can focus on the **IMEI**, because it is a unique number that identifies a device on a mobile network. Furthermore, without the IMEI we cannot identify the handset type, which is why those two columns have the same number of missing values. Even though `IMSI` has fewer missing values, this characteristic does not ensure that we get the maximum amount of information about the handset type.

In [None]:
# Build the de-duplicated device table: keep the three device columns, discard
# rows where all three are missing, then collapse identical rows.
UniqueUser = (
    df[['IMEI','Handset Manufacturer','Handset Type']]
    .dropna(how="all")
    .drop_duplicates()
)
# Ten most frequent handset types among the de-duplicated rows
UniqueUser['Handset Type'].value_counts().head(10)

In [None]:
# Same top-10 ranking, leaving out rows whose handset type is the literal 'undefined'
UniqueUser.loc[UniqueUser['Handset Type'] != 'undefined', 'Handset Type'].value_counts().head(10)

>## Identify the top three handset manufacturers

In [None]:
def topNManufacturer(df,topn=3):
    """Return the `topn` most frequent values of the 'Handset Manufacturer' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Handset Manufacturer' column.
    topn : int, default 3
        Number of leading entries to keep.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by manufacturer, most frequent first,
        truncated to the first `topn` entries.
    """
    manufacturer_counts = df['Handset Manufacturer'].value_counts()
    return manufacturer_counts.iloc[:topn]

In [None]:
# Identify the top 3 manufacturers among the de-duplicated rows of UniqueUser
topNManufacturer(UniqueUser,topn=3)

>## Identify the top 5 handsets for each of the top 3 handset manufacturers

In [None]:
# Function to extract the top n type of handset for the top m manufacturer
def topTypeManufact(df=None,nmanufact=3,ntype=5):
    """Return the most frequent handset types for the most frequent manufacturers.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with 'Handset Manufacturer' and 'Handset Type' columns. When
        omitted, the notebook-level `UniqueUser` frame is looked up at call
        time (so re-running the cell that builds it is picked up).
    nmanufact : int, default 3
        Number of top manufacturers to report.
    ntype : int, default 5
        Maximum number of handset types to report per manufacturer.

    Returns
    -------
    pandas.DataFrame
        Columns 'Manufacturer', 'Type', 'Count' with a fresh 0..n-1 index.
        A manufacturer with fewer than `ntype` distinct handset types
        contributes only the rows it actually has.
    """
    if df is None:
        df = UniqueUser

    columns = ['Manufacturer','Type','Count']
    top_manufacturers = df['Handset Manufacturer'].value_counts().iloc[:nmanufact].index

    frames = []
    for manufacturer in top_manufacturers:
        # Filter on the frame that was passed in, not on the global UniqueUser.
        same_maker = df['Handset Manufacturer'] == manufacturer
        type_counts = df.loc[same_maker, 'Handset Type'].value_counts().iloc[:ntype]
        # Size the manufacturer column from the actual result: a manufacturer
        # may have fewer than `ntype` handset types.
        frames.append(pd.DataFrame({
            'Manufacturer': [manufacturer] * len(type_counts),
            'Type': type_counts.index,
            'Count': type_counts.to_list(),
        }))

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Identify the top 5 handset types for each of the top 3 handset manufacturers
topTypeManufact(df=UniqueUser,nmanufact=3,ntype=5)

>## Task 1.1: Get an overview of the users’ behavior on those applications

In [None]:
# Number of non-null 'Bearer Id' values recorded for each IMSI
df.groupby("IMSI")[['Bearer Id']].count()

In [None]:
# Sum of 'Dur. (ms)' over all rows of each IMSI
df.groupby("IMSI")[['Dur. (ms)']].sum()

In [None]:
# Per-IMSI sums of the total upload and total download byte columns
df.groupby("IMSI")[['Total UL (Bytes)','Total DL (Bytes)']].sum()

In [None]:
# Per-IMSI total traffic: sum UL and DL per user, then add the two columns row-wise
test1 = (
    df.groupby("IMSI")[['Total UL (Bytes)','Total DL (Bytes)']]
    .sum()
    .sum(axis=1)
    .rename("Total")
)

In [None]:
# Per-IMSI Google traffic: sum UL and DL per user, then add the two columns row-wise
test2 = (
    df.groupby("IMSI")[['Google UL (Bytes)','Google DL (Bytes)']]
    .sum()
    .sum(axis=1)
    .rename("Google")
)

In [None]:
# Join the two per-IMSI series on their shared IMSI index (default inner join).
# validate="one_to_one" makes pandas raise if an IMSI key is duplicated on either side.
merge_test =pd.merge(test1,test2,left_index=True,right_index=True,validate="one_to_one")
merge_test

In [4]:
# Select the user key (IMSI) together with the per-application DL/UL byte
# columns and the overall Total UL/DL columns; this subset feeds extract_data below.
dfData = df.loc[:,['IMSI','Social Media DL (Bytes)','Social Media UL (Bytes)',
                                  'Google DL (Bytes)', 'Google UL (Bytes)', 'Email DL (Bytes)',
                                  'Email UL (Bytes)', 'Youtube DL (Bytes)', 'Youtube UL (Bytes)',
                                  'Netflix DL (Bytes)', 'Netflix UL (Bytes)', 'Gaming DL (Bytes)',
                                  'Gaming UL (Bytes)', 'Other DL (Bytes)', 'Other UL (Bytes)',
                                  'Total UL (Bytes)', 'Total DL (Bytes)']]

In [5]:
# Wrap the selected byte columns in the project's extract_data helper.
# NOTE(review): extract_data is defined in ../scripts/Extract_data; what its
# constructor does with the frame is not visible here.
DataByte = extract_data(dfData)

In [6]:
# Aggregate the byte columns per IMSI through the project helper.
# NOTE(review): the captured output below has one column per application but no
# Netflix column, although Netflix DL/UL were selected into dfData, and it also
# prints a debug list of the Social Media columns — verify merge_data.
DataByte.merge_data('IMSI')

['Social Media DL (Bytes)', 'Social Media UL (Bytes)', 'IMSI']


Unnamed: 0_level_0,Social Media,Google,Email,Youtube,Gaming,Other
IMSI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2.040471e+14,1546088.0,9218647.0,3330974.0,40940710.0,43070260.0,4.013605e+08
2.040808e+14,715224.0,10438660.0,1520771.0,11959905.0,178048738.0,4.705265e+08
2.082001e+14,3122781.0,1624662.0,3657501.0,19882439.0,9124516.0,5.159737e+08
2.082001e+14,2577104.0,13811478.0,3855734.0,25657632.0,344105897.0,3.096107e+08
2.082001e+14,3426716.0,11665887.0,5564462.0,13840201.0,917684766.0,1.079435e+09
...,...,...,...,...,...,...
2.082099e+14,1256603.0,12047606.0,956312.0,11621474.0,678493990.0,5.883943e+08
2.082099e+14,3465755.0,4287813.0,2284566.0,34857460.0,411382679.0,7.526239e+08
2.082099e+14,2609666.0,4427934.0,1717645.0,29190696.0,272046636.0,4.567354e+08
2.082522e+14,667081.0,2227505.0,3054624.0,24909498.0,788569459.0,5.944805e+08


In [None]:
# Per-IMSI sum of the numeric columns held by DataByte.df.
# The original passed a bare positional `1`; for a groupby `sum` the first
# positional parameter is `numeric_only`, not `axis`, so the call is spelled out
# here with the same meaning.
# NOTE(review): if a row-wise total per IMSI was intended instead, use
# `.sum().sum(axis=1)` as done for `test1` / `test2`.
DataByte.df.groupby("IMSI").sum(numeric_only=True)

In [None]:
# Names of the Social Media traffic columns, followed by the IMSI grouping key
SocialMediaCol = [*(name for name in dfData.columns if 'Social Media' in name), "IMSI"]
print(SocialMediaCol)

In [None]:
# Per-IMSI sums of the Social Media DL/UL byte columns
dfData.loc[:, SocialMediaCol].groupby("IMSI").sum()

In [None]:
# Names of the columns at positions 39 through 54 of the raw frame
df.columns[39:55].to_list()

In [None]:
# Preview the rows that contain no missing value in any column.
# The result is only displayed; `df` itself is not modified.
df.dropna()

In [None]:
# FIXME: this scratch cell called `np.sum()` without its required array argument,
# which raises TypeError and stops "Restart Kernel -> Run All".
# Supply the data to aggregate (for example `np.sum(test1)`) or delete the cell.