<h3>>> Import data</h3>

In [62]:
import pandas as pd
# Load the raw churn data and discard the customerID column in one step.
df = pd.read_csv('Churn_prediction.csv').drop(columns=['customerID'])

In [63]:
# Preview the first rows, transposed so that each column reads as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No
OnlineBackup,Yes,No,Yes,No,No


In [64]:
# (rows, columns) of the frame after customerID was removed.
df.shape

(7043, 20)

In [65]:
# Inspect the column dtypes; TotalCharges is loaded as object rather than a numeric type.
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [66]:
# Parse TotalCharges as numbers: entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

<h3>>> Make column names and string values lower case and replace spaces with _</h3>

In [67]:
# Normalise the column names: lower case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'O']
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [68]:
# Encode the target as 1 (churn == 'yes') / 0 (anything else).
# The dtype guard keeps this cell idempotent: once churn is numeric, comparing
# it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [69]:
# Preview again to check the lower-cased names/values and the encoded churn column.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no
onlinebackup,yes,no,yes,no,no


<h3>>> Missing data</h3>
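<p>After converting <code>TotalCharges</code> to numeric and filling its unparseable entries with 0 (see above), no column has missing values left.</p>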

In [70]:
# Count the missing values in each column.
df.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

<h3>>> Duplicate values</h3>

In [71]:
# customerID has already been removed, so a "duplicate" here is a row whose
# remaining feature values and churn label all match an earlier row.
print('We have {0} duplicate observation'.format(df.duplicated().sum()))
# Keep the first occurrence of each repeated row and discard the others.
df = df.drop_duplicates()
print('Now we have {0} duplicate observation'.format(df.duplicated().sum()))

We have 22 duplicate observation
Now we have 0 duplicate observation


<h3>>> Split the dataset into a training set and a test set</h3>

In [72]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=666)

# Target vectors as arrays, taken row-for-row from the two frames.
y_train = df_train['churn'].values
y_test = df_test['churn'].values

print("df_train shape: {0}".format(df_train.shape))
print("df_test shape: {0}".format(df_test.shape))

# NOTE: churn is intentionally not removed from df_train / df_test here,
# because the EDA cells below read df_train.churn. Drop it before using the
# frames as model features.

df_train shape: (5616, 20)
df_test shape: (1405, 20)


<h3>>> EDA</h3>

<p>The classes are imbalanced: churned and non-churned customers are not equally represented. Accuracy is therefore not a good choice for model evaluation, so we use other evaluation metrics.</p>

In [73]:
# Share of each class in the training set (normalize=True returns fractions, not counts).
df_train['churn'].value_counts(normalize=True)

0    0.730947
1    0.269053
Name: churn, dtype: float64

In [74]:
# Overall churn rate of the training set: the mean of the 0/1 target.
# It is the baseline that the per-group rates are compared against.
global_mean = df_train['churn'].mean()
round(global_mean, 3)

0.269

<h3>>> Feature importance</h3>

In [75]:
# Columns treated as categorical features. seniorcitizen is stored as an
# integer but is handled as a category here.
categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone / internet subscriptions
    'phoneservice', 'multiplelines', 'internetservice',
    # internet add-ons
    'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

# Columns treated as numerical features.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [76]:
# Number of distinct values of each categorical column in the training set.
df_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

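<p>Which groups of customers have a higher risk of churn? For each category, <code>diff</code> is the group's churn rate minus the global churn rate and <code>risk</code> is the ratio of the two; a risk above 1 means the group churns more often than average.</p>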

In [77]:
# For every categorical feature, compare each category's churn rate with the
# overall training churn rate (global_mean).
for col in categorical:
    df_group = (
        df_train
        .groupby(by=col)
        .churn
        .agg(['mean'])
        .assign(
            # absolute gap to the global churn rate
            diff=lambda rates: rates['mean'] - global_mean,
            # churn rate relative to the global churn rate
            risk=lambda rates: rates['mean'] / global_mean,
        )
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.272467,0.003414,1.01269
male,0.265675,-0.003378,0.987445


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237953,-0.0311,0.884411
1,0.426566,0.157513,1.585436


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.332876,0.063824,1.237216
yes,0.200074,-0.068979,0.743624


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.318517,0.049464,1.183845
yes,0.153067,-0.115985,0.568912


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.239209,-0.029844,0.889077
yes,0.272332,0.003279,1.012188


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.252495,-0.016557,0.938461
no_phone_service,0.239209,-0.029844,0.889077
yes,0.295117,0.026064,1.096873


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191688,-0.077364,0.712456
fiber_optic,0.421982,0.152929,1.568397
no,0.075125,-0.193927,0.279221


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.422691,0.153639,1.571036
no_internet_service,0.075125,-0.193927,0.279221
yes,0.147783,-0.121269,0.549272


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.405218,0.136165,1.506092
no_internet_service,0.075125,-0.193927,0.279221
yes,0.217303,-0.05175,0.807659


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.392554,0.123501,1.459021
no_internet_service,0.075125,-0.193927,0.279221
yes,0.231638,-0.037414,0.860941


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416697,0.147644,1.548754
no_internet_service,0.075125,-0.193927,0.279221
yes,0.158574,-0.110479,0.589379


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.344329,0.075276,1.279783
no_internet_service,0.075125,-0.193927,0.279221
yes,0.298866,0.029814,1.110809


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.3397,0.070647,1.262577
no_internet_service,0.075125,-0.193927,0.279221
yes,0.30374,0.034688,1.128925


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.430839,0.161786,1.601318
one_year,0.119211,-0.149842,0.443077
two_year,0.030814,-0.238238,0.114529


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.166593,-0.102459,0.619185
yes,0.338821,0.069768,1.25931


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.173948,-0.095104,0.646521
credit_card_(automatic),0.158367,-0.110685,0.588611
electronic_check,0.4584,0.189347,1.703754
mailed_check,0.186909,-0.082144,0.694691


<h3>Mutual information</h3>
<p>Mutual information (MI) is a concept from information theory: it tells us how much we can learn about one variable if we know the value of another.</p>
<a href='https://en.wikipedia.org/wiki/Mutual_information'>https://en.wikipedia.org/wiki/Mutual_information</a>

In [78]:
from sklearn.metrics import mutual_info_score

In [79]:
def calculate_mi(series):
    """Return the mutual information score between ``series`` and the training churn labels.

    Reads the target from the notebook-level ``df_train`` frame, so ``series``
    is expected to be a column of that same frame.
    """
    target = df_train.churn
    return mutual_info_score(series, target)

# Score every categorical feature against churn and rank the scores from
# highest to lowest.
df_mi = (
    df_train[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='more_important')
)

df_mi

Unnamed: 0,more_important
contract,0.09734
onlinesecurity,0.065429
techsupport,0.061188
internetservice,0.055655
onlinebackup,0.0474
paymentmethod,0.044806
deviceprotection,0.043037
streamingtv,0.032241
streamingmovies,0.031892
paperlessbilling,0.018954


<h3>Correlation between numerical variables and the dependent variable (churn)</h3>

In [85]:
# Pearson correlation matrix of churn and the numerical features
# (numerical = tenure, monthlycharges, totalcharges).
correlation_columns = ['churn'] + numerical
df_train[correlation_columns].corr(method='pearson')

Unnamed: 0,churn,tenure,monthlycharges,totalcharges
churn,1.0,-0.349202,0.195308,-0.196418
tenure,-0.349202,1.0,0.24664,0.82862
monthlycharges,0.195308,0.24664,1.0,0.647869
totalcharges,-0.196418,0.82862,0.647869,1.0


<h3>One Hot Encoding</h3>

In [86]:
# One-hot encode the four features that rank highest in the mutual-information table.
# NOTE(review): the encoded frame is only displayed here, not assigned to a
# variable, so df_train itself is left unchanged by this cell.
pd.get_dummies(
    data=df_train,
    columns=['contract', 'onlinesecurity', 'techsupport', 'internetservice'],
)

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,onlinebackup,deviceprotection,streamingtv,...,contract_two_year,onlinesecurity_no,onlinesecurity_no_internet_service,onlinesecurity_yes,techsupport_no,techsupport_no_internet_service,techsupport_yes,internetservice_dsl,internetservice_fiber_optic,internetservice_no
3448,male,0,yes,yes,4,yes,no,no,no,no,...,0,1,0,0,0,0,1,1,0,0
3697,female,0,yes,yes,32,yes,yes,no,no,yes,...,0,0,0,1,1,0,0,0,1,0
2747,male,0,no,no,7,yes,no,no_internet_service,no_internet_service,no_internet_service,...,0,0,1,0,0,1,0,0,0,1
4420,male,0,no,no,27,yes,no,no,yes,yes,...,0,0,0,1,0,0,1,1,0,0
1022,female,1,no,no,5,yes,no,yes,no,yes,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,male,0,no,no,1,yes,no,no,no,no,...,0,1,0,0,0,0,1,1,0,0
2884,female,1,yes,yes,72,yes,yes,yes,yes,yes,...,1,0,0,1,0,0,1,1,0,0
1955,female,0,yes,yes,49,yes,yes,no,no,yes,...,0,1,0,0,1,0,0,0,1,0
1926,male,0,yes,yes,49,yes,yes,no,yes,yes,...,0,1,0,0,1,0,0,0,1,0
