In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Read data/train.csv, using the file's first column as the row index.
train_full = pd.read_csv('data/train.csv', index_col=0)
# Preview the first two rows to sanity-check the parsed columns.
train_full.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2499,6061-GWWAV,Male,0,No,Yes,41,Yes,No,DSL,Yes,...,Yes,No,Yes,No,One year,No,Mailed check,70.2,2894.55,0
5807,8464-EETCQ,Male,0,No,No,57,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),18.8,1094.35,0


In [6]:
# Class balance of the target: share of rows per Churn value.
train_full.loc[:, 'Churn'].value_counts(normalize=True)

0    0.734647
1    0.265353
Name: Churn, dtype: float64

In [8]:
# Churn rate = number of churned customers / total number of customers,
# i.e. the normalized frequency of the label Churn == 1.
print('Global Churn Rate: ', train_full.loc[:, 'Churn'].value_counts(normalize=True).at[1])

Global Churn Rate:  0.2653532126375577


In [15]:
# Equivalent shortcut: Churn is coded 0/1, so its mean is the churn rate.
train_full.loc[:, 'Churn'].mean()

0.2653532126375577

## Dtypes

In [9]:
# Per-column dtypes of the loaded frame; used below to split the features
# into numerical and categorical groups.
train_full.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [12]:
# Continuous features.
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Discrete features. SeniorCitizen is stored as an int (0/1) but is treated
# as a category here.
categorical = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [13]:
# Number of distinct levels in each categorical feature.
train_full.loc[:, categorical].nunique()

gender              2
SeniorCitizen       2
Partner             2
Dependents          2
PhoneService        2
MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
PaperlessBilling    2
PaymentMethod       4
dtype: int64

# Churn rate Risk Ratio

In [72]:
# Baseline churn rate over all rows; group-level rates are compared to it.
global_churnrate = train_full.loc[:, 'Churn'].mean()
global_churnrate

0.2653532126375577

Risk = group_churnrate/global_churnrate

Risk > 1 => the group is more likely to churn than the average customer; Risk < 1 => the group is less likely to churn

In [73]:
def churn_risk(cat_var, df=None, target='Churn'):
    """Summarise the churn rate of each level of a categorical feature.

    The baseline (global) churn rate is computed from the same frame that is
    grouped, so the result cannot go stale if a module-level rate variable
    was computed in another cell or from different data.

    Parameters
    ----------
    cat_var : str
        Name of the column to group by.
    df : pandas.DataFrame, optional
        Frame holding ``cat_var`` and ``target``. Defaults to the
        notebook-level ``train_full`` frame.
    target : str, default 'Churn'
        Name of the 0/1 target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the levels of ``cat_var`` with columns:
        ``mean``  - churn rate within the group,
        ``count`` - number of rows in the group,
        ``dif``   - group rate minus the global rate,
        ``risk``  - group rate divided by the global rate
        (> 1 means the group churns more than average).
    """
    if df is None:
        # Fall back to the notebook's training frame (original behaviour).
        df = train_full

    # Baseline is derived from the frame being analysed, not from a global.
    baseline = df[target].mean()

    df_group = df.groupby(cat_var)[target].agg(['mean', 'count'])
    df_group['dif'] = df_group['mean'] - baseline
    df_group['risk'] = df_group['mean'] / baseline
    return df_group

In [74]:
from IPython.display import display

# Render one risk table per categorical feature, in list order.
for feature in categorical:
    risk_table = churn_risk(feature)
    display(risk_table)

Unnamed: 0_level_0,mean,count,dif,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0.271561,2806,0.006208,1.023394
Male,0.259194,2828,-0.006159,0.976788


Unnamed: 0_level_0,mean,count,dif,risk
SeniorCitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.235657,4706,-0.029697,0.888087
1,0.415948,928,0.150595,1.567527


Unnamed: 0_level_0,mean,count,dif,risk
Partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.32821,2928,0.062857,1.236881
Yes,0.197339,2706,-0.068014,0.743685


Unnamed: 0_level_0,mean,count,dif,risk
Dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.311508,3971,0.046155,1.173939
Yes,0.155141,1663,-0.110212,0.58466


Unnamed: 0_level_0,mean,count,dif,risk
PhoneService,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.247706,545,-0.017647,0.933497
Yes,0.267243,5089,0.00189,1.007122


Unnamed: 0_level_0,mean,count,dif,risk
MultipleLines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.253933,2733,-0.01142,0.956964
No phone service,0.247706,545,-0.017647,0.933497
Yes,0.282683,2356,0.017329,1.065307


Unnamed: 0_level_0,mean,count,dif,risk
InternetService,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DSL,0.191034,1963,-0.074319,0.719924
Fiber optic,0.420772,2436,0.155419,1.585704
No,0.076923,1235,-0.18843,0.289889


Unnamed: 0_level_0,mean,count,dif,risk
OnlineSecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.419366,2778,0.154013,1.580408
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.144972,1621,-0.120381,0.546337


Unnamed: 0_level_0,mean,count,dif,risk
OnlineBackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.400811,2465,0.135458,1.510482
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.21303,1934,-0.052323,0.802817


Unnamed: 0_level_0,mean,count,dif,risk
DeviceProtection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.390587,2486,0.125234,1.471952
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.224255,1913,-0.041098,0.845119


Unnamed: 0_level_0,mean,count,dif,risk
TechSupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.413622,2819,0.148269,1.55876
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.148101,1580,-0.117252,0.558129


Unnamed: 0_level_0,mean,count,dif,risk
StreamingTV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.334365,2261,0.069012,1.260076
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.301216,2138,0.035863,1.135151


Unnamed: 0_level_0,mean,count,dif,risk
StreamingMovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.337376,2226,0.072023,1.271424
No internet service,0.076923,1235,-0.18843,0.289889
Yes,0.298665,2173,0.033312,1.125539


Unnamed: 0_level_0,mean,count,dif,risk
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Month-to-month,0.425784,3126,0.160431,1.604592
One year,0.108155,1165,-0.157199,0.407587
Two year,0.028295,1343,-0.237058,0.106631


Unnamed: 0_level_0,mean,count,dif,risk
PaperlessBilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.161234,2301,-0.104119,0.607621
Yes,0.337234,3333,0.071881,1.270886


Unnamed: 0_level_0,mean,count,dif,risk
PaymentMethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bank transfer (automatic),0.168309,1218,-0.097045,0.634282
Credit card (automatic),0.157512,1238,-0.107841,0.593594
Electronic check,0.450479,1878,0.185126,1.697659
Mailed check,0.191538,1300,-0.073815,0.721825


## Mutual Information to Measure Feature Importance of Categorical Features

Mutual information tells us how much we can learn about one variable if we know the value of another

In [75]:
from sklearn.metrics import mutual_info_score

In [91]:
# Example: how much do we learn about churn by observing the contract type
# (and vice versa)? The score is symmetric, so the argument order doesn't matter.
mutual_info = {
    feature: mutual_info_score(train_full['Churn'], train_full[feature])
    for feature in categorical
}

# One row per feature, strongest dependence on churn first.
(
    pd.DataFrame
    .from_dict(mutual_info, orient='index', columns=['MI'])
    .sort_values('MI', ascending=False)
)

Unnamed: 0,MI
Contract,0.098582
OnlineSecurity,0.064718
TechSupport,0.062203
InternetService,0.054957
OnlineBackup,0.046739
DeviceProtection,0.043216
PaymentMethod,0.04279
StreamingMovies,0.031387
StreamingTV,0.031207
PaperlessBilling,0.020049


 ## Correlation to Measure Feature Importance of Numerical Features

Correlation coefficient: $-1 \le r \le 1$

As we see below,
- As tenure (the length of time people stay) and TotalCharges increase, the churn rate goes down
- As MonthlyCharges increases, the churn rate goes up

In [93]:
# Pearson correlation of each numerical feature with the 0/1 churn label.
train_full.loc[:, numerical].corrwith(train_full.loc[:, 'Churn'])

tenure           -0.348064
MonthlyCharges    0.191578
TotalCharges     -0.194850
dtype: float64