# 1 - Churn prediction project

In [28]:
import pandas as pd
import numpy as np

import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression

In [2]:
# Load the Telco customer churn CSV (path is relative to the working directory)
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Preview the first five rows
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Transpose the preview so that every column becomes a row and none is elided
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


The data has the following columns:

- CustomerID -- the ID of the customer
- Gender -- male/female
- Senior Citizen -- whether the customer is a senior citizen (0/1)
- Partner -- whether they live with a partner (yes/no)
- Dependents -- whether they have dependants (yes/no)
- Tenure -- number of months since the start of the contract
- Phone service -- whether they have phone services (yes/no)
- Multiple lines -- whether they have multiple phone lines (yes/no/no phone service)
- Internet service -- the type of internet service (no/DSL/fiber optic)
- Online security -- if online security is enabled (yes/no/no internet)
- Online backup -- if online backup service is enabled (yes/no/no internet)
- Device protection -- if the device protection service is enabled (yes/no/no internet)
- Tech support -- if the customer has tech support (yes/no/no internet)
- Streaming TV -- if the TV streaming service is enabled (yes/no/no internet)
- Streaming movies -- if the movie streaming service is enabled (yes/no/no internet)
- Contract -- the type of contract (monthly/yearly/two years)
- Paperless billing -- if the billing is paperless (yes/no)
- Payment method -- payment method (electronic check, mailed check, bank transfer, credit card)
- Monthly charges -- the amount charged monthly (numeric)
- Total charges -- the total amount charged (numeric)
- Churn -- if the client has canceled the contract (yes/no)

In [4]:
# Inspect the dtype pandas inferred for each column
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# Parse TotalCharges as numbers; errors='coerce' turns every value that cannot be parsed into NaN
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Show the rows that failed to parse, confirming that the column holds non-numeric entries
df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [6]:
# Parse TotalCharges as numeric (unparseable entries become NaN), then replace those NaNs with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [7]:
# Make the column names uniform: lower case, with spaces replaced by underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Apply the same normalisation to the values of every object-typed (string) column
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for col in string_columns:
    normalised = df[col].str.lower()
    df[col] = normalised.str.replace(' ', '_')

In [8]:
# Encode the target as an integer: 1 where churn is 'yes', 0 otherwise
df['churn'] = df['churn'].eq('yes').astype(int)

In [9]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so
# the split is the same on every run
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: 33% of them become the validation set
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Move the target out of each frame: pop returns the churn column and removes
# it from the dataframe, so it cannot be used as a feature by accident
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

### Exploratory data analysis

In [10]:
# Number of missing values in each column of the full training set
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [11]:
# How many customers fall into each class of the target
df_train_full['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

The data is imbalanced: churners (1,521) make up only about 27% of the 5,634 customers in `df_train_full`.

In [12]:
# Names of the categorical features (seniorcitizen is stored as 0/1 but treated as a category)
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

# Names of the numerical features
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [13]:
# Count the distinct values of each categorical feature
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

#### Feature importance

In [14]:
# Check the importance of the gender feature by comparing per-group churn rates

# Churn rate among female customers
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
print("gender == female: {:.3f}".format(female_mean))

# Churn rate among male customers
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
print("gender == male: {:.3f}".format(male_mean))

gender == female: 0.277
gender == male: 0.263


Knowing the gender of the customer doesn't help identify whether they will churn: the two churn rates are almost identical

In [15]:
# Check the importance of the partner feature by comparing per-group churn rates

# Churn rate among customers who have a partner
partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
print("partner == yes: {:.3f}".format(partner_yes))

# Churn rate among customers who do not have a partner
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()
print("partner == no: {:.3f}".format(partner_no))

partner == yes: 0.205
partner == no: 0.330


The partner variable is useful for predicting churn

#### _RISK RATIO_

It's interesting to look at the ratio between the group rate and the global rate. In statistics, the ratio between probabilities in different groups is called risk ratio. In our case, the risk of churning is :

<center>risk = group rate / global rate</center>

In [16]:
# Global churn rate over the full training set
global_mean = df_train_full['churn'].mean()

# Churn rate per gender (AVG(churn) GROUP BY gender)
df_group = df_train_full.groupby('gender')['churn'].agg(['mean'])

# Add the absolute difference from the global rate and the risk ratio (group rate / global rate)
df_group = df_group.assign(
    diff=df_group['mean'] - global_mean,
    risk=df_group['mean'] / global_mean,
)

df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [17]:
from IPython.display import display

# For every categorical feature, show the churn rate of each of its values
# next to the difference from the global rate and the ratio to the global rate
for col in categorical:
    df_group = df_train_full.groupby(col)['churn'].agg(['mean'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_mean,
        rate=df_group['mean'] / global_mean,  # group rate / global rate, i.e. the risk ratio
    )
    # A bare expression inside a loop is not rendered, so display explicitly
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


From the results we learn that :

- For gender, there is not much difference between males and females
- Senior citizens tend to churn more than nonseniors
- People with a partner churn less than people with no partner
- People who use phone service churn at about the global rate (risk ≈ 1.01), so having phone service says little about churn
- People with no tech support tend to churn more than those who do
- People with monthly contracts cancel the contract a lot more often than others, and people with two-year contracts churn very rarely

#### _Mutual Information_

We can measure the degree of dependency between a categorical variable and the target variable using metrics of importance. If two variables are dependent, knowing the value of one variable gives us some information about the other. On the other hand, if a variable is completely independent of the target variable, it's not useful and can be safely removed from the dataset.

For categorical variables, one such metric is mutual information, which tells how much information we learn about one variable if we got to learn the value of the other variable. In machine learning, we often use it to measure the mutual dependency between two variables.

Higher values of mutual information mean a higher degree of dependence, while lower values mean that the two variables are close to independent, so the variable will not be very useful for predicting the target.

In [18]:
# Stand-alone helper so it can be passed to DataFrame.apply
def calculate_mi(series):
    """Return the mutual information between `series` and the churn target.

    The target is read from the notebook-level `df_train_full` dataframe and the
    score is computed with sklearn's `mutual_info_score`.
    """
    target = df_train_full.churn
    return mutual_info_score(series, target)

# Score every categorical column, then rank the scores from highest to lowest
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


The most useful features according to the mutual information score are contract, onlinesecurity, techsupport, internetservice and online backup

The least useful features according to the mutual information score are partner, seniorcitizen, multiplelines, phoneservice and gender

#### _CORRELATION COEFFICIENT_

In [19]:
# Correlation (Pearson, the corrwith default) between each numerical feature and the churn target
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

# 2 - Feature engineering

#### _One-hot encoding_

In [22]:
# Convert the dataframe to a list of dictionaries, one dictionary per row.
# orient='records' is the documented option name; the abbreviation 'rows' was
# only tolerated by older pandas releases and raises a ValueError since pandas 2.0.
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [24]:
# Inspect the dictionary built for the second training row
train_dict[1]

{'gender': 'female',
 'seniorcitizen': 1,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'credit_card_(automatic)',
 'tenure': 60,
 'monthlycharges': 100.5,
 'totalcharges': 6029.0}

In [25]:
# Create the vectorizer; sparse=False asks for dense arrays rather than sparse matrices
dv = DictVectorizer(sparse=False)
# Fit the vectorizer on the training dictionaries
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [26]:
# Convert the list of dictionaries to a feature matrix
X_train = dv.transform(train_dict)
# Show the encoded feature vector of the first training row
X_train[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [27]:
# List the feature names produced by the fitted vectorizer.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = list(dv.get_feature_names_out())
else:
    feature_names = dv.get_feature_names()
feature_names

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

#### _Logistic regression_

In [29]:
# Logistic regression with the liblinear solver and a fixed random_state
model = LogisticRegression(solver='liblinear', random_state=1)
# Train on the encoded training features and the churn labels
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [30]:
# One-hot encode the validation set.
# orient='records' is the documented option name; the abbreviation 'rows' was
# only tolerated by older pandas releases and raises a ValueError since pandas 2.0.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
# Reuse the vectorizer fitted on the training data (transform only, no refit)
# so the validation matrix has exactly the same columns as X_train
X_val = dv.transform(val_dict)

In [32]:
# Predict class probabilities for the validation set: one row per customer, one column per class
y_pred = model.predict_proba(X_val)
y_pred

array([[0.76509203, 0.23490797],
       [0.73114243, 0.26885757],
       [0.68054933, 0.31945067],
       ...,
       [0.9427494 , 0.0572506 ],
       [0.38477113, 0.61522887],
       [0.93872737, 0.06127263]])

In [33]:
# Keep only the second column: the predicted probability of the positive class (churn = 1)
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.23490797, 0.26885757, 0.31945067, ..., 0.0572506 , 0.61522887,
       0.06127263])

In [34]:
# Apply a 0.5 decision threshold: True where the predicted churn probability is at least 0.5
y_pred >= 0.5

array([False, False, False, ..., False,  True, False])

In [35]:
# Hard predictions: True when the predicted churn probability is at least 0.5
churn = y_pred >= 0.5

In [36]:
# Accuracy: the fraction of validation customers whose hard prediction matches the true label
np.mean(churn == y_val)

0.8016129032258065

#### _Model Interpretation_

In [37]:
# The bias (intercept) term of the fitted model
model.intercept_[0]

-0.121988402285897

In [38]:
# The weight of each feature, rounded to 3 decimals and keyed by feature name.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and fall back only on older releases.
weight_names = (
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)
dict(zip(weight_names, model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,
