# Machine Learning for Classification 
## Churn Prediction Project

In [1]:
# Import modules
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the Telco customer-churn CSV (path is relative to the working directory)
DATA_PATH = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(DATA_PATH)

# Exploratory Data Analysis

In [3]:
# Number of rows and columns in our dataset
print(df.shape)
# Too many columns: transpose the first rows so every column reads as a row.
# display() is required here: Jupyter only auto-renders the LAST expression
# of a cell, so bare expressions in the middle of a cell are silently dropped.
display(df.head(2).T)
# View the data types: TotalCharges is an object (string) instead of a float
display(df.dtypes)
# Convert TotalCharges to numeric; errors='coerce' turns unparseable entries into NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Count missing values per column (any NaN produced by the coercion is counted here)
display(df.isnull().sum())
# Replace the missing TotalCharges values with 0
df['TotalCharges'] = df['TotalCharges'].fillna(0)

(7043, 21)


In [4]:
# Normalise column names: lowercase, with spaces replaced by underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Apply the same normalisation to the values of every object-typed (string) column
object_mask = df.dtypes == 'object'
string_columns = list(df.dtypes[object_mask].index)
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ','_')

In [5]:
# Encode the churn target as an integer: 1 for 'yes' and 0 for 'no'.
# The dtype guard keeps this cell idempotent: once churn is already integer,
# comparing it with 'yes' again would turn every label into 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)
df['churn'].unique()

array([0, 1])

In [6]:
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

# Set up the validation framework using scikit-learn

In [7]:
# Import the module for data splitting: 80% for training and 20% for testing
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [9]:
# Report the (rows, columns) shape of the full-training and test splits
print(
    f'Size of datasets (row,col): training_full dataset - {df_train_full.shape}'
    f' and test dataset - {df_test.shape}'
)

Size of datasets (row,col): training_full dataset - (5634, 21) and test dataset - (1409, 21)


In [10]:
# Split the full training data again: 33% of it is held out as the validation set
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.33,
    random_state=11,
)

In [11]:
# Extract the churn labels of the training and validation splits as NumPy arrays
y_train = df_train['churn'].values
y_val = df_val['churn'].values

In [12]:
# Delete the churn column, in place, from both the training and validation frames
for split_frame in (df_train, df_val):
    del split_frame['churn']

# Some more Exploratory Data Analysis

In [13]:
# Count missing values per column in the full training set: there are none
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [14]:
# Distribution of the target variable: 27% of customers (1521/5634) have churned
df_train_full['churn'].value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [15]:
# Churn rate: the share of users that have churned, i.e. the probability of churning.
# Non-churning customers dominate, so the dataset is imbalanced.
churn_labels = df_train_full['churn']
global_mean = churn_labels.mean()
print(round(global_mean, 3))

0.27


In [16]:
# Feature names grouped by type: categorical features first, then numerical ones
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [17]:
# Number of distinct values taken by each categorical feature
df_train_full.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature Importance Analysis

In [18]:
# Churn rate within each gender group
is_female = df_train_full['gender'] == 'female'
is_male = df_train_full['gender'] == 'male'
female_mean = df_train_full.loc[is_female, 'churn'].mean()
male_mean = df_train_full.loc[is_male, 'churn'].mean()
print(f'Churn for females is {round(female_mean,2)}, and churn for males is {round(male_mean,2)}')
# Global churn is 27%, and the male and female rates differ from it only slightly,
# so gender does not help us identify whether a customer will churn

Churn for females is 0.28, and churn for males is 0.26


In [19]:
# Churn rate for customers with and without a partner
has_partner = df_train_full['partner'] == 'yes'
no_partner = df_train_full['partner'] == 'no'
partner_yes = df_train_full.loc[has_partner, 'churn'].mean()
partner_no = df_train_full.loc[no_partner, 'churn'].mean()
print(f'Churn for partner-yes is {round(partner_yes,2)}, and churn for partner-no is {round(partner_no,2)}')
# Clients without a partner are more likely to churn

Churn for partner-yes is 0.21, and churn for partner-no is 0.33


In [20]:
# Instead of the difference, look at the ratio between group rate and global rate: the risk ratio
# Risk ratio = group rate / global rate, ranging from 0 to infinity
# Risk ratio ~ 1: group and global rates are close, i.e. the group has the same risk as the overall population
# Risk ratio < 1: the group churns less than the population as a whole
# Risk ratio > 1: the group churns more than the population; 2 means the group is twice as likely to churn
overall_churn_rate = df_train_full.churn.mean()
# Printed in this order: gender = male first, then gender = female
for group_rate in (male_mean, female_mean):
    print(round(group_rate / overall_churn_rate, 2))


0.97
1.03


In [21]:
# For every categorical feature, tabulate per-group churn rate ('mean'), its
# difference from the global rate ('diff') and the risk ratio ('risk')
for col in categorical:
    df_group = df_train_full.groupby(by=col)['churn'].agg(['mean'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_mean,
        risk=df_group['mean'] / global_mean,
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [22]:
# Calculate Mutual Information = Degree of dependency between a categorical variable and target
# If mutual information between a categorical variable and target is high, the variable is useful for predicting the target
# If mutual information between a categorical variable and target is low, then variable & target are independent
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    """Return the mutual information score between `series` and the churn target.

    The target is read from the notebook-level `df_train_full.churn`, and the
    score is computed with scikit-learn's `mutual_info_score`.
    """
    churn_target = df_train_full.churn
    return mutual_info_score(series, churn_target)

# Score every categorical feature against churn, then rank from highest to lowest MI
mi_scores = df_train_full[categorical].apply(calculate_mi)
df_mi = mi_scores.sort_values(ascending=False).to_frame(name='MI')
df_mi
# Contract, onlinesecurity and techsupport come out as the most important features

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [23]:
# Mutual information does not work when one of the features is numerical,
# so use the correlation coefficient instead; its value lies between -1 and 1
# +ve correlation: when one variable goes up, the other goes up as well
# Zero correlation: no relation between the variables, they are independent
# -ve correlation: when one variable goes up, the other goes down
numerical_features = df_train_full[numerical]
numerical_features.corrwith(df_train_full['churn'])

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [24]:
# Tenure: -ve correlation: the longer customers stay, the less likely they are to churn
# Monthly Charges: +ve correlation: customers who pay more per month tend to churn more often
# Total Charges: -ve correlation: the longer a customer has stayed, the more they have paid in total, and the less likely they are to churn

# Feature Engineering

In [25]:
# ML models can only deal with numbers in matrices, so the categorical variables
# are one-hot encoded with scikit-learn's DictVectorizer. It takes dictionaries
# and creates vectors, which become the rows of a matrix.
from sklearn.feature_extraction import DictVectorizer

# Step 1: turn the feature columns into a list of dictionaries, one per row,
# with each column name as a dictionary key
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient='records')
# Step 2: create the vectorizer; sparse=False makes it return a NumPy array
dv = DictVectorizer(sparse=False)
# Step 3: fit the vectorizer on the dictionaries (learning the one-hot columns)
# and convert those same dictionaries into the training matrix
X_train = dv.fit_transform(train_dict)
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',