## Churn Prediction using ML Classification model

##### Predicts how likely each customer of a company is to churn, expressed as a probability between 0 and 1

In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [29]:
# Dataset: local copy of the Telco Customer Churn CSV downloaded from Kaggle.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
DATA_PATH = r"C:\Users\hp\Downloads\ML\Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


#### Data Cleaning 

In [35]:
def _snake(text_values):
    """Lower-case string values and replace spaces with underscores."""
    return text_values.str.lower().str.replace(' ', '_')


### normalise the column names
df.columns = _snake(df.columns)

### normalise the text *values* of every object-typed column (not the row index)
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for column_name in categorical_columns:
    df[column_name] = _snake(df[column_name])

In [37]:
# Transpose the first five rows so that every column is visible as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [39]:
df.dtypes # only inspects the column dtypes (nothing is changed here); totalcharges shows up as object and must be converted to numeric

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [42]:
# totalcharges is read as text. errors='coerce' turns entries that cannot be parsed
# into NaN (they are not ignored); those NaN values are then replaced with 0.
# The result is a float column.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

In [48]:

# Encode the target as integers: 'yes' -> 1, anything else ('no') -> 0.
# The dtype guard keeps this cell idempotent: re-running it on the already-encoded
# column would compare integers with 'yes' and silently set every label to 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)


df.churn.head()

### Setting up Validation Framework

In [53]:
from sklearn.model_selection import train_test_split

In [57]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1) ## holds out 20% of the rows as the test set; random_state fixes the shuffle so the split is reproducible


In [59]:
# Row counts of the full-train and test partitions.
len(df_full_train), len(df_test)

(5634, 1409)

In [89]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1) ## splits full-train 75/25 into train and validation, i.e. roughly 60% and 20% of the whole dataset
len(df_train), len(df_val)

(4225, 1409)

In [101]:
## Give each partition a fresh 0..n-1 index; the shuffled original row labels are discarded.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [103]:
## Save the churn target of each partition separately as an array.
## (The churn column is removed from the frames in the following cell.)
y_train, y_val, y_test = (
    part['churn'].values for part in (df_train, df_val, df_test)
)

In [107]:
# Remove the target column so it cannot end up among the model features.
for part in (df_train, df_val, df_test):
    del part['churn']

### Exploratory Data Analysis

In [110]:
# Give full-train a fresh 0..n-1 index (old row labels are dropped) before exploring it.
df_full_train = df_full_train.reset_index(drop=True)

In [112]:
# Class balance of the target in full-train: number of non-churned (0) and churned (1) customers.
df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [153]:
global_churn = df_full_train.churn.mean()  # overall churn rate: approximately 27% of full-train customers churned

In [116]:
# Re-check dtypes after cleaning: totalcharges should now be float and churn an integer 0/1 column.
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int32
dtype: object

In [118]:
# Continuous features used by the model.
numerical = ['tenure','monthlycharges', 'totalcharges' ]

In [124]:
# Categorical features used by the model. seniorcitizen is stored as an int (0/1)
# but is treated as a category here; customerid is not included.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
     'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [147]:
# Number of distinct values in each categorical feature.
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance, Churn Rate and Risk Ratio

In [139]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean() # churn rate among female customers

In [135]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean() # churn rate among male customers

In [141]:
churn_female, churn_male, # displays both rates as a tuple; the loop further down repeats this comparison for every categorical feature

(0.27682403433476394, 0.2632135306553911)

In [157]:
from IPython.display import display #jupyter nbook function to display table below:

In [161]:
# For every categorical feature, show per-value churn rate and group size, together with
#   diff = group churn rate - global churn rate   (absolute difference)
#   risk = group churn rate / global churn rate   (risk ratio; > 1 means churnier than average)
for c in categorical:
    print(c)  # feature name as a heading
    df_group = df_full_train.groupby(c)['churn'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_churn,
        risk=df_group['mean'] / global_churn,
    )
    display(df_group)
    print('\n')  # two blank lines before the next feature

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121






### Feature importance: Mutual information

In [170]:
from sklearn.metrics import mutual_info_score #helps us compare mutual probability between variables as we did above using sklearn

In [172]:
mutual_info_score(df_full_train.churn, df_full_train.contract ) ## argument order doesn't matter (the score is symmetric); measures how much knowing the contract type tells us about churn

0.0983203874041556

In [174]:
mutual_info_score(df_full_train.churn, df_full_train.gender ) ## gender carries far less information about churn than contract does

0.0001174846211139946

In [176]:
mutual_info_score(df_full_train.churn, df_full_train.partner ) ## mutual information between churn and partner

0.009967689095399745

In [184]:
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and the full-train churn column.

    Written as a one-argument function so it can be handed to ``DataFrame.apply``
    and evaluated for every categorical column in turn.
    """
    churn_labels = df_full_train['churn']
    return mutual_info_score(series, churn_labels)

In [192]:
mi = df_full_train[categorical].apply(mutual_info_churn_score) # scores every categorical column against churn
mi.sort_values(ascending = False) # most informative features first

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

### Feature Importance: Correlation

Used to compare numerical columns as opposed to mutual information which takes categorical columns

In [196]:
df_full_train[numerical].corrwith(df_full_train.churn)  ## correlation of each numerical feature with churn; the sign shows the direction of the relationship

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

### One-hot encoding
One-hot encoding turns each categorical variable into a set of 0/1 indicator columns, one per category value.

In [263]:
from sklearn.feature_extraction import DictVectorizer

In [201]:
df_train[['gender','contract']].iloc[:10] # selects the first 10 rows of gender and contract from the train dataset

Unnamed: 0,gender,contract
0,female,two_year
1,male,month-to-month
2,female,month-to-month
3,female,month-to-month
4,female,two_year
5,male,month-to-month
6,male,month-to-month
7,female,month-to-month
8,female,two_year
9,female,month-to-month


In [205]:
df_train[['gender','contract']].iloc[:10].to_dict() # same 10 rows as a dictionary (default orientation: {column: {row index: value}})

{'gender': {0: 'female',
  1: 'male',
  2: 'female',
  3: 'female',
  4: 'female',
  5: 'male',
  6: 'male',
  7: 'female',
  8: 'female',
  9: 'female'},
 'contract': {0: 'two_year',
  1: 'month-to-month',
  2: 'month-to-month',
  3: 'month-to-month',
  4: 'two_year',
  5: 'month-to-month',
  6: 'month-to-month',
  7: 'month-to-month',
  8: 'two_year',
  9: 'month-to-month'}}

In [267]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records') # one {feature: value} dict per training row, the input format passed to DictVectorizer below

In [269]:
dv = DictVectorizer(sparse=False)  # creates a DictVectorizer; sparse=False makes transform return a dense array

In [271]:
dv.fit(train_dicts) # learns the feature-name -> column mapping from the training records only

In [273]:
dv.get_feature_names_out()  # names of the encoded columns: one 'feature=value' column per category value; numeric features keep their own name

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [281]:
X_train = dv.transform(train_dicts)   # encoded feature matrix for the training set

In [283]:
# (number of training rows, number of encoded feature columns)
X_train.shape

(4225, 45)

In [285]:
## Encode the validation set with the vectorizer that was fitted on train.
## The vectorizer is never re-fitted on validation data; it is only used to transform it.

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts) 

### Logistic Regression

Logistic regression passes a linear score through the sigmoid function, so its output is a probability between 0 and 1, whereas the output of linear regression is unbounded.

Train a model with sklearn
apply it to the validation dataset
calculate the accuracy

In [290]:
from sklearn.linear_model import LogisticRegression

In [312]:
model = LogisticRegression(solver='lbfgs', max_iter=10000 )  # creates the estimator; the generous max_iter gives the lbfgs solver room to converge
model.fit(X_train, y_train)  # learns the weights from the encoded training data

In [314]:
# Show the bias term (w0) together with the per-feature weights, rounded to 3 decimals.
# Both are returned as one tuple because a cell only displays its last expression;
# a bare `model.intercept_[0]` on its own line would be evaluated and then discarded.
model.intercept_[0], model.coef_[0].round(3)

array([ 0.682,  0.033, -0.677,  0.054, -0.016,  0.112, -0.159,  0.084,
        0.038, -0.   , -0.494,  0.691, -0.159, -0.018, -0.189,  0.068,
        0.159,  0.115, -0.159,  0.081,  0.284, -0.159, -0.088, -0.163,
        0.201, -0.045,  0.082, -0.053, -0.003,  0.107, -0.012,  0.068,
       -0.03 ,  0.193, -0.094, -0.159,  0.291, -0.054, -0.159,  0.251,
        0.234, -0.159, -0.038, -0.069,  0.   ])

In [316]:
## model.predict(X_train)  # would return hard predictions: 0 (no churn) or 1 (churn)
## model.predict_proba(X_train) # would return soft predictions: first column is P(no churn), second column is P(churn)
y_pred = model.predict_proba(X_val)[:,1] # keeps only the churn probability, computed on the validation data

In [318]:
churn_decision = (y_pred >= 0.5)  # hard decision: predict churn when the probability is at least 0.5

In [320]:
churn_decision  # boolean array; True means the customer is predicted to churn

array([False, False, False, ..., False,  True,  True])

In [322]:
# actual validation labels (0/1), shown for comparison with churn_decision
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [324]:
churn_decision.astype(int)  # converts the booleans to 0/1 so they can be read side by side with y_val

array([0, 0, 0, ..., 0, 1, 1])

##to check accuracy
(y_val == churn_decision).mean()   

##compares each predicted label with the actual label and returns the fraction that match (the accuracy).
##here about 80% of the validation predictions are correct.

Inspect the weight (w) learned for each feature; the features with the largest weights can be selected to retrain a smaller model.

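Just as in linear regression, the model computes a score $z = w_0 + w^T x$. Logistic regression then applies the sigmoid, $\sigma(z) = \frac{1}{1 + e^{-z}}$, which maps the score to a probability between 0 and 1; a probability of 0.5 or more is treated as "will churn".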

### Using the model

In [337]:
# Detach the target from full-train: pop() returns the churn column and removes it
# from the frame in the same step.
y_full_train = df_full_train.pop('churn').values

#### Process of training model

In [339]:
# Build one {feature: value} record per customer from the full training frame.
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

# Fit a fresh vectorizer on those records and encode them with it.
# fit() returns the vectorizer itself, so the two steps can be chained.
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit(dicts_full_train).transform(dicts_full_train)

# Retrain logistic regression, this time on the full training data (train + validation).
model = LogisticRegression(solver='lbfgs', max_iter=10000)
model.fit(X_full_train, y_full_train)

Model trained

##### repeat same for test data

In [347]:
# the test data is converted to a list of dicts because that is the input the vectorizer takes
dicts_test = df_test[categorical + numerical].to_dict(orient = 'records')

# do not re-fit the vectorizer on test data; only transform with the mapping fitted on full-train
X_test = dv.transform(dicts_test) 

# predicted churn probability for every test customer
y_pred = model.predict_proba(X_test)[:,1]

# hard decision: predict churn when the probability is at least 0.5
churn_decision = (y_pred >= 0.5 )


In [353]:
### accuracy on the held-out test set: fraction of customers whose predicted label equals the actual label
(churn_decision == y_test).mean()

0.8119233498935415

In [371]:
## To score a single customer, encode that customer's feature dict with the fitted vectorizer and ask the model for the churn probability.

customer = dicts_test[10]

X_small = dv.transform([customer]) # the single record is wrapped in a list before being passed to transform

model.predict_proba(X_small)[0,1] # churn probability is about 0.48, below the 0.5 threshold, so the prediction is "no churn"
 


0.4798103384356113

In [373]:
y_test[10]  # actual label is 0: this customer did not churn, which matches the prediction

0

#### The model predicts this customer correctly

In [None]:
# Summary
#
# 1. Load Data
#
# 2. Clean data, harmonize names, fill missing values
#
# 3. Set up validation framework
#
# 4. Perform EDA
#
# 5. Perform one hot encoding
#
# 6. Train Model with Logistic Regression from train data and test on val data
#
# 7. Train Model with full train data and test on test data