# Customer Churn Prediction with Imbalanced Data

In this project we will use traditional classifiers to predict customer churn. Our dataset is significantly imbalanced, with the 'No Churn' instances outnumbering the 'Churn' ones to the degree that it will influence our models negatively.

We will deal with this by upsampling the minority class, paying attention to the rate of false negatives as we train our models.

We start with some exploratory analysis, then data preprocessing and preparation, and finally, machine learning models and comparison of their performance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import seaborn as sns
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the Telco customer-churn CSV and preview the first rows.
csv_path='../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df=pd.read_csv(csv_path)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

Although there don't seem to be any nulls, there are eleven rows where the TotalCharges is " ".

In [None]:
# Display the rows whose TotalCharges is a single blank space.
blank_total=df['TotalCharges'].eq(" ")
df.loc[blank_total]

There are eleven rows where the TotalCharges is " ". These clients have zero tenure, and haven't churned, therefore we presume they're new customers who haven't paid anything yet.

We register these rows as Total Charges = 0

In [None]:
# Treat the blank TotalCharges entries as 0, then make the column numeric.
total_charges=df['TotalCharges'].replace(" ",0)
df['TotalCharges']=total_charges.astype('float32')

In [None]:
#percentage of classes
ch=df.loc[df['Churn']=='Yes']
no_ch=df.loc[df['Churn']=='No']
n_total=len(df)
print('churn percentage-->',(len(ch)/n_total)*100)
print('no churn percentage-->',(len(no_ch)/n_total)*100)

# Same split shown as a pie chart of the Churn column.
df['Churn'].value_counts().plot.pie(autopct='%1.1f%%');

We have an imbalanced dataset and this will probably pose problems to our machine learning. We will deal with this later.

In [None]:
# Work on a copy so the cleaned frame `df` stays untouched by the steps below.
data=df.copy()

### Exploratory Analysis

Distribution of categorical variables

In [None]:
def pie(features):
    """Draw paired pie charts of each feature for churned and retained customers.

    For every column name in ``features`` a new 10x10 figure is opened. The left
    subplot shows the value counts among rows of the global ``data`` frame with
    Churn == 'Yes', the right subplot among rows with Churn == 'No'.
    """
    panels=(('Yes','Churn'),('No','No Churn'))
    for feature in features:
        plt.figure(figsize=(10,10))
        for position,(label,title) in enumerate(panels, start=1):
            plt.subplot(1,2,position)
            subset=data.loc[data['Churn']==label, feature]
            subset.value_counts().plot(kind='pie', autopct='%1.1f%%')
            plt.title(title)

# Categorical columns to compare between churned and retained customers.
features=[
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
pie(features)

Distribution of Continuous Variables

In [None]:
def kde(feature):
    """Plot the density of ``feature`` for churned and retained customers.

    Opens a new 14x4 figure and overlays two kernel-density curves taken from
    the global ``data`` frame: one for rows with Churn == 'Yes' and one for
    rows with Churn == 'No'.
    """
    plt.figure(figsize=(14,4))
    plt.title('Distribution of {}'. format(feature))
    sns.kdeplot(data[data['Churn']=='Yes'][feature], label='Churn');
    sns.kdeplot(data[data['Churn']=='No'][feature], label='No Churn');
    # The labels above only become visible once a legend is drawn; recent
    # seaborn releases do not appear to add one automatically for `label=`.
    plt.legend();

# One density figure per continuous column.
for continuous_feature in ('tenure','MonthlyCharges','TotalCharges'):
    kde(continuous_feature)

In [None]:
def box(feature):
    """Draw a box plot of ``feature`` grouped by the Churn column.

    Opens a new 4x4 figure and reads the global ``data`` frame.
    """
    plt.figure(figsize=(4,4))
    sns.boxplot(data=data, x='Churn', y=feature);
    
# One box plot per continuous column.
for continuous_feature in ('tenure','MonthlyCharges','TotalCharges'):
    box(continuous_feature)

### Data Preprocessing and Feature Selection

Now we will prepare the dataset for model training by one-hot encoding the categorical columns and removing unnecessary columns ('customerID'). For maximum control over the process, we will do it in a structured, step-by-step way.

In [None]:
#step-by-step dummy encoding,
#encoding one column at a time and deleting redundant columns.
#Columns are addressed by NAME rather than by position, so a change in the
#CSV column order cannot silently encode the wrong column, and every encoded
#column is stored as uint8 regardless of the installed pandas version.
#The encoded columns are appended in the same order as before.

def _add_flag(frame, source, target, positive='Yes'):
    """Append 0/1 column `target` marking rows where `source` equals `positive`, then drop `source` (in place)."""
    frame[target]=(frame[source]==positive).astype('uint8')
    frame.drop(columns=source,inplace=True)

def _add_dummies(frame, source, prefix):
    """Return `frame` with `source` replaced by one 0/1 column per category, appended at the end."""
    dummies=pd.get_dummies(frame[source],prefix=prefix,dtype='uint8')
    return pd.concat([frame,dummies],axis=1).drop(columns=[source])

#the customer identifier carries no predictive information
data.drop(columns='customerID',inplace=True)

_add_flag(data,'gender','Male',positive='Male')
_add_flag(data,'Partner','Partner_yes')
_add_flag(data,'Dependents','Dependent_yes')
_add_flag(data,'PhoneService','Phone_service_yes')
#only an explicit 'Yes' counts as having multiple lines
_add_flag(data,'MultipleLines','multiple_lines_yes')

data=_add_dummies(data,'InternetService','Internet')

#internet add-ons: only an explicit 'Yes' is flagged
for source,target in [('OnlineSecurity','online security_yes'),
                      ('OnlineBackup','online backup_yes'),
                      ('DeviceProtection','device protection_yes'),
                      ('TechSupport','tech support_yes'),
                      ('StreamingTV','streamingTV_yes'),
                      ('StreamingMovies','streaming movies_yes')]:
    _add_flag(data,source,target)

data=_add_dummies(data,'Contract','contract')

_add_flag(data,'PaperlessBilling','paperless biling_yes')

data=_add_dummies(data,'PaymentMethod','paymethod')

In [None]:
# Preview the first two rows of the encoded frame.
data.head(2)

In [None]:
#separate data and labels
#pop returns the Churn column and removes it from the feature frame in one step
y=data.pop('Churn')

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
#feature selection via random forest
#NOTE(review): the forest is fitted on every row, before any train/test split,
#so the selected features are chosen with knowledge of the rows later used for testing.
forest=RandomForestClassifier(n_estimators=600, max_depth=5, random_state=7).fit(data,y)
imp=forest.feature_importances_

#store feature importances in new DataFrame
feature_importances=pd.DataFrame({'feature':pd.Series(data.columns),
                                  'importance':imp})
feature_importances.head()

In [None]:
# Horizontal bar chart of the importances, largest first.
ranked_importances=feature_importances.sort_values(by='importance',ascending=False)
plt.figure(figsize=(10,10))
sns.barplot(data=ranked_importances, x='importance', y='feature');

In [None]:
#keep most important columns and create final training dataset
ranked=feature_importances.sort_values(by='importance',ascending=False)
cols=ranked['feature'].iloc[:12].values   #names of the 12 highest-ranked features
selected=data[cols]
x=selected.values
selected.head(2)

In [None]:
#encode labels, and train_test_split
enc=LabelEncoder().fit(y)
y=enc.transform(y)

#stratify on y so both parts keep the same class proportions
split=train_test_split(x,y,stratify=y, random_state=77)
x_tr,x_ts,y_tr,y_ts=split

In [None]:
#scaling data, only numerical columns
from sklearn.preprocessing import StandardScaler

#Locate the numerical columns by name inside `cols` (the selected feature names,
#in the same order as the columns of x). Their positions depend on the
#importance ranking, so hard-coded indices could scale the wrong columns.
numeric_features={'tenure','MonthlyCharges','TotalCharges'}
num_cols=[i for i,name in enumerate(cols) if name in numeric_features]

sc=StandardScaler()
if num_cols:   #nothing to scale if no numerical feature was selected
    #fit on the training part only, then apply the same scaling to the test part
    x_tr[:,num_cols]=sc.fit_transform(x_tr[:,num_cols])
    x_ts[:,num_cols]=sc.transform(x_ts[:,num_cols])

### PCA visualization

In [None]:
from sklearn.decomposition import PCA

# Project the training features onto three principal components.
pca=PCA(n_components=3)
x_tr_pca=pca.fit_transform(x_tr)

# Components in columns 0-2, class label in column 3.
x_viz=np.column_stack([x_tr_pca,y_tr])
xv,yv,zv,cv=(x_viz[:,k] for k in range(4))

# Scatter of the first two components coloured by class; zv is not plotted.
plt.figure(figsize=(10,10))
ax=plt.axes()
ax.scatter(xv, yv, c=cv, cmap='winter')
plt.show();

### Machine learning

We will start by deploying a few models on the raw, unbalanced data, and compare the results with a dummy classifier. If the results don't look good, we will upsample the minority class.

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score

#DataFrame to store performance metrics for later comparison between models
#(starts empty; one row per evaluated model is added in the cells below)
results=pd.DataFrame([], columns=['model', 'parameters','accuracy','precision','recall','F1-score'])

Naive Bayes

In [None]:
# Gaussian Naive Bayes on the unbalanced training set, scored on the test set.
nb=GaussianNB().fit(x_tr,y_tr)
nb_pred=nb.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, nb_pred))
sns.heatmap(confusion_matrix(y_ts, nb_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

KNN

In [None]:
# K-nearest neighbours on the unbalanced training set, scored on the test set.
knn=KNeighborsClassifier().fit(x_tr,y_tr)
knn_pred=knn.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, knn_pred))
sns.heatmap(confusion_matrix(y_ts, knn_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

SVM

In [None]:
# Support vector classifier on the unbalanced training set, scored on the test set.
svm=SVC().fit(x_tr,y_tr)
svm_pred=svm.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, svm_pred))
sns.heatmap(confusion_matrix(y_ts, svm_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

SGD

In [None]:
# SGD classifier on the unbalanced training set, scored on the test set.
sgd=SGDClassifier().fit(x_tr, y_tr)
sgd_pred=sgd.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, sgd_pred))
sns.heatmap(confusion_matrix(y_ts, sgd_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

Logistic Regression

In [None]:
# Logistic regression on the unbalanced training set, scored on the test set.
lr=LogisticRegression()
lr.fit(x_tr,y_tr)
lr_pred=lr.predict(x_ts)   #predict once, reuse for both metrics
#label fixed from 'accuracy"' so it matches the other model cells
print('accuracy:',accuracy_score(y_ts, lr_pred))
sns.heatmap(confusion_matrix(y_ts, lr_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Decision Tree

In [None]:
# Depth-4 decision tree on the unbalanced training set, scored on the test set.
tree=DecisionTreeClassifier(max_depth=4, random_state=3).fit(x_tr,y_tr)
tree_pred=tree.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, tree_pred))
sns.heatmap(confusion_matrix(y_ts, tree_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

Random Forest

In [None]:
# Random forest (100 trees, depth 8) on the unbalanced training set, scored on the test set.
rf=RandomForestClassifier(n_estimators=100,max_depth=8,random_state=17).fit(x_tr,y_tr)
rf_pred=rf.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, rf_pred))
sns.heatmap(confusion_matrix(y_ts, rf_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

What would happen if instead of making a classification, we blindly assigned each sample to the majority class? How would the accuracy of this process compare with our models?

Let's try it

In [None]:
#dummy classifier
from sklearn.dummy import DummyClassifier

#baseline that always answers with the most frequent training class
dum=DummyClassifier(strategy='most_frequent').fit(x_tr,y_tr)
pred=dum.predict(x_ts)
print('dummy class:',format(np.unique(pred)))
print('dummy accuracy:',accuracy_score(y_ts,pred) )
sns.heatmap(confusion_matrix(y_ts, pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

In [None]:
#another way to show this, array with all zeros
all_zero_pred=np.zeros(len(x_ts))
accuracy_score(y_ts,all_zero_pred)

The resulting accuracy is exactly equal to the proportion of the majority class in the dataset, which is 73.45%. Our models didn't score much higher than that, so they barely surpassed a dummy classification.

Before upsampling the minority class, let's consider some models with configurable class weights

In [None]:
# SVC with balanced class weights, scored on the test set.
svm=SVC(class_weight='balanced').fit(x_tr,y_tr)
svm_pred=svm.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, svm_pred))
sns.heatmap(confusion_matrix(y_ts, svm_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

In [None]:
# SGD classifier with balanced class weights, scored on the test set.
sgd=SGDClassifier(class_weight='balanced').fit(x_tr, y_tr)
sgd_pred=sgd.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, sgd_pred))
sns.heatmap(confusion_matrix(y_ts,sgd_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

In [None]:
# Logistic regression with balanced class weights, scored on the test set.
lr=LogisticRegression(class_weight='balanced').fit(x_tr,y_tr)
lr_pred=lr.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, lr_pred))
sns.heatmap(confusion_matrix(y_ts, lr_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

In [None]:
# Depth-4 decision tree with balanced class weights, scored on the test set.
tree=DecisionTreeClassifier(max_depth=4, random_state=3,class_weight='balanced').fit(x_tr,y_tr)
tree_pred=tree.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, tree_pred))
sns.heatmap(confusion_matrix(y_ts, tree_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

In [None]:
# Random forest (100 trees, depth 8) with balanced class weights, scored on the test set.
rf=RandomForestClassifier(n_estimators=100,max_depth=8,
                          random_state=17,class_weight='balanced').fit(x_tr,y_tr)
rf_pred=rf.predict(x_ts)   #predict once, reuse for both metrics
print('accuracy:',accuracy_score(y_ts, rf_pred))
sns.heatmap(confusion_matrix(y_ts, rf_pred),annot=True,fmt='d')
plt.ylabel('true')
plt.xlabel('predicted');

We see that although the overall accuracy has somewhat dropped, the ratio of true versus false negatives has been improved in all models, in some cases significantly. The performance is still unacceptable, though, and before playing with the models' parameters, we will try oversampling the minority class.

In [None]:
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier

# Row masks for the two classes of the training labels.
is_minority=y_tr==1
is_majority=y_tr==0

# Sample the class-1 rows with replacement until they match the class-0 count.
x_up, y_up=resample(x_tr[is_minority],y_tr[is_minority],replace=True,
                    n_samples=x_tr[is_majority].shape[0],random_state=42)
print(x_tr[is_minority].shape)
print(x_up.shape)

# Balanced training set: all class-0 rows followed by the resampled class-1 rows.
x_bal=np.concatenate([x_tr[is_majority], x_up],axis=0)
y_bal=np.concatenate([y_tr[is_majority], y_up])

# Majority-class baseline, fitted and scored on the balanced set itself.
dum2=DummyClassifier(strategy='most_frequent').fit(x_bal,y_bal)
dum2_pred=dum2.predict(x_bal)
print('dummy accuracy on balanced dataset:',
      accuracy_score(y_bal,dum2_pred))

The dummy accuracy has now been lowered to 50%, and any results our models have from now on will be considered 'real' results.

Naive Bayes

In [None]:
# Gaussian Naive Bayes trained on the balanced set, scored on the untouched test set.
gb=GaussianNB()
gb.fit(x_bal,y_bal)
y_pred=gb.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['Gaussian NB', 'default', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

KNN

In [None]:
# K-nearest neighbours trained on the balanced set, scored on the untouched test set.
knn=KNeighborsClassifier()
knn.fit(x_bal,y_bal)
y_pred=knn.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['KNN', 'default', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

SVM

In [None]:
# Grid search over C and gamma for an SVC on the balanced training set.
# NOTE(review): x_bal contains duplicated minority rows, so copies of one row can
# land in both the fitting and validation folds; the CV scores may be optimistic.
svm=SVC()
rng=[0.01, 0.1, 1.0, 10.0, 100.0]
params={'C':rng, 'gamma':rng}
gs=GridSearchCV(estimator=svm,param_grid=params)

gs.fit(x_bal,y_bal)
best_params=gs.best_params_
#with the default refit=True, best_estimator_ has already been retrained on all
#of x_bal, so fitting it a second time is unnecessary
best_est=gs.best_estimator_

print('best params',best_params)

y_pred=best_est.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#Labelled 'SVM (tuned)' so this row does not share a model name with the
#default-parameter SVM row; rows with the same name are merged into one bar
#when the results are plotted per model.
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['SVM (tuned)', best_params, accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)

sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');


In [None]:
# SVC with default parameters trained on the balanced set, scored on the untouched test set.
svm=SVC()
svm.fit(x_bal,y_bal)

y_pred=svm.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['SVM', 'default', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)

sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

SGD

In [None]:
# Grid search over loss, penalty and alpha for an SGD classifier on the balanced training set.
import re
import sklearn

#the logistic loss is spelled 'log_loss' from scikit-learn 1.1 on and 'log' before;
#newer releases reject the old spelling, so pick the one the installed version accepts
_sk_major,_sk_minor=(int(part) for part in re.match(r'(\d+)\.(\d+)',sklearn.__version__).groups())
log_loss_name='log_loss' if (_sk_major,_sk_minor)>=(1,1) else 'log'

sgd=SGDClassifier(random_state=3)
params={'loss':[log_loss_name, 'modified_huber', 'squared_hinge'],
       'penalty': ['l1','l2'],
       'alpha':[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
gs=GridSearchCV(estimator=sgd, param_grid=params)
gs.fit(x_bal, y_bal)
#with the default refit=True, best_estimator_ has already been retrained on all
#of x_bal, so fitting it a second time is unnecessary
best=gs.best_estimator_
best_params=gs.best_params_

y_pred=best.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#model label corrected from 'SGC' to 'SGD'
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['SGD', best_params, accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)


print('best parameters:',best_params)

print('best estimator accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Logistic Regression

In [None]:
# Grid search over C and solver for an L2 logistic regression on the balanced training set.
lr=LogisticRegression(random_state=3)
params={'penalty':['l2'],
       'C':[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
       'solver': [ 'sag','saga', 'lbfgs']}
gs=GridSearchCV(estimator=lr,param_grid=params)
gs.fit(x_bal,y_bal)
#with the default refit=True, best_estimator_ has already been retrained on all
#of x_bal, so fitting it a second time is unnecessary
best=gs.best_estimator_
best_params=gs.best_params_
print('best params:', best_params)

y_pred=best.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['Logistic Regression', best_params, accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
#label fixed from 'accuracy"' so it matches the other model cells
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Decision Tree

In [None]:
# Depth-4 decision tree trained on the balanced set, scored on the untouched test set.
tree=DecisionTreeClassifier(max_depth=4, random_state=3)
tree.fit(x_bal,y_bal)

y_pred=tree.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['Decision Tree', 'max_depth=4, rand_state=3', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Random Forest

In [None]:
# Random forest (1000 trees, depth 10) trained on the balanced set, scored on the untouched test set.
rf=RandomForestClassifier(n_estimators=1000,max_depth=10,random_state=17)
rf.fit(x_bal,y_bal)

y_pred=rf.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['Random Forest', 'max_depth=10, rand_state=17', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)

sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Ada Boost

In [None]:
# AdaBoost over depth-1 entropy trees, trained on the balanced set and scored on the untouched test set.
tree=DecisionTreeClassifier(criterion='entropy',random_state=1, max_depth=1)
#the weak learner is passed positionally: its keyword was renamed from
#`base_estimator` to `estimator` in newer scikit-learn releases, while the
#first positional slot is the weak learner in both
ada=AdaBoostClassifier(tree,n_estimators=1000,learning_rate=0.1, random_state=5)
ada.fit(x_bal,y_bal)

y_pred=ada.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#the recorded parameters now state 1000 estimators, matching n_estimators above
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['AdaBoost Tree', 'criterion=entropy, max_depth=1, rate=0.1, estimators=1000',
                       accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

Gradient Boosted Tree

In [None]:
# Gradient boosting with default settings, trained on the balanced set and scored on the untouched test set.
gradb=GradientBoostingClassifier(random_state=42)
gradb.fit(x_bal,y_bal)

y_pred=gradb.predict(x_ts)
accuracy=accuracy_score(y_ts,y_pred)
precision=precision_score(y_ts,y_pred)
recall=recall_score(y_ts,y_pred)
f1=f1_score(y_ts,y_pred)
#DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
new_row=pd.DataFrame([['GradientBoosted Tree', 'default, random_state=42', accuracy,precision,recall,f1]],columns=list(results.columns))
results=pd.concat([results,new_row],ignore_index=True)
print('accuracy:',accuracy)
sns.heatmap(confusion_matrix(y_ts, y_pred),annot=True,fmt='d');
plt.ylabel('true')
plt.xlabel('predicted');

The models may seem to have dropped in performance, but previously the dummy classifier's accuracy was 73% and the models exceeded that by two percent or so, while now the dummy accuracy is 50% and the models exceed it by up to 28.5%, and with fewer false negatives.

In [None]:
# Renumber the rows 0..n-1, discarding the old index, and display the table.
results=results.reset_index(drop=True)
results

In [None]:
# Horizontal bar charts of accuracy and F1-score per model, best first.
by_accuracy=results.sort_values(by='accuracy',ascending=False)
sns.catplot(data=by_accuracy, y='model', x='accuracy', kind='bar', color='grey')
plt.title('Model Accuracy')
print(' ')
by_f1=results.sort_values(by='F1-score',ascending=False)
sns.catplot(data=by_f1, y='model', x='F1-score', kind='bar', color='black')
plt.title('model F1-score' );

From the above barplots it is evident that Random Forest, Gradient Boosted Tree, and SVM give the best results, and should be used to predict customer churn.