In [None]:
#Some of the code in this notebook was inspired by notebooks from fellow Kaggle users- Gabriel Atkin and Audrey Guillot

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import f1_score, recall_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC


import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
# Load the Telco customer churn dataset
telco_csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
telco_data = pd.read_csv(telco_csv_path)

# Preview the first few rows
telco_data.head()

In [None]:
#Now let's look at all the columns we have, their data types and their non-null counts
telco_data.info()

customerID is only an identifier and cannot help predict churn, so we can get rid of it. We also notice that the TotalCharges column has the object dtype; since it holds numeric values like MonthlyCharges, let's convert it to float.

In [None]:
# Data preprocessing: remove the customerID column
telco_data = telco_data.drop(columns='customerID')

When we try to convert the TotalCharges field, we find that some values are blank, so we need to handle those first and then convert the column to float. Let's fill in these blanks with the mean value of the column.

In [None]:
# Blank strings mark missing totals: turn them into NaN so the column can be cast to float.
# `np.nan` and the builtin `float` are used because the aliases `np.NaN` and `np.float`
# no longer exist in current NumPy releases.
telco_data['TotalCharges'] = telco_data['TotalCharges'].replace(' ', np.nan).astype(float)
# Impute the missing totals with the column mean
telco_data['TotalCharges'] = telco_data['TotalCharges'].fillna(telco_data['TotalCharges'].mean())

Looking at the data above, we notice that most of the columns are categorical. Let's see which categories each of these columns contains.

In [None]:
def get_uniques(df, columns):
    """Return a dict mapping each name in `columns` to the list of distinct values of that column in `df`."""
    uniques = {}
    for name in columns:
        uniques[name] = list(df[name].unique())
    return uniques

def get_categorical_columns(df):
    """Return the names of the columns of `df` whose dtype is 'object', in column order."""
    column_dtypes = df.dtypes
    categorical = []
    for name in df.columns:
        if column_dtypes[name] == 'object':
            categorical.append(name)
    return categorical

categorical_columns = get_categorical_columns(telco_data)
get_uniques(telco_data, categorical_columns)

Most of the columns are binary with a simple Yes or No option, or, like 'gender', just Male or Female. We also see columns like 'MultipleLines', 'OnlineSecurity' and so on, whose extra category ('No phone service' or 'No internet service') can be folded into 'No'. Let's merge these values so that we get a cleaner dataset and a simpler analysis.

In [None]:
# Fold 'No phone service' into 'No' so MultipleLines becomes a Yes/No column
telco_data['MultipleLines'] = telco_data['MultipleLines'].replace('No phone service', 'No')

# These columns share the extra 'No internet service' category; fold it into 'No' as well
internet_addon_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                          'TechSupport', 'StreamingTV', 'StreamingMovies']
telco_data[internet_addon_columns] = telco_data[internet_addon_columns].replace('No internet service', 'No')

In [None]:
categorical_columns = get_categorical_columns(telco_data)
get_uniques(telco_data, categorical_columns)

Our columns are in much better shape now. Next, we have columns like 'InternetService' and 'Contract', which can be treated as ordinal features since their categories can be ranked.

In [None]:
# Category rankings for the ordinal features, listed from lowest to highest rank
internet_order = ['No', 'DSL', 'Fiber optic']
contract_order = ['Month-to-month', 'One year', 'Two year']

To summarize, we have all the columns grouped into these categories and this will help us in feature engineering:
* binary_features: Gender, Partner, Dependents, PhoneService, MultipleLines, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, PaperlessBilling
* ordinal_features: InternetService, Contract
* nominal_features: PaymentMethod

In [None]:
def binary_encode(df, column, positive_value):
    """Return a copy of `df` in which `column` holds 1 where it equals `positive_value` and 0 elsewhere."""
    def to_flag(value):
        return 1 if value == positive_value else 0

    encoded = df.copy()
    encoded[column] = encoded[column].apply(to_flag)
    return encoded

def ordinal_encode(df, column, ordering):
    """Return a copy of `df` with every value of `column` replaced by its position in `ordering`.

    A value that does not appear in `ordering` makes `ordering.index` raise ValueError.
    """
    encoded = df.copy()
    position_of = ordering.index
    encoded[column] = encoded[column].apply(position_of)
    return encoded
    
def onehot_encode(df, column):
    """Return a new frame with `column` replaced by one 0/1 indicator column per category.

    The indicator columns are named after the category values and are appended
    after the remaining columns of `df`; `df` itself is not modified.
    """
    # Request integer indicators explicitly: the default dtype of get_dummies is
    # bool in recent pandas releases, which would produce True/False columns
    # instead of the 0/1 values the other encoders emit.
    dummies = pd.get_dummies(df[column], dtype='int64')
    # drop() and concat() both return new objects, so no defensive copy is needed
    return pd.concat([df.drop(columns=column), dummies], axis=1)

In [None]:
# Binary columns whose positive label is 'Yes'
binary_features = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                   'StreamingTV', 'StreamingMovies', 'PaperlessBilling']

# gender is 1 for 'Male'; the binary features and the Churn target are 1 for 'Yes'
positive_labels = {'gender': 'Male', **dict.fromkeys(binary_features, 'Yes'), 'Churn': 'Yes'}
for feature, positive_value in positive_labels.items():
    telco_data = binary_encode(telco_data, feature, positive_value)

# Ordinal columns are replaced by the position of each value in its ranking
for feature, ordering in (('InternetService', internet_order), ('Contract', contract_order)):
    telco_data = ordinal_encode(telco_data, feature, ordering)

# PaymentMethod is nominal, so expand it into one indicator column per category
telco_data = onehot_encode(telco_data, 'PaymentMethod')

In [None]:
# Preview the encoded data; head() keeps the output short instead of rendering the whole frame
telco_data.head()

Now we see that all the columns hold numeric data: binary features are encoded as 1s and 0s, ordinal features as 0, 1 and 2, and the nominal PaymentMethod feature is expanded into one indicator column per category.

**Machine Learning Algorithms**

We will train the following models on this dataset and compare which one predicts churn most accurately:

* Logistic Regression
* Random Forest
* AdaBoost
* XGBoost
* Support Vector Machines

In [None]:
#Let's begin by separating the target (Churn) from the feature columns

X = telco_data.drop(columns='Churn')
y = telco_data['Churn']

In [None]:
#Using StandardScaler we can standardize our features to improve the performance of our algorithms
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set statistics influence the scaling; consider fitting it on the training split only.

scaler = StandardScaler()

X = scaler.fit_transform(X)

In [None]:
# Fix the seed so the split (and every score reported below) is reproducible across runs,
# and stratify on the target so train and test keep the same churn proportion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0, stratify=y)

In [None]:
#Logistic Regression

clf_log = LogisticRegression(random_state=0)
clf_log.fit(X_train, y_train)

#Print the test accuracy and the per-class precision/recall/F1 of our model
print("Score:", clf_log.score(X_test, y_test))
y_pred = clf_log.predict(X_test)  # single prediction pass, reused by the report
print(classification_report(y_test, y_pred))

#Cross-validated (4-fold) F1 learning curve on the training data for 10 training-set sizes
N, train_score, val_score = learning_curve(clf_log, X_train, y_train, cv=4, scoring='f1', train_sizes=np.linspace(0.1,1,10))

#Plot the mean training and validation score for each training-set size
plt.figure(figsize=(12,8))
plt.title('Logistic Regression')
plt.plot(N,train_score.mean(axis=1), label='training score')
plt.plot(N,val_score.mean(axis=1), label='validation score')
plt.xlabel('Number of training samples')
plt.ylabel('F1 score')
plt.legend()
plt.show()

In [None]:
#Random Forest

clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)

#Print the test accuracy and the per-class precision/recall/F1 of our model
print("Score:", clf_rf.score(X_test, y_test))
y_pred = clf_rf.predict(X_test)  # single prediction pass, reused by the report
print(classification_report(y_test, y_pred))

#Cross-validated (4-fold) F1 learning curve on the training data for 10 training-set sizes
N, train_score, val_score = learning_curve(clf_rf, X_train, y_train, cv=4, scoring='f1',train_sizes=np.linspace(0.1,1,10))

#Plot the mean training and validation score for each training-set size
plt.figure(figsize=(12,8))
plt.title('Random Forest')
plt.plot(N,train_score.mean(axis=1), label='training score')
plt.plot(N,val_score.mean(axis=1), label='validation score')
plt.xlabel('Number of training samples')
plt.ylabel('F1 score')
plt.legend()
plt.show()

In [None]:
#AdaBoost

clf_ab = AdaBoostClassifier(random_state=0)
clf_ab.fit(X_train, y_train)

#Print the test accuracy and the per-class precision/recall/F1 of our model
print("Score:", clf_ab.score(X_test, y_test))
y_pred = clf_ab.predict(X_test)  # single prediction pass, reused by the report
print(classification_report(y_test, y_pred))

#Cross-validated (4-fold) F1 learning curve on the training data for 10 training-set sizes
N, train_score, val_score = learning_curve(clf_ab, X_train, y_train, cv=4, scoring='f1',train_sizes=np.linspace(0.1,1,10))

#Plot the mean training and validation score for each training-set size
plt.figure(figsize=(12,8))
plt.title('AdaBoost')
plt.plot(N,train_score.mean(axis=1), label='training score')
plt.plot(N,val_score.mean(axis=1), label='validation score')
plt.xlabel('Number of training samples')
plt.ylabel('F1 score')
plt.legend()
plt.show()

In [None]:
#XGBoost

clf_xg = XGBClassifier(random_state=0)
clf_xg.fit(X_train, y_train)

#Print the test accuracy and the per-class precision/recall/F1 of our model
print("Score:", clf_xg.score(X_test, y_test))
y_pred = clf_xg.predict(X_test)  # single prediction pass, reused by the report
print(classification_report(y_test, y_pred))

#Cross-validated (4-fold) F1 learning curve on the training data for 10 training-set sizes
N, train_score, val_score = learning_curve(clf_xg, X_train, y_train, cv=4, scoring='f1',train_sizes=np.linspace(0.1,1,10))

#Plot the mean training and validation score for each training-set size
plt.figure(figsize=(12,8))
plt.title('XGBoost')
plt.plot(N,train_score.mean(axis=1), label='training score')
plt.plot(N,val_score.mean(axis=1), label='validation score')
plt.xlabel('Number of training samples')
plt.ylabel('F1 score')
plt.legend()
plt.show()

In [None]:
#Support Vector Machine SVM

clf_svc = SVC(random_state=0)
clf_svc.fit(X_train, y_train)

#Print the test accuracy and the per-class precision/recall/F1 of our model
print("Scores:", clf_svc.score(X_test, y_test))
y_pred = clf_svc.predict(X_test)  # single prediction pass, reused by the report
print(classification_report(y_test, y_pred))

#Cross-validated (4-fold) F1 learning curve on the training data for 10 training-set sizes
N, train_score, val_score = learning_curve(clf_svc, X_train, y_train, cv=4, scoring='f1',train_sizes=np.linspace(0.1,1,10))

#Plot the mean training and validation score for each training-set size
plt.figure(figsize=(12,8))
plt.title('SVM')
plt.plot(N,train_score.mean(axis=1), label='training score')
plt.plot(N,val_score.mean(axis=1), label='validation score')
plt.xlabel('Number of training samples')
plt.ylabel('F1 score')
plt.legend()
plt.show()

Looking at all the models, we notice that Random Forest is not a good fit for this data, while Logistic Regression, AdaBoost and SVM perform best. Let's dive further into the Logistic Regression model and tune its hyperparameters to see if we can improve the score.

In [None]:
#Create a parameter grid: both penalties over 20 log-spaced values of C with the liblinear solver
param_grid = {'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),'solver' : ['liblinear']}
#Seed the estimator like the other models in this notebook so the search is reproducible
lr = LogisticRegression(random_state=0)
grid_log = GridSearchCV(estimator=lr, param_grid=param_grid, cv=6, verbose=True, n_jobs=-1)

grid_log.fit(X_train, y_train)

#Print the best mean cross-validated score and the parameters that achieved it
print("Best Score:", grid_log.best_score_)
print('Best Params: ', grid_log.best_params_)


In [None]:
#Final evaluation of the tuned model on the test set
test_score = grid_log.score(X_test, y_test)
y_pred = grid_log.predict(X_test)
print("Score:", test_score)
print(classification_report(y_test, y_pred))

With hyperparameter tuning, we were able to achieve a slightly better accuracy.