In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Standard
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# Plots
import seaborn as sns
from plotly.offline import iplot
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot

pio.templates.default = "plotly_white"
init_notebook_mode()

# Preprocessing tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import zscore

#Modeling Tools
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Tuning Tools
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance

# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")

import shap

# Datapath and Setup
data_path = "/kaggle/input/telco-customer-churn/"
random_seed = 1

In [None]:
###########################
## DISABLE WARNINGS
###########################
import sys
import warnings

# Silence all warnings unless the interpreter was started with explicit -W options.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

Customer churn is one of the most important aspects of a business. Especially in a telecommunication setting where a customer produces monthly recurrent revenue, the loss of customers can have a significant impact on the long term financial standing of the company. This notebook aims to provide a holistic approach to the telecommunications company. On the one hand, through exploratory data analysis, it aims to provide valuable feedback regarding the reasons that customers churn, 
thus providing useful insight for the company to minimize those reasons. On the other hand, it provides an accurate model to predict potentially churning customers, thus giving the company one more opportunity to follow a proactive approach to keep the customers.

In [None]:
# Load the Telco churn CSV from the configured input directory and preview the first rows.
df = pd.read_csv(f"{data_path}WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

## Exploratory Data Analysis

In [None]:
# List every column together with the dtype pandas inferred for it
df.dtypes

TotalCharges is an object type instead of float64. This happened because there are some blank entries in this column, which caused pandas to infer the data type as object. To fix that, we have to replace the blank entries with missing values before changing the data type.

In [None]:
# Parse TotalCharges as numbers. errors='coerce' turns every entry that is not
# a valid number (such as the blank strings in this column) into NaN, so the
# conversion cannot fail on stray whitespace of any length.
# The final astype guarantees the float64 dtype the rest of the notebook expects.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').astype('float64')

In [None]:
# Count the missing (NaN) entries in each column
df.isna().sum()

Since we replaced the blank entries, TotalCharges now has 11 missing values. Because the missing values are such a small part of the dataset, they can be dropped without affecting the final results.

In [None]:
# Drop the rows whose TotalCharges is missing.
# Re-assigning (instead of inplace=True) keeps the statement chainable and
# makes it obvious that `df` refers to the cleaned frame from here on.
df = df.dropna(subset=['TotalCharges'])

In [None]:
# Build the pandas-profiling EDA report for the cleaned frame and save it as HTML.
profile = ProfileReport(
    df,
    title="Telco Customer Churn Profiling Report",
    explorative=True,
)
profile.to_file("Churn_EDA_report.html")

# Kaggle sometimes produces an error in the pandas profiling module, so the HTML report can be found in the output folder.

### Churn Distribution

In [None]:
# Pie chart of the share of customers per Churn value.
# The title is wrapped in a properly closed <b>...</b> pair so the bold markup is well-formed.
px.pie(df, "Churn", opacity=0.8, title="<b>Client Churn Distribution</b>")

There is some imbalance in the churn distribution: 26.5% of the clients have churned, and a small number of occurrences of a label could lead to a bad predictor.\
One way to work with this case is stratification, which will be used in the cross-validation step of the process by default.\
Another is choosing a metric that deals with imbalanced datasets, in this case the AUC score. AUC is sensitive to class imbalance in the sense that, when there is a minority class, it will have a strong impact on the AUC value. The churn problem is about client retention, so it is worth checking false positives; precision and recall metrics are therefore a must for this situation.

In [None]:

# Scatter of total charges against tenure, coloured by churn status,
# with one facet column per contract type.
axis_labels = {'x' : 'Customer Tenure', 'y' : 'Total Charges $'}
fig = px.scatter(
    x=df['tenure'],
    y=df['TotalCharges'],
    color=df['Churn'],
    facet_col=df['Contract'],
    opacity=0.5,
    template='presentation',
    labels=axis_labels,
    title='Customer Churn by Tenure, Charges, and Contract Type',
)

fig.show()
# Note: some of the plotly express visualizations are sometimes not displayed properly on Kaggle in browsers other than Chrome

Most churn can be seen in the contracts that are “Month-to-Month”. Makes sense, of course. 
Also, as tenure increases, so do the total charges; customers with high tenure and low charges appear less likely to churn than customers with high tenure and high charges.

In [None]:
# Split the customers by churn outcome
df_churn = df[df['Churn'] == "Yes"]
df_no_churn = df[df['Churn'] == "No"]

# Monthly-charge histograms with a rug marginal, coloured by contract type
hist_kwargs = dict(x="MonthlyCharges", color="Contract", marginal="rug")
fig1 = px.histogram(df_churn, title="Churned customers", **hist_kwargs)
fig2 = px.histogram(df_no_churn, title="Not Churned customers", **hist_kwargs)

fig1.show()
fig2.show()

In [None]:
# Total-charge histograms with a rug marginal, coloured by contract type:
# one figure for churned customers and one for retained customers.
fig1, fig2 = (
    px.histogram(subset, x="TotalCharges", color="Contract", marginal="rug", title=heading)
    for subset, heading in (
        (df_churn, "Churned customers"),
        (df_no_churn, "Not Churned customers"),
    )
)
fig1.show()
fig2.show()

Monthly Charges have a high concentration of churned customers at higher values. \
Total Charges have similar distributions, but the ‘No churn’ distribution has lower values. \
Maybe the amount charged could lead the client to leave the service.

### Correlations

In [None]:
# Correlation matrix of the numeric columns only, computed once and reused.
# Selecting the numeric columns explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently drops string columns.
corr = df.select_dtypes(include='number').corr()

# Boolean mask that is True on and above the diagonal, so the redundant upper
# triangle is hidden. The builtin `bool` is used because the `np.bool` alias
# was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Increase the size of the heatmap.
plt.figure(figsize=(40, 18))

# Store heatmap object in a variable to easily access it when you want to include more features (such as title).
# Set the range of values to be displayed on the colormap from -1 to 1, and set the annotation to True to display the correlation values on the heatmap.
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='RdBu')

# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Churn Correlation Heatmap', fontdict={'fontsize':12}, pad=12)


## Preprocessing

In order to properly fit the data to the models, they need to be uniform. \
So, using the MultiColumnLabelEncoder class, I convert the categorical features into numerical values with the scikit-learn `sklearn.preprocessing` class LabelEncoder (OneHotEncoder is imported as well, but it is not applied in the cells below). \
The churn and the customer ID columns are not copied into the features array, since the first is the label and the second does not provide any value. \
As for the numerical values, they are standardized, using the scikit-learn `sklearn.preprocessing` class StandardScaler in the MultiColumnLabelScaler class, to avoid having differences in scales of the measures that could affect the model. \
It is also important to note that no outliers were detected in the profiling report, so there was no preprocessing to replace outliers.


In [None]:
# Feature matrix: every column except the target ('Churn') and the identifier ('customerID')
X = df.drop(columns=['Churn', 'customerID'])
# Target vector: an independent copy of the 'Churn' column
y = df['Churn'].copy()

In [None]:
# Helper class that standardises the numerical columns of a DataFrame
class MultiColumnLabelScaler:
    """Standardise selected DataFrame columns with StandardScaler.

    Every column is scaled independently by a fresh StandardScaler that is
    fitted on the data handed to transform(); fit() itself learns nothing.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to scale; None means every column

    def fit(self,X,y=None):
        """No-op kept so the class can be chained like an estimator; returns self."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        StandardScaler(). If no columns specified, transforms all
        columns in X.

        X is not modified; a scaled copy is returned.
        '''
        output = X.copy()
        # Snapshot the column labels up front: this avoids DataFrame.iteritems()
        # (removed in pandas 2.0) and never mutates the frame while iterating it.
        targets = self.columns if self.columns is not None else list(output.columns)
        for col in targets:
            scaled = StandardScaler().fit_transform(output[col].values.reshape(-1, 1))
            # The scaler returns an (n, 1) array; flatten it to a plain 1-D column.
            output[col] = scaled.ravel()
        return output

    def fit_transform(self,X,y=None):
        """Convenience wrapper: fit(X, y) followed by transform(X)."""
        return self.fit(X,y).transform(X)

In [None]:
# Helper class that label-encodes the categorical columns of a DataFrame
class MultiColumnLabelEncoder:
    """Label-encode selected DataFrame columns with LabelEncoder.

    Every column is encoded independently by a fresh LabelEncoder that is
    fitted on the data handed to transform(); fit() itself learns nothing.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode; None means every column

    def fit(self,X,y=None):
        """No-op kept so the class can be chained like an estimator; returns self."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        X is not modified; an encoded copy is returned.
        '''
        output = X.copy()
        # Snapshot the column labels up front: this avoids DataFrame.iteritems()
        # (removed in pandas 2.0) and never mutates the frame while iterating it.
        targets = self.columns if self.columns is not None else list(output.columns)
        for col in targets:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        """Convenience wrapper: fit(X, y) followed by transform(X)."""
        return self.fit(X,y).transform(X)

In [None]:
# Label-encode every categorical feature column
categorical_columns = [
    "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "Contract",
    "PaperlessBilling", "PaymentMethod",
]
X = MultiColumnLabelEncoder(columns=categorical_columns).fit_transform(X)

# Encode the Churn target as integer class labels
y = LabelEncoder().fit_transform(y)

In [None]:
# Standardise the numerical feature columns
numeric_columns = ["TotalCharges", "MonthlyCharges", "tenure"]
X = MultiColumnLabelScaler(columns=numeric_columns).fit_transform(X)

In [None]:
# Preview the first rows of the encoded and scaled feature matrix
X.head()

One other preprocessing technique that was used in previous iterations of the notebook, but proved to harm both the overall model performance and the explainability of the model, was Principal Component Analysis with 17 components, since with fewer than 17 the explained variance dropped below 99%. The technique rendered the model unexplainable, since the features were projected into new ones, and it also lowered all the scores of the models, so it was omitted from the final iteration of the notebook.

### Modeling

In the testing phase multiple algorithms were tested, like Decision Trees, Random Forests, AdaBoost, Bagging, Extra Trees and Gradient Boosting. The best and second-best classifiers in my experiments were Gradient Boosting and XGBoost respectively. For this reason, I ran a Grid Search for tuning the hyperparameters on both XGBoost and Gradient Boosting Classifier. I also tried different approaches to stacking the classifiers; even though the stacked model outperformed most of the 
models, it was consistently less accurate than all of the ensemble base models.

In [None]:
#Initializing models
# Every estimator receives random_state=random_seed (defined in the setup cell)
# so that repeated runs of the notebook produce the same scores.
dt_classifier = DecisionTreeClassifier(max_depth=20, random_state=random_seed)
rf_classifier = RandomForestClassifier(n_estimators = 5, criterion = 'entropy', random_state=random_seed)
ada_classifier = AdaBoostClassifier(random_state=random_seed)
bag_classifier = BaggingClassifier(random_state=random_seed)
xtrees_classifier = ExtraTreesClassifier(random_state=random_seed)
grad_classifier = GradientBoostingClassifier(random_state=random_seed)
xgbclass = xgb.XGBClassifier(eval_metric='auc', random_state=random_seed)

# hyperparameters from gridsearch below
grad_enh = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 1, n_estimators = 500,
                                      random_state=random_seed)

# hyperparameters from gridsearch below
xgbenhanced = xgb.XGBClassifier(colsample_bytree= 0.7,gamma= 0.5, learning_rate= 0.1, max_depth= 3,
                                min_child_weight= 3, eval_metric='auc', random_state=random_seed)

#Setting up the stacking model
def get_stacking():
    """Build the stacking ensemble.

    Level 0 combines the Random Forest, Bagging and Gradient Boosting
    classifiers; the XGBoost classifier acts as the meta learner, and the
    ensemble is configured with cv=5.
    """
    base_learners = [
        ('Random Forest', rf_classifier),
        ('Bagging Classifier', bag_classifier),
        ('Gradient Boosting Classifier', grad_classifier),
    ]
    return StackingClassifier(estimators=base_learners, final_estimator=xgbclass, cv=5)

# Module-level stacking instance; check_algorithms() builds its own one through get_stacking().
stack = get_stacking()

# Registry of the candidate models that the cross-validation loop iterates over
def check_algorithms():
    """Return the (display name, estimator) pairs to compare, in plotting order.

    The stacking entry is built by calling get_stacking() on every call.
    """
    return [
        ('Decision Tree', dt_classifier),
        ('Random Forest', rf_classifier),
        ('AdaBoost Classifier', ada_classifier),
        ('Bagging Classifier', bag_classifier),
        ('Extra Trees Classifier', xtrees_classifier),
        ('Gradient Boosting Classifier', grad_classifier),
        ('Gradient Boosting Enhanced', grad_enh),
        ('XGB Classifier', xgbclass),
        ('XGB Enhanced', xgbenhanced),
        ('Stacking', get_stacking()),
    ]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#XGBoost Hyperparameter tuning

#xgbclass = xgb.XGBClassifier(tree_method='gpu_hist')
#xgboost_paramgrid = {'learning_rate': [0.05, 0.10, 0.25, 0.20], 
#                     'max_depth': [3, 4, 5, 6, 8], 
#                     'min_child_weight': [1, 3, 5, 6], 
#                     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5],
#                     'colsample_bytree' : [0.4, 0.5, 0.7]
#                     }
#xgb_grid = GridSearchCV(xgbclass, xgboost_paramgrid, scoring='roc_auc', cv=5, verbose =3, n_jobs = 4)
#xgb_grid.fit(X, y)

In [None]:
#get the best params for xgboost to set them on the model rotation
#xgb_grid.best_params_

In [None]:
#Gradient Booster Hyperparameter tuning

#gradenhanced = xgb.XGBClassifier(tree_method='gpu_hist')
#grad_paramgrid = {'learning_rate': [0.05, 0.10, 0.20, 0.30], 
#                  'max_depth': [1, 3, 4, 5, 6, 7, 8, 9, 10], 
#                  "n_estimators":[50,250,500]}
#grad_grid = GridSearchCV(gradenhanced, grad_paramgrid, scoring='roc_auc', cv=5, verbose =3, n_jobs = 4)
#grad_grid.fit(X, y)

In [None]:
#get the best params for Gradient Booster
#grad_grid.best_params_

In [None]:
# 10-fold cross-validated ROC AUC for every candidate model on the training split.
print('Results\n')
results = []
names = []
for name, model in check_algorithms():
    # Passing an integer as cv lets scikit-learn pick stratified folds for
    # classifiers, so every fold keeps the churn / no-churn ratio of the
    # imbalanced target (a plain KFold gives no such guarantee).
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    print('%s: %s %f %s (%f)' % (name,'AUC', cv_results.mean(),', Standart Deviation', cv_results.std()))
print('-'*64)

In [None]:
# Box plot of the per-fold AUC scores, one box per algorithm
fig_res = results
fig_names = ['DT', 'RF', 'ADA', 'BG', 'XTR', 'GB', 'GBE', 'XGB', 'XGBE', 'STCK']
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison AUC')
ax.boxplot(fig_res)
ax.set_xticklabels(fig_names)
plt.show()

In [None]:
# Train the tuned Gradient Boosting model and predict the hold-out labels in one chain
y_pred_gbe = grad_enh.fit(X_train, y_train).predict(X_test)

#y_pred = np.where(y_pred == 0, 'no', y_pred)
#y_pred = np.where(y_pred == 1, 'yes', y_pred)
#y_test = np.where(y_test == 0, 'no', y_test)
#y_test = np.where(y_test == 1, 'yes', y_test)

In [None]:
# Confusion matrix of the tuned Gradient Boosting predictions on the hold-out set
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_gbe)
cm_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d")

cm_ax.set_xlabel("Predicted Label", fontsize=12)
cm_ax.set_ylabel("True Label", fontsize=12)

plt.show()

In [None]:
# Classification report for the tuned Gradient Boosting predictions, restricted to class labels 0 and 1
print(metrics.classification_report(y_test, y_pred_gbe, labels = [0, 1]))

In [None]:
# Probability of the positive class (column 1) for each hold-out row
gbe_pred_proba = grad_enh.predict_proba(X_test)[:,1]

# Area under the ROC curve on the hold-out set
gbe_roc_auc = metrics.roc_auc_score(y_test, gbe_pred_proba)
print('ROC_AUC: ', gbe_roc_auc)

# Points of the ROC curve
gbe_fpr, gbe_tpr, thresholds = metrics.roc_curve(y_test, gbe_pred_proba)

_, roc_ax = plt.subplots()
roc_ax.plot(gbe_fpr, gbe_tpr, label='ROC_AUC = %0.3f' % gbe_roc_auc)
roc_ax.set_xlabel("False Positive Rate", fontsize=12)
roc_ax.set_ylabel("True Positive Rate", fontsize=12)
roc_ax.legend(loc="lower right")

plt.show()

In [None]:
# Train the tuned XGBoost model and predict the hold-out labels in one chain
y_pred_xgbe = xgbenhanced.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix of the tuned XGBoost predictions on the hold-out set
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_xgbe)
cm_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d")

cm_ax.set_xlabel("Predicted Label", fontsize=12)
cm_ax.set_ylabel("True Label", fontsize=12)
cm_ax.set_title("Confusion Matrix for Enchanced XGB")
plt.show()

In [None]:
# Classification report for the tuned XGBoost predictions, restricted to class labels 0 and 1
print(metrics.classification_report(y_test, y_pred_xgbe, labels = [0, 1]))

In [None]:
# Probability of the positive class (column 1) for each hold-out row
xgbe_pred_proba = xgbenhanced.predict_proba(X_test)[:,1]

# Area under the ROC curve on the hold-out set
xgbe_roc_auc = metrics.roc_auc_score(y_test, xgbe_pred_proba)
print('ROC_AUC: ', xgbe_roc_auc)

# Points of the ROC curve
xgbe_fpr, xgbe_tpr, thresholds = metrics.roc_curve(y_test, xgbe_pred_proba)

_, roc_ax = plt.subplots()
roc_ax.plot(xgbe_fpr, xgbe_tpr, label='ROC_AUC = %0.3f' % xgbe_roc_auc)
roc_ax.set_xlabel("False Positive Rate", fontsize=12)
roc_ax.set_ylabel("True Positive Rate", fontsize=12)
roc_ax.legend(loc="lower right")

plt.show()

## Explainability

In [None]:
# Build a SHAP TreeExplainer for the tuned Gradient Boosting model
explainer = shap.TreeExplainer(grad_enh)
# Compute the SHAP values for the hold-out feature rows
shap_values = explainer.shap_values(X_test)

In [None]:
# Bar-type SHAP summary plot of the values computed for X_test
shap.summary_plot(shap_values, X_test, plot_type="bar")

Finally, we can use the SHAP explainer library to extract the impact that each feature had on the model output. \
This insight, in my opinion, is as valuable as the predictions themselves, since it can provide recommendations to the telecom carriers for future offers to avoid churning. \
For example, since contract, tenure and monthly charges are the biggest drivers of the churning behaviour, they should be prioritized and re-examined in the current offers, e.g. by offering lower monthly charges to Month-to-Month subscribers so that they stay longer; the lower individual payments will be subsidized by the additional months before churning. 