# ***Telco customer churn predictions*** 
recommended music for exploring this notebook: 
* https://www.youtube.com/watch?v=t3217H8JppI&ab_channel=AnAmericanComposer

Was used while creating.

Cultural reference: 
* https://www.youtube.com/watch?v=z3Sj1mXrAoQ&ab_channel=MooCli

Was used while procrastinating.

* Some parts (EDA, hyperparameter tuning) are now commented out to speed up the execution of the notebook. To re-enable them, select the lines and press 'Ctrl' + '/'.

# To do:

* Explore evolutionary hyperparameter search, e.g. from https://github.com/rsteca/sklearn-deap, or other tools that are referenced in the scikit-learn guide https://scikit-learn.org/0.23//_downloads/scikit-learn-docs.pdf

****

# Import libraries, for starters

In [None]:

import numpy as np
import pandas as pd
from scipy import stats
import math

import seaborn as sns
import matplotlib.pyplot as plt


# Import data and explore basic properties

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory into the working frame `df`.
# NOTE(review): the path is absolute and Kaggle-specific; running elsewhere needs a different location.
df=pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# nice resume table to describe the data
def resumetable(df):
    """Build a one-row-per-column overview of ``df``.

    Prints the frame's shape and returns a DataFrame that holds, for every
    column of ``df``: its name, dtype, number of missing values, number of
    distinct values, the values found in the first five rows, and the base-10
    entropy of the column's value distribution (rounded to 4 decimals).

    The sample rows are picked by position, so the frame does not need a
    0..n-1 index (for instance after rows were dropped). If the frame has
    fewer than five rows, the remaining "... Value" columns are filled with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Columns: Name, dtypes, Missing, Uniques, First..Fifth Value, Entropy.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc) instead of label access (loc): labels 0..4 are
    # not guaranteed to exist once rows have been filtered out.
    n_rows = len(df)
    for pos, ordinal in enumerate(['First', 'Second', 'Third', 'Fourth', 'Fifth']):
        summary[f'{ordinal} Value'] = df.iloc[pos].values if pos < n_rows else np.nan

    # value_counts(normalize=True) yields the empirical distribution of each column.
    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=10), 4)
        for name in summary['Name']
    ]

    return summary

In [None]:
# Convert TotalCharges to a numeric dtype. With errors='coerce', any value that
# cannot be parsed as a number becomes NaN instead of raising; those rows are
# removed by the dropna() step further down.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# TotalCharge vs Tenure x MonthlyCharges - Discounts effect
* TotalCharges should equal MonthlyCharges x tenure. If it does not, that is a sign of a discount or a price increase that the customer received.
* That might be a big factor for churning; let's look into it further.

In [None]:
# Gap between the bill implied by tenure * MonthlyCharges and the recorded TotalCharges.
# A positive gap means the recorded total is lower than the implied one, a negative gap
# means it is higher.
expected_total = df['tenure'] * df['MonthlyCharges']
df['TotalCharge_diff'] = expected_total - df['TotalCharges']
df['TotalCharge_diff_abs'] = df['TotalCharge_diff'].abs()
# Both the signed and the absolute gap are kept as candidate features; the author
# expects only the signed one to be useful.

In [None]:
# Overlaid histograms of TotalCharge_diff for retained ('No') and churned ('Yes') customers.
# The title now says "Histogram": sns.histplot is what is drawn here, not a KDE.
plt.figure(figsize=(14, 4))
plt.title("Histogram of {}".format('TotalCharge_diff'))
for churn_label, colour in (('No', "#22ff57"), ('Yes', "#FF5722")):
    sns.histplot(
        df.loc[df['Churn'] == churn_label, 'TotalCharge_diff'].dropna(),
        color=colour,
        label='Churn: ' + churn_label,
    )
plt.legend(prop={'size': 12});


In [None]:
def kde_plot(feature):
    """Draw overlaid kernel-density curves of ``feature`` for both churn groups.

    Reads the notebook-level frame ``df`` (the original body referenced an
    undefined name ``data``, which raised NameError on every call).

    The groups are selected with the string labels 'No' / 'Yes', so this must
    be called while ``df['Churn']`` still holds those labels, i.e. before the
    column is recoded to 0 / 1.

    Parameters
    ----------
    feature : str
        Name of a numeric column of ``df``.
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    for churn_label, colour in (('No', "#22ff57"), ('Yes', "#FF5722")):
        sns.kdeplot(
            df.loc[df['Churn'] == churn_label, feature].dropna(),
            color=colour,
            label='Churn: ' + churn_label,
        )
    plt.legend(prop={'size': 12})

In [None]:
#kde_plot('TotalCharge_diff')
#kde_plot('TotalCharge_diff_abs')

In [None]:
# borrowed fcn for plotting nice barplots
def barplot_percentages(feature, orient='v', axis_name="percentage of customers"):
    """Bar plot of the share of all customers per (``feature`` value, Churn) pair.

    Reads the notebook-level frame ``df``. Each bar is the number of customers
    in that (feature, Churn) combination divided by ``len(df)``, so all bars
    of the plot add up to 100 %.

    Parameters
    ----------
    feature : str
        Categorical column of ``df`` to group by.
    orient : {'v', 'h'}
        'v' puts the categories on the x axis, anything else on the y axis.
    axis_name : str
        Label (and internal column name) of the percentage axis.
    """
    # Local import: only this helper needs the formatter.
    from matplotlib.ticker import PercentFormatter

    # Renaming the counts *Series* (rather than a column called "Churn") works
    # no matter how the installed pandas names the value_counts() result.
    g = (
        df.groupby(feature)["Churn"]
        .value_counts()
        .rename(axis_name)
        .reset_index()
    )
    g[axis_name] = g[axis_name] / len(df)

    # A formatter keeps tick labels in sync with the ticks; overwriting the
    # labels with set_*ticklabels freezes them at whatever ticks exist now.
    as_percent = PercentFormatter(xmax=1, decimals=0)
    if orient == 'v':
        ax = sns.barplot(x=feature, y=axis_name, hue='Churn', data=g, orient=orient)
        ax.yaxis.set_major_formatter(as_percent)
    else:
        ax = sns.barplot(x=axis_name, y=feature, hue='Churn', data=g, orient=orient)
        ax.xaxis.set_major_formatter(as_percent)

In [None]:
# borrowed fcn for plotting pie plots with percentages of each category based on rule
def plot_var_percentages (df, var_list):
    """Draw one pie chart per column in ``var_list`` showing its category shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    var_list : iterable of str
        Column names; each gets its own figure, titled with the column name.
    """
    for var in var_list:
        # value_counts() is evaluated once and supplies both labels and sizes.
        category_counts = df[var].value_counts()

        # A fresh figure per variable, so an already existing figure is never drawn over.
        plt.figure()
        plt.pie(
            list(category_counts),
            labels=list(category_counts.index),
            autopct='%1.1f%%',
            shadow=True,
            startangle=90,
        )
        plt.title(var)
    # The original had `plt.show` without parentheses, which never rendered anything.
    plt.show()

# Preprocessing: Data preps, feature adding based on EDA

In [None]:
# Target as 0/1 integers; SeniorCitizen as No/Yes labels so that it is handled
# like the other categorical columns.
df['Churn'] = df['Churn'].replace({"No": 0, "Yes": 1})
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: "No", 1: "Yes"})

In [None]:
# Tenure does not behave linearly, so two indicator flags mark its extremes:
# short (< 18) and long (> 54) tenures.
df['tenure_short'] = (df['tenure'] < 18).astype(int)
df['tenure_long'] = (df['tenure'] > 54).astype(int)

# Sign of the charge gap as two indicator flags (a missing gap yields 0 in both).
df['TCh_diff_positive'] = (df['TotalCharge_diff'] > 0).astype(int)
df['TCh_diff_negative'] = (df['TotalCharge_diff'] < 0).astype(int)

In [None]:
# Remove every row that contains a NaN (these come from the coerced TotalCharges),
# then discard customerID, which is an identifier and not a predictor.
df = df.dropna().drop(columns=['customerID'])

In [None]:
# Columns that must stay as they are (numeric features and the target).
# 'churn_rate' is not created in the cells visible here; listing it is harmless.
non_dummy_cols = ['tenure','MonthlyCharges','TotalCharges','Churn','churn_rate','TotalCharge_diff','TotalCharge_diff_abs']

# Everything else is one-hot encoded. The list is built in the frame's own
# column order: going through set() made the order of the encoded columns
# depend on string hashing, i.e. vary from one kernel start to the next.
dummy_cols = [col for col in df.columns if col not in non_dummy_cols]

# drop_first=True removes one level per variable to avoid perfectly collinear indicators.
df = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

In [None]:
# Show what we get here. Again.
# resumetable(df)

# Retype to ints and bools

In [None]:
# All indicator columns (everything except the numeric features and the target) become bool.
non_int_cols = ['tenure','MonthlyCharges','TotalCharges','Churn','TotalCharge_diff','TotalCharge_diff_abs']
int_cols = [col for col in df.columns if col not in non_int_cols]
df[int_cols] = df[int_cols].astype(bool)

# Despite the variable name, these columns are cast to 64-bit integers,
# which discards the fractional part of the charges.
float_cols = ['MonthlyCharges','TotalCharges','TotalCharge_diff','TotalCharge_diff_abs']
df[float_cols] = df[float_cols].astype(np.int64)


Looking good now. 

# Features correlation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

from sklearn.linear_model import LogisticRegression

In [None]:
# Drop the "No internet service" / "No phone service" indicators and the charge-gap
# columns. According to the correlation matrix they add no information beyond the
# columns that are kept. (This list was compiled after inspecting the correlation
# matrix; it is applied here to keep the notebook linear.)
# Further candidates such as 'PhoneService_No' and the '<service>_No' indicators
# are left in for now.
redundant_cols = [
    'OnlineBackup_No internet service',
    'TechSupport_No internet service',
    'StreamingTV_No internet service',
    'DeviceProtection_No internet service',
    'OnlineSecurity_No internet service',
    'StreamingMovies_No internet service',
    'MultipleLines_No phone service',
    'TotalCharge_diff_abs',
    'TotalCharge_diff',
]
df = df.drop(columns=redundant_cols)

In [None]:
# Pairwise correlation of all predictors (target excluded), shown as an annotated heatmap.
# This cell is slow to render; comment it out for faster execution.
corrMatrix = df.drop(columns=['Churn']).corr()
fig, ax = plt.subplots(figsize=(30, 25))
sns.heatmap(corrMatrix, annot=True, annot_kws={'size': 12}, cmap="GnBu", ax=ax)
plt.show()

* The correlation matrix is very dense, but it nevertheless shows which features can be dropped at the moment.

In [None]:
# df.drop(['OnlineBackup_No internet service',
#          'TechSupport_No internet service',
#          'StreamingTV_No internet service',
#          'DeviceProtection_No internet service',
#          'OnlineBackup_No internet service',
#          'OnlineSecurity_No internet service', 
#          'StreamingMovies_No internet service', 
#          'MultipleLines_No phone service',
#          'PhoneService_No', 
#          'TotalCharge_diff_abs',
#          'TotalCharge_diff',],axis=1,inplace=True)

In [None]:
# Correlation of every column with "Churn", sorted from most positive to most negative.
plt.figure(figsize=(15, 8))
churn_corr = df.corr()['Churn'].sort_values(ascending=False)
churn_corr.plot(kind='bar')


# Apply scaling and split dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

In [None]:
# Predictors (X) are all columns except the target; the target (y) is the Churn column.
features0 = df.drop(columns=['Churn'])
target0 = df['Churn']

In [None]:
# Min-max scaling to the range (0, 1) is used instead of standardisation, to keep the
# shape of the feature distributions. Author's notes: StandardScaler gave worse results
# because the data is not Gaussian; RobustScaler performed about the same as MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that follows,
# so the per-feature min/max are computed with the future test rows included.
scaler0=MinMaxScaler()

f_scale0 = scaler0.fit_transform(features0)

In [None]:
# Split first, then fit the scaler on the training rows only, so that no statistic
# of the held-out rows (here: the per-feature min/max) leaks into training.
# train_test_split draws its permutation from the number of rows and random_state,
# so the train/test membership is the same as when splitting pre-scaled data.
X_train_raw, X_test_raw, y_train0, y_test0 = train_test_split(
    features0, target0, test_size=0.2, random_state=42)

X_train0 = scaler0.fit_transform(X_train_raw)  # learn min/max from training rows
X_test0 = scaler0.transform(X_test_raw)        # apply them unchanged to the test rows

# Recursive Feature Elimination - Random Forest

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Recursive feature elimination with cross-validation: one feature is removed per
# step (step=1) and each subset is scored by accuracy, i.e. the fraction of
# correct classifications, over 5 stratified folds.
# The forest is seeded so that the selected feature subset is the same on every run.
clf_rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(5) #5-fold stratified cross-validation
rfecv = RFECV(estimator=clf_rf, step=1, cv=cv, scoring='accuracy')
rfecv = rfecv.fit(X_train0, y_train0)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', features0.columns[rfecv.support_])

In [None]:
# Plot number of features VS. cross-validation scores
# `grid_scores_` no longer exists in recent scikit-learn releases; the mean
# cross-validation score per feature count is read from `cv_results_` when it is
# available, and from the old attribute otherwise.
if hasattr(rfecv, 'cv_results_'):
    rfecv_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    rfecv_cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
# The x axis starts at 1 feature because RFECV above was run with step=1.
plt.plot(range(1, len(rfecv_cv_scores) + 1), rfecv_cv_scores)
plt.show()

In [None]:
# Keep only the columns selected by RFECV, in both the training and the test matrix.
X_train1, X_test1 = rfecv.transform(X_train0), rfecv.transform(X_test0)

# Feature importance using Random Forest

In [None]:
# Fit a forest on all (scaled) training features and read its impurity-based importances.
# Seeded so that the ranking is reproducible.
clf_rfc = RandomForestClassifier(random_state=42)
clr_rfc = clf_rfc.fit(X_train0, y_train0)  # fit() returns the estimator itself; alias kept
importances = clr_rfc.feature_importances_

# Spread of each feature's importance across the individual trees (used as error bars).
# Named `importances_std` rather than `std`, because the notebook later does
# `from numpy import std`; sharing the name makes the cells depend on run order.
importances_std = np.std(
    [tree.feature_importances_ for tree in clf_rfc.estimators_], axis=0)

# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
n_features = X_train0.shape[1]
plt.figure(1, figsize=(14, 13))
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="g", yerr=importances_std[indices], align="center")
plt.xticks(range(n_features), features0.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

# Testing different tree-based methods w/o tuned hyperparameters

In [None]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

In [None]:
# Single-step pipeline; the 'clf' step is a placeholder that gets swapped for each
# candidate model in the comparison loop.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier())
])

# Candidate models, all with default (untuned) hyperparameters.
# LGBMClassifier is left out, waiting for a fix of an issue in the new build.
classifiers = [
    DecisionTreeClassifier(),
    BaggingClassifier(KNeighborsClassifier()),
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    CatBoostClassifier(verbose=0),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
]


In [None]:
# Cross-validate every candidate on the RFECV-selected training features and print
# the mean of each quantity returned by cross_validate.
for classifier in classifiers:
    pipeline.set_params(clf=classifier)
    scores = cross_validate(pipeline, X_train1, y_train0)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for metric, fold_values in scores.items():
        print(metric, ' mean ', fold_values.mean())



# Using different weak learner for AdaBoost

In [None]:
# # import Support Vector Classifier
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score # import scikit-learn metrics module for accuracy calculation
# svc = SVC(probability=True, kernel='sigmoid')
# abc = AdaBoostClassifier(base_estimator=svc) # create adaboost classifer object
# model2 = abc.fit(X_train0, y_train0) # train adaboost classifer
# y_pred = model2.predict(X_test0) # predict the response for test dataset
# print("Model Accuracy with SVC Base Estimator:",accuracy_score(y_test0, y_pred)) # calculate and print model accuracy

In [None]:
# Logistic regression did well as a stand-alone model in the author's other notebook,
# so it is tried here as the weak learner inside AdaBoost.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg0 = LogisticRegression(C=10, penalty='l2', solver='lbfgs', max_iter=500)
adaBoost = AdaBoostClassifier(base_estimator=logreg0)  # boosted ensemble of LR models
ada_model = adaBoost.fit(X_train1, y_train0)           # train on the selected features
y_pred = ada_model.predict(X_test1)                    # predict the held-out rows
lr_boost_accuracy = accuracy_score(y_test0, y_pred)
print("Model Accuracy on test set with LR Base Estimator:", lr_boost_accuracy)

# Sequential search for best parameters of AdaBoost

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Validation curve for AdaBoost: accuracy as a function of n_estimators.
n_estimators_param_range = [15, 25, 50, 75, 100, 150, 200]

adaBoost = AdaBoostClassifier()

# One row of scores per parameter value, one column per cross-validation fold.
train_scores, test_scores = validation_curve(
    estimator=adaBoost,
    X=X_train1,
    y=y_train0,
    param_name='n_estimators',
    param_range=n_estimators_param_range,
)

# Average over the folds.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(15, 10))
for fold_mean, colour, mark, legend_text in (
        (train_mean, 'blue', 'o', 'training accuracy'),
        (test_mean, 'green', 'x', 'test accuracy')):
    plt.plot(n_estimators_param_range, fold_mean,
             color=colour, marker=mark, markersize=5, label=legend_text)

plt.xscale('log')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.79, 0.82])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Validation curve for AdaBoost: accuracy as a function of learning_rate,
# with n_estimators held at 50.
learning_rate_param_range = [0.1, 0.3, 0.4, 0.5, 0.65, 0.8, 1, 1.2, 1.3, 1.35, 1.375, 1.4, 1.45]

adaBoost = AdaBoostClassifier(n_estimators=50)

# One row of scores per parameter value, one column per cross-validation fold.
train_scores, test_scores = validation_curve(
    estimator=adaBoost,
    X=X_train1,
    y=y_train0,
    param_name='learning_rate',
    param_range=learning_rate_param_range,
)

# Average over the folds.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(15, 10))
for fold_mean, colour, mark, legend_text in (
        (train_mean, 'blue', 'o', 'training accuracy'),
        (test_mean, 'green', 'x', 'test accuracy')):
    plt.plot(learning_rate_param_range, fold_mean,
             color=colour, marker=mark, markersize=5, label=legend_text)

plt.xscale('log')
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.795, 0.815])

# Cross-validated model with AdaBoost

In [None]:
from numpy import mean
from numpy import std

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# 10-fold stratified cross-validation repeated 3 times (30 accuracy values in total).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# AdaBoost with the values picked from the two validation curves.
model = AdaBoostClassifier(n_estimators=50, learning_rate=1.35)

# error_score='raise' makes a failing fit stop the run instead of being scored as NaN.
n_scores = cross_val_score(
    model, X_train1, y_train0,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

# Mean accuracy, with its standard deviation in parentheses.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Well... that did not help much :-(

# Random search for best hyperparameters for 2nd model - GBC

In [None]:
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Sampling distributions for the random search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) covers [loc, loc + scale].
params_gbc = {
    'max_depth':         randint(2,50),
    # A tree needs at least 2 leaves; the previous lower bound of 1 produced
    # candidates whose fit failed and only wasted search iterations.
    'max_leaf_nodes':    randint(2,200),
    'min_samples_leaf':  randint(1,200),
    'n_estimators':      randint(2,200),
    # NOTE(review): assumes X_train1 has at least 24 columns after RFECV - confirm.
    'max_features':      randint(5,25),
    'learning_rate':     uniform(0.01,0.95),  # i.e. values in [0.01, 0.96]
    'n_iter_no_change':  randint(5,6),        # always 5
}

gbc_model = GradientBoostingClassifier()

# 500 sampled candidates x 3 folds = 1500 fits; this is the slowest cell of the notebook.
search = RandomizedSearchCV(gbc_model,
                            param_distributions=params_gbc,
                            random_state=42,
                            n_iter=500,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            return_train_score=True)

search.fit(X_train1, y_train0)

In [None]:
def report_best_scores(results, n_top=3):
    """Print the best-ranked candidates of a hyperparameter search.

    Parameters
    ----------
    results : mapping
        Search results with the entries 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (as in a search object's ``cv_results_``).
    n_top : int
        Ranks 1..n_top are reported; candidates that share a rank are all printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank.
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][idx],
                results['std_test_score'][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
# Print only the rank-1 candidate(s) of the random search over the GBC hyperparameters.
report_best_scores(search.cv_results_, 1)

# Credits
* https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/ 
* https://towardsdatascience.com/the-dummys-guide-to-creating-dummy-variables-f21faddb1d40 
* https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/ 
* https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/
* https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
* https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression
* ... and various other Kaggle kernels

* https://www.learndatasci.com/tutorials/intro-feature-engineering-machine-learning-python/
* https://towardsdatascience.com/how-to-avoid-multicollinearity-in-categorical-data-46eb39d9cd0d
* https://machinelearningmastery.com/rfe-feature-selection-in-python/
