In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from tabulate import tabulate
import scipy.stats as stats
import sklearn.model_selection as mod
import sklearn.metrics as mets
import scikitplot as skplt
import sklearn.preprocessing as prep
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn import manifold as man
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from  xgboost import XGBClassifier,plot_importance

pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
import warnings
warnings.filterwarnings('ignore')

## Data

In [None]:
# Load the raw Telco churn CSV into the module-level frame used by every later cell.
data = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Quick sanity check of the size, then preview the first rows.
print(data.shape)
data.head()

In [None]:
data.describe(include = 'all')

In [None]:
data.info()

There are 21 columns in the dataset. `Churn` is our target column, with values 'Yes' or 'No'. The remaining columns are 17 categorical and 3 numeric features. The `TotalCharges` column is stored with the wrong data type and will be converted to numeric.

### MISSING VALUES

In [None]:
# Convert TotalCharges to a numeric dtype; values that cannot be parsed become NaN (errors='coerce').
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
# Number of rows that ended up as NaN after the conversion.
data.TotalCharges.isnull().sum()

There are 11 null values in the **`TotalCharges`** column; let's see if we can find a pattern.

In [None]:
data[np.isnan(data['TotalCharges'])]

The **`Tenure`** and **`Dependents`** columns seem to have the same values for the rows with null TotalCharges. Let's check them.

In [None]:
data[data['Dependents'] == 'Yes'][['tenure','TotalCharges']]

In [None]:
data[data['tenure'] == 0][['tenure','TotalCharges']]

**`TotalCharges`** is null only when **`Tenure`** is 0. **`TotalCharges`** is almost equal to **`Tenure`** multiplied by **`MonthlyCharges`**. Since these 11 customers have not churned, they have probably only just signed up and have not paid any bill yet. Therefore, I will fill their **`TotalCharges`** with 0.

In [None]:
# The missing values found above are the NaNs produced when TotalCharges was
# coerced to numeric, so fill that column explicitly. A blanket
# data.fillna(0) would silently zero-fill any other column as well.
data['TotalCharges'] = data['TotalCharges'].fillna(0)
print(data.shape)

## Target : **`Churn`** : Whether the customer churned or not (Yes or No)

In [None]:
# Class balance of the target: absolute counts, overall churn share, and a bar chart.
churn_counts = data.groupby('Churn')['Churn'].count()
churned_customers = data.loc[data.Churn == 'Yes', 'customerID'].count()
all_customers = data['customerID'].count()

print(churn_counts)
print('')
print('Target Rate %', (churned_customers / all_customers).round(2))
print('')
data['Churn'].value_counts().plot(kind='bar');

### Churn rate is 27%.

## FEATURES

In [None]:
## Check whether categoric features has effect on Churn statistically
def CatColumnRelation(col):
    """Measure the association between a categorical column and ``Churn``.

    Builds the ``col`` x ``Churn`` contingency table from the module-level
    ``data`` frame, runs a chi-square test of independence on it and derives
    Cramer's V from the chi-square statistic.

    Parameters
    ----------
    col : str
        Name of the column in ``data`` to test against ``Churn``.

    Returns
    -------
    tuple
        ``(X2, p, V)``: the chi-square statistic, its p-value and Cramer's V.
    """
    # crosstab counts the rows of every (col, Churn) pair directly, so no helper
    # column has to be added to -- and later removed from -- the shared frame.
    # The frame therefore stays untouched even if the test below raises.
    table = pd.crosstab(data[col], data['Churn'])

    X2, p, dof, expected = stats.chi2_contingency(table)
    n = table.to_numpy().sum()
    minDim = min(table.shape) - 1

    # Cramer's V = sqrt((chi2 / n) / min(rows - 1, cols - 1))
    V = np.sqrt((X2 / n) / minDim)
    return X2, p, V

In [None]:
def CatCounts(col):
    """Tabulate churn counts and shares for every level of a categorical column.

    Reads the module-level ``data`` frame without modifying it.

    Parameters
    ----------
    col : str
        Name of the categorical column in ``data`` to break down.

    Returns
    -------
    pandas.DataFrame
        One row per observed ``(col, Churn)`` combination with the columns
        ``[col, 'Churn', 'Count', '% of Group', '% of Total']``.
        ``% of Group`` is the share within the level of ``col``; ``% of Total``
        is the share of all rows. Both are percentages rounded to 2 decimals.
    """
    total = len(data)

    # Rows per (level, Churn) pair; size() needs no helper column on ``data``,
    # so nothing is left behind in the shared frame.
    temp = data.groupby([col, 'Churn']).size().reset_index(name='Count')
    temp['% of Total'] = ((temp['Count'] / total) * 100).round(2)

    # Rows per level of ``col``, aligned to ``temp`` for the within-group share.
    group_total = temp[col].map(data.groupby(col).size())
    temp['% of Group'] = (temp['Count'] / group_total * 100).round(2)

    return temp[[col, 'Churn', 'Count', '% of Group', '% of Total']]

In [None]:
## For Categoric Columns
def catColumnInspector(col):
    """Print the churn statistics of a categorical column and plot its counts.

    Prints the chi-square test results from ``CatColumnRelation``, the
    count/share table from ``CatCounts`` and draws a bar plot of the number of
    rows per ``(col, Churn)`` combination of the module-level ``data`` frame.

    Parameters
    ----------
    col : str
        Name of the categorical column in ``data`` to inspect.
    """
    # Helper column counted by the grouped bar plot below.
    data['total_count'] = 1
    try:
        X2,p,V = CatColumnRelation(col)

        print('\033[1m\t\t' + col + '\033[0m')
        print('-'*60)
        print('Statistical Test Results')
        print('p-value: ', p.round(5), ' Chi-Square: ', X2.round(2), " Crammer's V: ",V.round(5) )
        print('-'*60)
        print(tabulate(CatCounts(col),headers=[col,'Churn','Count','% of Group','% of Total']))
        print('-'*60)
        print('')
        plt.figure(figsize=(10,5))
        sns.barplot(data = pd.DataFrame(data.groupby([col,'Churn'])['total_count'].count()).reset_index()
                , x = col,y='total_count', hue = 'Churn')
        plt.show()
    finally:
        # Always remove the helper column, even when the test or the plot
        # raises; otherwise it would stay in the shared frame and end up in
        # the correlation matrix and the model features of later cells.
        data.drop(columns='total_count', inplace=True, errors='ignore')

In [None]:
## For Numeric Columns
def numColumnInspector(col):
    """Print the churn statistics of a numeric column and draw its box plot.

    Prints the point-biserial correlation between the churn indicator and
    ``col``, the per-``Churn`` descriptive statistics of ``col``, and draws a
    box plot of ``col`` split by ``Churn``. Reads the module-level ``data``
    frame without modifying it.

    Parameters
    ----------
    col : str
        Name of the numeric column in ``data`` to inspect.
    """
    # 1/0 churn indicator kept as a local array: no temporary columns are
    # written to the shared frame, so nothing can leak if a step below raises.
    churn_flag = np.where(data.Churn=='Yes',1,0)
    ph_cor , p = stats.pointbiserialr(churn_flag, data[col])
    print('\033[1m  \t\t\t' + col + '\033[0m')
    print('-'*75)
    print('Point Biseral Correlation Results ')
    print('Correlation: ', ph_cor.round(3), ' and  p-value: ', p)
    print('-'*75)
    print(tabulate(data.groupby('Churn')[col].describe(),headers=['Churn','count','mean','std','min','25%','50%','75%','max']))
    print('-'*75)
    print('')
    # showmeans adds the group mean marker on top of the median line.
    sns.boxplot(x=data.Churn,y=data[col], showmeans=True)
    plt.show()

### **`Gender`** : Whether the customer is a male or a female

In [None]:
catColumnInspector('gender')

Gender does not seem to make a difference for churn.

### **`SeniorCitizen`** : Indicates if the customer is 65 or older: Yes, No (1, 0)

In [None]:
catColumnInspector('SeniorCitizen')

Seniors seem to churn more than non-seniors.

### **`Partner`** : Whether the customer has a partner or not (Yes, No)

In [None]:
catColumnInspector('Partner')

Customers without a partner seem to churn more than customers with a partner.

### **`Dependents`** :Indicates if the customer lives with any dependents:Yes,No.Dependents could be children,parents,grandparents etc.

In [None]:
catColumnInspector('Dependents') 

Customers who do not have dependents churn more than those who do.

### **`Tenure`** : Number of months the customer has stayed with the company

In [None]:
numColumnInspector('tenure')

Customers who stay seem to have about twice the tenure of customers who churn. After a tenure of 30-40 months, only a few customers seem to churn.

In [None]:
# Churn among customers whose tenure exceeds 40 months.
long_tenure = data[data.tenure > 40]
after_40_period_churn = long_tenure[long_tenure.Churn == 'Yes']['customerID'].count()
after_40_period_non_churn = long_tenure[long_tenure.Churn == 'No']['customerID'].count()
after_40_count = long_tenure['customerID'].count()

print('Churn number after 40 period', after_40_period_churn)
print('Non-Churn number after 40 period', after_40_period_non_churn)
print('Churn Percent %', (after_40_period_churn / after_40_count).round(2))

In whole data our churn rate is 27%, as we observed after 40 months churn rate becomes only 10%. 

### **`PhoneService`** : Whether the customer has a phone service or not (Yes, No)

In [None]:
catColumnInspector('PhoneService')

Most of the customers have phone service, and it does not seem to affect churn much.

### **`MultipleLines`** : Whether the customer has multiple lines or not (Yes, No, No phone service)

In [None]:
catColumnInspector('MultipleLines')

Customers who has multipleLines seems to be churning less than who does not 

### **`InternetService`** : Customer’s internet service provider (DSL, Fiber optic, No)

In [None]:
catColumnInspector('InternetService')

Customers with Fiber optic internet service are churning much more than other group, the reason might be bad fiber optic infrastructure

### **`OnlineSecurity`** : Whether the customer has online security or not (Yes, No, No internet service)

In [None]:
catColumnInspector('OnlineSecurity')

Customers who have internet service but do not have online security seem to churn much more.

### **`OnlineBackup`** :Whether the customer has online backup or not (Yes, No, No internet service)

In [None]:
catColumnInspector('OnlineBackup')

Customers who have internet service but do not have online backup seem to churn much more.

### **`DeviceProtection`** : Whether the customer has device protection or not (Yes, No, No internet service)

In [None]:
catColumnInspector('DeviceProtection')

Customers who have internet service but do not have device protection seem to churn much more.

### **`TechSupport`** : Whether the customer has tech support or not (Yes, No, No internet service)

In [None]:
catColumnInspector('TechSupport')

Customers who have internet service but do not have tech support seem to churn much more.

### **`StreamingTV`** : Whether the customer has streaming TV or not (Yes, No, No internet service)

In [None]:
catColumnInspector('StreamingTV')

Customers who have internet service but do not have StreamingTV seem to churn slightly more.

### **`StreamingMovies`** : Whether the customer has streaming movies or not (Yes, No, No internet service)

In [None]:
catColumnInspector('StreamingMovies')

Customers who have internet service but do not have StreamingMovies seem to churn slightly more.

### **`Contract`** : The contract term of the customer (Month-to-month, One year, Two year)

In [None]:
catColumnInspector('Contract')

Month-to-month customers churn at a rate of about 40%, while customers on a one-year contract churn at about 10% and customers on a two-year contract at only about 3% (0.03 as a fraction).

### **`PaperlessBilling`** : Whether the customer has paperless billing or not (Yes, No)

In [None]:
catColumnInspector('PaperlessBilling')

Customers with paperless billing churn more than customers who are billed on paper.

### **`PaymentMethod`** : The customer’s payment method

(Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))

In [None]:
catColumnInspector('PaymentMethod')

### **`MonthlyCharges`** : The amount charged to the customer monthly

In [None]:
numColumnInspector('MonthlyCharges')

The higher the monthly charges, the higher the churn rate.

### **`TotalCharges`** : The total amount charged to the customer

In [None]:
numColumnInspector('TotalCharges')

Customers who have stayed with the company longer have paid more in total, so their churn rate is lower. However, there is a group of customers who have been charged much more than average and have a very high churn rate. Therefore, I will create a new feature by dividing total charges by tenure to check that.

### Pearson's Correlation between Continuous Features

In [None]:
data[['tenure','MonthlyCharges','TotalCharges']].corr()

Total Charges is highly correlated with tenure and Monthly Charges. That will affect linear models in particular.

In [None]:
data['Target'] = np.where(data.Churn == 'Yes',1,0)

### **`Service_No_Count`** : Service No Count

We saw that **`OnlineSecurity`**, **`OnlineBackup`**, **`DeviceProtection`** and **`TechSupport`** in particular behave similarly with respect to the target.

In [None]:
# Churn rate and group size for every combination of the four add-on services.
# The aggregations are passed as a list: a set has no defined order, so the
# 'mean'/'count' columns could swap places between kernel sessions.
data.groupby(['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport'])['Target'].agg(['mean','count']).reset_index()

It seems that customers who have at least 2 of these services churn less. Therefore, I will create a feature that counts them.

In [None]:
# Service_No_Count: how many of the four add-on services the customer answered 'No' to.
addon_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']
data['Service_No_Count'] = (data[addon_cols] == 'No').sum(axis=1)
data['Service_No_Count'] = pd.to_numeric(data['Service_No_Count'], errors='raise')

In [None]:
numColumnInspector('Service_No_Count')

### Lastly lets check streaming columns

In [None]:
# Churn rate and group size per StreamingTV / StreamingMovies combination.
# A list (not a set) keeps the 'mean'/'count' column order deterministic.
data.groupby(['StreamingTV','StreamingMovies'])['Target'].agg(['mean','count']).reset_index()

### Encoding Categoric Features

In [None]:
# Categorical columns to one-hot encode.
cat_cols = ['gender','SeniorCitizen', 'Partner', 'Dependents','PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity'
            ,'OnlineBackup','DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies', 'Contract', 'PaperlessBilling'
            ,'PaymentMethod']

# drop_first=True removes the first level of each column to avoid a redundant
# dummy; dtype=int yields 0/1 integers. `data` is rebound to the encoded frame.
data = pd.get_dummies(data, columns = cat_cols, drop_first= True,dtype=int)
print(data.shape)
data.head()

In [None]:
# Correlation matrix of all encoded features (identifier and raw target excluded).
corr_matrix = data.drop(columns=['customerID', 'Churn']).corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, fmt='.1g')
plt.show()

In [None]:
# Dummy columns removed after inspecting the correlation heatmap.
redundant_dummies = [
    'InternetService_No',
    'InternetService_Fiber optic',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingTV_No internet service',
    'StreamingMovies_No internet service',
    'OnlineSecurity_No internet service',
    'MultipleLines_No phone service',
]
data.drop(columns=redundant_dummies, inplace=True)

In [None]:
# Correlation matrix again, now without the dropped dummy columns.
corr_matrix = data.drop(columns=['customerID', 'Churn']).corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, fmt='.1g')
plt.show()

### TRAIN - TEST SPLIT 

In [None]:
# Target was only needed for the grouped summaries above.
data.drop(columns = 'Target',inplace = True)
# Stratify on the churn label combined with Service_No_Count (e.g. 'Yes-3').
data['stratification_col'] = data['Churn'] + '-' +data['Service_No_Count'].map(str)
# Encode the target as 1 for 'Yes' and 0 otherwise.
data['Churn'] = np.where(data['Churn'] =='Yes',1,0)
# Size of every stratum. This is now the last expression of the cell so the
# notebook actually renders it; in the middle of the cell its result was
# computed and silently discarded.
data.groupby('stratification_col')['Churn'].count().reset_index().rename(columns = {'Churn':'Count'}).sort_values(by = 'stratification_col')

In [None]:
# 80/20 split stratified on the combined churn / Service_No_Count label.
# 'Churn' and 'stratification_col' stay in X for the balance check that
# follows and are dropped from X_train / X_test afterwards.
X_train, X_test, y_train, y_test = mod.train_test_split(data.drop(columns = ['customerID','gender_Male','TotalCharges'])
                                                    , data['Churn'], test_size=0.2, random_state=42, stratify = data['stratification_col'])

# The statistics are requested as a list: a set has no defined order, so the
# printed rows could change order between kernel sessions.
target_stats = ['count','mean','std']

print('-'*25)
print('Train Target Stats')
print(y_train.agg(target_stats))
print('-'*25)
print('Test Target Stats')
print(y_test.agg(target_stats))
print('-'*25)

In [None]:
# Rows per stratum in the train and test parts, side by side.
table1 = X_train.groupby('stratification_col').size().to_frame('train_count')
table2 = X_test.groupby('stratification_col').size().to_frame('test_count')
table = table1.merge(table2, how='left', on='stratification_col')
table

In [None]:
# Chi-square test on the train/test stratum counts: checks that both parts
# follow the same stratum distribution.
separator = '- ' * 50
X2, p, dof, expected = stats.chi2_contingency(table)

print(separator)
print('Chi Square:', X2)
print('p-value:', p)
print('Degrees of Freedom: ', dof)
print(separator)

# Expected counts under independence, shown next to the observed counts.
expexted_values = pd.DataFrame(
    expected,
    columns=['train_expected_observations', 'test_expected_observations'],
    index=table.index.unique(),
)
table.merge(expexted_values, how='left', on='stratification_col')

In [None]:
# Remove the target and the stratification helper from both feature frames.
helper_cols = ['Churn', 'stratification_col']
for split_frame in (X_train, X_test):
    split_frame.drop(columns=helper_cols, inplace=True)
X_train.head()

### Preprocessing Features

In [None]:
# Model inputs: every column currently in X_train.
features = X_train.columns.tolist()

# Columns that get standardised; all remaining columns pass through unchanged.
num_cols = ['tenure','MonthlyCharges','Service_No_Count']


# NOTE(review): ColumnTransformer emits the `num_cols` columns first and the
# passthrough remainder after them, so the transformed column order matches
# `features` only if these three columns also lead X_train -- confirm before
# pairing `features` with coefficients or feature importances.
num_transformer = Pipeline(steps=[('scaler', prep.StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_cols)],remainder = 'passthrough') 

### Plotting whole dataset in two Dimensions

In [None]:
# Scale the numeric features, then embed all rows in two dimensions with t-SNE.
pipe_tsne = Pipeline([
          ("preprocessor", preprocessor),
          ("tsne", man.TSNE(n_components=2, random_state=42))
])

# Fitted on the full dataset (train and test rows together); used for visualisation only.
data_tsne = pipe_tsne.fit_transform(data[features])

# Scatter of the two embedding dimensions, coloured by the 0/1 churn label.
plt.figure(figsize=(10,7))
sns.scatterplot(x=data_tsne[:, 0], y=data_tsne[:,1], hue=data['Churn'])
plt.show()

## MODELLING

### 1) Logistic Regression

In [None]:
%%time
# Logistic regression behind the shared preprocessor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])

# Same regularisation strengths for both penalties; the solver lists differ
# because the two grids are kept separate per penalty.
c_values = np.arange(0.1, 5.0, 0.5)
params = [
    {
        'classifier__C': c_values,
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    {
        'classifier__C': c_values,
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    },
]

# Several metrics are tracked; the final model is refit on the best ROC-AUC.
met_grid = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
cv = mod.RepeatedStratifiedKFold(n_splits=4, n_repeats=2, random_state=42)
log_grid = mod.GridSearchCV(
    pipeline,
    param_grid=params,
    cv=cv,
    verbose=1,
    n_jobs=-1,
    scoring=met_grid,
    refit='roc_auc',
)
log_grid.fit(X_train[features], y_train)
print(log_grid.best_params_)

In [None]:
# Report the cross-validated scores of the selected candidate (best_index_).
# Taking .mean() over cv_results_['mean_test_*'] would average the scores of
# every hyper-parameter combination in the grid, not describe the chosen model.
best_idx = log_grid.best_index_
cv_scores = log_grid.cv_results_

print('='*25)
for label, key in [('Roc-Auc Score:   ', 'mean_test_roc_auc'),
                   ('Accuracy Score:  ', 'mean_test_accuracy'),
                   ('f1 score:        ', 'mean_test_f1'),
                   ('Precision Score: ', 'mean_test_precision'),
                   ('Recall Score:    ', 'mean_test_recall')]:
    print(label, cv_scores[key][best_idx].round(2))
    print('='*25)

In [None]:
# Class probabilities of the refit best logistic model on the held-out test set.
log_test_probs = log_grid.predict_proba(X_test[features])
# One figure with two panels: cumulative gains on the left, lift on the right.
f = plt.figure(figsize=(14,6))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)

skplt.metrics.plot_cumulative_gain(y_test,log_test_probs,title='Logistic Regression Cumulative Gains Curve',ax=ax1)

skplt.metrics.plot_lift_curve(y_test, log_test_probs,title = 'Logistic Regression Lift Chart',ax = ax2)
# Move the lift chart's legend to the upper right corner.
ax2.legend(loc='upper right')
plt.show()

60% of 1407 test ->842 -> 26.5% Churn rate -> 225 Churn -> 95% - will be reached 213 churn 

In [None]:
log_grid.best_estimator_['classifier'].intercept_

In [None]:
# Coefficients of the best logistic model, listed by decreasing absolute size.
best_log_clf = log_grid.best_estimator_['classifier']
log_coefs = pd.DataFrame({'feature': features})
log_coefs['coefficient'] = best_log_clf.coef_.reshape(-1, 1)
by_magnitude = (-log_coefs['coefficient'].abs()).argsort()
log_coefs.iloc[by_magnitude].reset_index(drop=True)

### 2) Random Forest 

In [None]:
%%time
# Random forest behind the shared preprocessor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('clf',RandomForestClassifier(random_state=42))])

param_grid = {
    'clf__n_estimators' : [100,300,500],
    'clf__criterion' :['entropy','gini'],
    'clf__max_depth' : [5,7,9],
    # 'auto' meant sqrt(n_features) for classifiers; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes every fit fail.
    # 'sqrt' selects the same behaviour on all versions.
    'clf__max_features' :['sqrt'],
    'clf__min_samples_leaf' :[10,30,70],
    'clf__min_samples_split' : [10,30,70],
    'clf__class_weight' :['balanced',None],
    'clf__bootstrap': [True, False]
}

# Several metrics are tracked; the final model is refit on the best ROC-AUC.
met_grid= ['roc_auc', 'accuracy','f1','precision','recall']

cv = mod.RepeatedStratifiedKFold(n_splits=4, n_repeats=2,random_state=42)
grid_rdf = mod.GridSearchCV(pipeline, param_grid = param_grid,cv = cv,verbose = 1,n_jobs = -1, scoring =met_grid, refit='roc_auc')
grid_rdf.fit(X_train[features], y_train)
print(grid_rdf.best_params_)

In [None]:
# Report the cross-validated scores of the selected candidate (best_index_).
# Taking .mean() over cv_results_['mean_test_*'] would average the scores of
# every hyper-parameter combination in the grid, not describe the chosen model.
best_idx = grid_rdf.best_index_
cv_scores = grid_rdf.cv_results_

print('='*25)
for label, key in [('Roc-Auc Score:   ', 'mean_test_roc_auc'),
                   ('Accuracy Score:  ', 'mean_test_accuracy'),
                   ('f1 score:        ', 'mean_test_f1'),
                   ('Precision Score: ', 'mean_test_precision'),
                   ('Recall Score:    ', 'mean_test_recall')]:
    print(label, cv_scores[key][best_idx].round(2))
    print('='*25)

In [None]:
# Class probabilities of the refit best random forest on the held-out test set.
rdf_test_probs = grid_rdf.predict_proba(X_test[features])
# One figure with two panels: cumulative gains on the left, lift on the right.
f = plt.figure(figsize=(14,6))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)

skplt.metrics.plot_cumulative_gain(y_test,rdf_test_probs,title='Random Forest Cumulative Gains Curve',ax=ax1)

skplt.metrics.plot_lift_curve(y_test, rdf_test_probs,title = 'Random Forest Lift Chart',ax = ax2)
# Move the lift chart's legend to the upper right corner.
ax2.legend(loc='upper right')
plt.show()

In [None]:
# Feature importances of the best random forest, largest first.
rdf_importances = grid_rdf.best_estimator_['clf'].feature_importances_
feat_imp_rdf_grid = (
    pd.DataFrame({'feature': features, 'rdf_importance': rdf_importances})
    .sort_values(by='rdf_importance', ascending=False)
)
feat_imp_rdf_grid

### 3) XGBOOST

In [None]:
%%time

# XGBoost behind the shared preprocessor (registered under the step name 'scaler').
pipe = Pipeline([('scaler', preprocessor), ('clf', XGBClassifier(random_state=42))])

# Learning rate and number of trees are fixed; tree shape and sampling are searched.
param_grid = {
    'clf__learning_rate': [0.01],
    'clf__n_estimators': [500],
    'clf__max_depth': [3, 4, 5],
    'clf__colsample_bytree': [0.5, 0.7, 0.8],
    'clf__subsample': [0.6, 0.7, 0.8],
    'clf__min_child_weight': [30, 40, 50],
    'clf__objective': ['binary:logistic'],
    'clf__eval_metric': ['error'],
    'clf__use_label_encoder': [False],
}

# Several metrics are tracked; the final model is refit on the best ROC-AUC.
met_grid = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
cv = mod.StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

xgb_grid = mod.GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=cv,
    verbose=1,
    n_jobs=-1,
    scoring=met_grid,
    refit='roc_auc',
)
xgb_grid.fit(X_train[features], y_train)
print(xgb_grid.best_params_)

In [None]:
# Report the cross-validated scores of the selected candidate (best_index_).
# Taking .mean() over cv_results_['mean_test_*'] would average the scores of
# every hyper-parameter combination in the grid, not describe the chosen model.
best_idx = xgb_grid.best_index_
cv_scores = xgb_grid.cv_results_

print('='*25)
for label, key in [('Roc-Auc Score:   ', 'mean_test_roc_auc'),
                   ('Accuracy Score:  ', 'mean_test_accuracy'),
                   ('f1 score:        ', 'mean_test_f1'),
                   ('Precision Score: ', 'mean_test_precision'),
                   ('Recall Score:    ', 'mean_test_recall')]:
    print(label, cv_scores[key][best_idx].round(2))
    print('='*25)

In [None]:
# Class probabilities of the refit best XGBoost model on the held-out test set.
xgb_test_probs = xgb_grid.predict_proba(X_test[features])
# One figure with two panels: cumulative gains on the left, lift on the right.
f = plt.figure(figsize=(14,6))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)

# Titles name XGBoost: these panels plot xgb_test_probs, and the copied
# 'Random Forest' titles mislabelled them.
skplt.metrics.plot_cumulative_gain(y_test,xgb_test_probs,title='XGBoost Cumulative Gains Curve',ax=ax1)

skplt.metrics.plot_lift_curve(y_test, xgb_test_probs,title = 'XGBoost Lift Chart',ax = ax2)
ax2.legend(loc='upper right')
plt.show()

In [None]:
# Feature importances of the best XGBoost model, largest first.
xgb_importances = xgb_grid.best_estimator_['clf'].feature_importances_
feat_imp_xgb_grid = (
    pd.DataFrame({'feature': features, 'xgb_importance': xgb_importances})
    .sort_values(by='xgb_importance', ascending=False)
)
feat_imp_xgb_grid

## MODEL COMPARISON

In [None]:
# One row per model: cumulative gains curve on the left, lift chart on the right.
f = plt.figure(figsize=(13, 11))
panels = [
    ('Logistic Regression Cumulative Gains Curve ', 'Logistic Regression Lift Chart', log_test_probs),
    ('Random Forest Cumulative Gains Curve', 'Random Forest Lift Chart', rdf_test_probs),
    ('XGBoost Cumulative Gains Curve', 'XGBoost Lift Chart', xgb_test_probs),
]

for row, (gain_title, lift_title, test_probs) in enumerate(panels):
    gain_ax = f.add_subplot(3, 2, 2 * row + 1)
    lift_ax = f.add_subplot(3, 2, 2 * row + 2)
    skplt.metrics.plot_cumulative_gain(y_test, test_probs, title=gain_title, ax=gain_ax)
    skplt.metrics.plot_lift_curve(y_test, test_probs, title=lift_title, ax=lift_ax)
    lift_ax.legend(loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
feat_imp_xgb_grid.merge(feat_imp_rdf_grid, how = 'left', on ='feature').merge(log_coefs, how = 'left' , on='feature')