In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import plot_confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Telco churn CSV and key each row by its customer identifier.
df = (
    pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .set_index('customerID')
)

## Illustrates the data types in the Telco data

In [None]:
def utils_recognize_type(dtf, col, max_cat=20):
    """Classify a DataFrame column as categorical ("cat") or numeric ("num").

    A column is categorical when it stores text/object values, or when it has
    fewer than ``max_cat`` distinct non-null values. Anything else is numeric.

    Parameters
    ----------
    dtf : pandas.DataFrame
        Frame that contains the column.
    col : hashable
        Label of the column to classify.
    max_cat : int, default 20
        Columns with fewer distinct values than this are treated as categorical.

    Returns
    -------
    str
        Either "cat" or "num".
    """
    column = dtf[col]
    # Text can be stored as legacy object dtype or as a dedicated pandas string
    # dtype; comparing the dtype to "O" alone misses the latter.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if is_text or column.nunique() < max_cat:
        return "cat"
    return "num"
    
# Map every column to "cat" or "num" so the overview can distinguish the two kinds.
dic_cols = {col: utils_recognize_type(df, col, max_cat=20) for col in df.columns}

# Start from the null mask, then recode each column into heatmap levels:
#   missing value -> 1, present numeric value -> 0.5, present categorical value -> 0.
# np.where works on the boolean mask directly, avoiding a per-element apply that
# depended on an `is False` identity check.
heatmap = df.isnull()
for col, col_type in dic_cols.items():
    present_level = 0.5 if col_type == "num" else 0
    heatmap[col] = np.where(heatmap[col], 1, present_level)

ax = sns.heatmap(heatmap, cbar=False)
ax.set_title('Dataset Overview')

plt.show()

# Colour key for the heatmap, printed with ANSI background colours.
print("\033[1;37;40m Categorical ", "\033[1;30;41m Numeric ", "\033[1;30;47m NaN ")

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

The TotalCharges column contains blank (single-space) values. These occur only when tenure = 0, meaning the customer has not been billed yet, so we set TotalCharges to 0.0 for those rows.

In [None]:
# Rows whose TotalCharges is a single space get 0 instead.
blank_charges = df['TotalCharges'] == ' '
df.loc[blank_charges, 'TotalCharges'] = 0

# Display the zero-tenure customers to check the result.
df[df['tenure'] == 0]

In [None]:
# With the blanks replaced, TotalCharges can be parsed as a number.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

# Swap every space inside string values for an underscore, across the whole frame.
df = df.replace(' ', '_', regex=True)

# Encode the target as 1 (churned) / 0 (stayed), then drop the original text column.
churned = df['Churn'] == 'Yes'
df['Churn_value'] = np.where(churned, 1, 0)
df = df.drop(columns='Churn')
df.head()

Create the feature DataFrame X (every column except the target)

In [None]:
# Features: an independent copy of the frame without the target column.
X = df.drop(columns='Churn_value').copy()
X.head()

Create the target Series y

In [None]:
# Target: an independent copy of the 0/1 churn column.
y = df.loc[:, 'Churn_value'].copy()
y.head()

Select all the object-typed (non-numeric) columns of the X DataFrame. These are the columns we are going to one-hot encode.

In [None]:
# Pick the non-numeric columns from X, the frame that actually gets encoded,
# rather than from df. The resulting column list is then guaranteed to exist in X
# and can never include the target.
df_object = X.select_dtypes(exclude=[np.number])
df_object.columns

In [None]:
# One-hot encode the non-numeric columns; all other columns pass through unchanged.
categorical_columns = df_object.columns
X_encoded = pd.get_dummies(data=X, columns=categorical_columns)
X_encoded

26.5% of customers leave

In [None]:
# The mean of a 0/1 target is the fraction of customers who churned.
y.mean()

We will split using stratification in order to maintain the same percentage of people who left in both the training and test sets

In [None]:
# Stratify on y so both splits keep the same churn rate; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=42,
)

Our training and test datasets retain the same percentage of customers who leave

In [None]:
# Churn fraction in the training split.
y_train.mean()

In [None]:
# Churn fraction in the test split.
y_test.mean()

In [None]:
# Baseline: XGBoost classifier with only the objective and seed set.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# The test split is passed as eval_set with the aucpr metric; no early stopping is
# requested here and per-round logging is silenced.
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='aucpr',
    verbose=False,
)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 in favour of
# ConfusionMatrixDisplay.from_estimator; confirm the installed version before upgrading.
plot_confusion_matrix(
    clf_xgb,
    X_test,
    y_test,
    values_format='d',
    display_labels=['Did not leave', 'Left'],
)

We correctly identify ~50% of customers that leave 

In [None]:
# Recall on the churn class, in percent: the share of customers who actually left
# (y_test == 1) that the model predicts as leaving. Predictions are 0/1, so their
# mean over the true leavers is the recall. It is computed from the fitted model,
# not hand-copied from the confusion matrix, so it stays correct on re-run.
100 * clf_xgb.predict(X_test)[y_test.to_numpy() == 1].mean()

In [None]:
# Round 2
param_grid1 = {
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.03],
    'gamma': [0.25, 1.0, 1.5],
    # Fixed spelling: this key was 'reg_lamda', which is not an XGBoost parameter,
    # so the L2 regularisation setting was never actually part of the search.
    'reg_lambda': [0],
    'scale_pos_weight': [3, 4, 5],
}

# 3-fold cross-validated grid search scored by ROC AUC. subsample and
# colsample_bytree are held fixed for every candidate.
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        seed=42,
        use_label_encoder=False,
        subsample=0.9,
        colsample_bytree=0.5,
    ),
    param_grid=param_grid1,
    scoring='roc_auc',
    verbose=0,
    n_jobs=10,
    cv=3,
)

# A disabled variant of this fit (early stopping with eval_set=[(X_test, y_test)])
# used to sit here as a string literal; only the plain fit below was ever executed.
optimal_params.fit(X_train, y_train)
print(optimal_params.best_params_)

In [None]:
# Final model using the hyperparameters chosen from the grid search.
# Two keyword names were misspelled ('learn_rate', 'reg_lamda'), so XGBoost trained
# with its default learning rate and L2 penalty instead of the values given here.
# Any figures quoted from an earlier run of this cell should be re-derived.
clf_xgb = xgb.XGBClassifier(
    seed=42,
    objective='binary:logistic',
    gamma=0.25,
    learning_rate=0.05,
    max_depth=3,
    reg_lambda=0,
    scale_pos_weight=3,
    subsample=0.9,
    colsample_bytree=0.5,
)

# NOTE(review): early stopping is driven by the test split, so the test set
# influences the number of trees and the reported metrics are optimistic.
# Consider carving a validation split out of the training data instead.
clf_xgb.fit(
    X_train,
    y_train,
    verbose=False,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, y_test)],
)

plot_confusion_matrix(
    clf_xgb,
    X_test,
    y_test,
    values_format='d',
    display_labels=['Did not leave', 'Left'],
)

We caught 82% of the people that left.

In [None]:
# Recall on the churn class, in percent: the share of customers who actually left
# (y_test == 1) that the tuned model predicts as leaving. It is computed from the
# fitted model, not from numbers read off the confusion matrix, so it always
# reflects the model currently in clf_xgb.
100 * clf_xgb.predict(X_test)[y_test.to_numpy() == 1].mean()