In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split,GridSearchCV

import xgboost as xgb

from sklearn.metrics import plot_confusion_matrix


In [None]:
# Location of the Telco customer churn CSV inside the Kaggle input directory.
TELCO_CSV = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Load the raw data and preview the first rows.
df = pd.read_csv(TELCO_CSV)
df.head()

Since `customerID` is unique to every customer, it carries no predictive signal, so we drop it.

In [None]:
# Remove the identifier column (rebinding `df` rather than mutating in place),
# then summarise the remaining columns, dtypes and non-null counts.
df = df.drop(columns=['customerID'])
df.info()

No null values are present in this dataset

Although `TotalCharges` should be numeric, it is stored as an object (string) column, so we need to convert it to a numeric dtype.

In [None]:
# Demonstrate why a direct conversion of TotalCharges fails.
# The result is discarded (df is not modified) and the ValueError is caught and
# printed, so the notebook still survives "Restart & Run All".
try:
    pd.to_numeric(df['TotalCharges'], errors='raise')
except ValueError as err:
    print('Conversion failed:', err)

Applying the conversion to `TotalCharges` raises an error because the column contains " " (blank) values, which cannot be converted to a numeric type.<br/>

Hence the " " values must be dealt with before model building.

In [None]:
# Number of rows whose TotalCharges is the single-space placeholder.
int(df['TotalCharges'].eq(' ').sum())

Since only 11 rows are affected, we replace the blank values with 0.

In [None]:
# Swap the single-space placeholder for 0, leaving every other value untouched.
is_blank = df['TotalCharges'] == ' '
total_charges_filled = df['TotalCharges'].mask(is_blank, 0)

# With the blanks gone the column converts cleanly; errors='raise' still
# surfaces any other unparseable value.
df['TotalCharges'] = pd.to_numeric(total_charges_filled, errors='raise')

df.head()

Several categorical columns, such as 'MultipleLines' and 'InternetService', contain white space (' ') inside their category values. Replacing the spaces with underscores ('_') gives cleaner labels for visualisation and for the one-hot encoded column names.

In [None]:
# Regex replacement: every space inside a string value becomes an underscore.
df = df.replace(to_replace=' ', value='_', regex=True)
df.head()

## One hot encoding

In [None]:
# Separate the target column from the feature matrix.
target_col = 'Churn'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Show each feature's dtype; the object-typed columns are the ones that get one-hot encoded below.
X.dtypes

In [None]:
# Collect the names of all object-typed (categorical) feature columns.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

# One-hot encode exactly those columns; numeric columns pass through unchanged.
X_encoded = pd.get_dummies(X, columns=cat_cols)

X_encoded.head()

In [None]:
# Show the distinct target labels before they are mapped to integers.
y.unique()

replacing y values of 'Yes' with 1 and 'No' with 0

In [None]:
# Encode the target as integers: 'Yes' -> 1 (churned), 'No' -> 0.
# Series.map builds the numeric Series directly instead of relying on
# Series.replace silently downcasting an object column (deprecated in pandas 2.2).
# Any label other than 'Yes'/'No' maps to NaN, so astype(int) raises rather
# than letting a stray string through to the model.
y_encoded = y.map({'Yes': 1, 'No': 0}).astype(int)

## Train_Test_Split

In [None]:
# Count the rows per churn label to check how balanced the classes are.
y.value_counts()

Since the classes are imbalanced, we stratify the train/test split so that both the train and test sets keep the same class proportions.

In [None]:
# Stratify on the target so train and test keep the same churn ratio, and fix
# random_state so the split (and everything downstream) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    stratify=y_encoded,
    random_state=42,
)

## Model Building

In [None]:
# Baseline XGBoost classifier: default hyperparameters, no class weighting yet.
clf = xgb.XGBClassifier(objective='binary:logistic', missing=None, seed=42)

# NOTE(review): early stopping is monitored on the test split, so the test data
# influences how many boosting rounds are kept — consider a separate validation split.
baseline_fit_kwargs = {
    'verbose': True,
    'early_stopping_rounds': 10,
    'eval_metric': 'aucpr',
    'eval_set': [(X_test, y_test)],
}
clf.fit(X_train, y_train, **baseline_fit_kwargs)

## Confusion Matrix


In [None]:
# Confusion matrix of the baseline model on the held-out test split.
churn_labels = ['Did Not Leave', 'Left']
plot_confusion_matrix(
    estimator=clf,
    X=X_test,
    y_true=y_test,
    display_labels=churn_labels,
)

Not good!!

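XGBoost has a parameter, `scale_pos_weight`, that scales the weight of the positive (minority) class, so mistakes on churned customers cost more during training.
A typical value for `scale_pos_weight` is the ratio of the number of negative samples to the number of positive samples.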

In [None]:
# Share of churned (label 1) rows, rounded to two decimals and shown as a percentage.
n_positive = sum(y_encoded)
positive_share = round(n_positive / len(y_encoded), 2)
print('Proportion of positive points in the dataset is: ', positive_share * 100)

So `scale_pos_weight` can be taken as 73/27 ≈ 2.7, or 3 as a round figure.

## Cross Validation

In [None]:
# Hyperparameter grid explored by the cross-validated search.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'gamma': [0, 0.5, 1.0],
    'reg_lambda': [0, 1.0, 10.0],
    # Candidates around the ~2.7 negative/positive ratio computed above.
    'scale_pos_weight': [2, 3],
}

# Row and column subsampling make each fit stochastic, so the estimator gets a
# fixed random_state; without it the search result changes from run to run.
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        missing=None,
        subsample=0.8,
        colsample_bytree=0.6,
        random_state=42,
    ),
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=4,
)


In [None]:
# Run the grid search; the keyword arguments are forwarded to every XGBClassifier fit.
# NOTE(review): the early-stopping eval_set is the test split, so test data
# leaks into model selection — consider a dedicated validation split.
search_fit_kwargs = {
    'early_stopping_rounds': 10,
    'eval_metric': 'auc',
    'eval_set': [(X_test, y_test)],
}
optimal_params.fit(X_train, y_train, **search_fit_kwargs)
print(optimal_params.best_params_)

## Optimal Model Build

In [None]:
# Build the final model from the grid-search winner instead of hand-copied
# values, so this cell always matches what the search found on a fresh run.
# The fixed settings (missing, subsample, colsample_bytree) mirror the estimator
# the search tuned under, and random_state makes the fit reproducible.
op_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    missing=None,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42,
    **optimal_params.best_params_,
)
op_clf.fit(
    X_train,
    y_train,
    verbose=True,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, y_test)],
)

In [None]:
# Confusion matrix of the tuned model on the held-out test split.
tuned_labels = ['Did Not Leave', 'Left']
plot_confusion_matrix(
    estimator=op_clf,
    X=X_test,
    y_true=y_test,
    display_labels=tuned_labels,
)