# Importing the necessary packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import balanced_accuracy_score, roc_auc_score, classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve, make_scorer

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the dataset

In [None]:
# Location of the Telco customer-churn CSV inside the Kaggle input mount.
file_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Read the whole CSV into a DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

# Check the columns info

In [None]:
# Print column names, dtypes and non-null counts of the raw data.
df.info()

# Statistical Summary of the Numeric Columns (count, mean, std and Five Number Summary)

In [None]:
# Summary statistics, transposed so that each described column is a row.
df.describe().T

In [None]:
# Show the distinct values of every column to spot the categorical levels
# and any oddly encoded entries.
for column_name in df.columns:
    print(column_name, " : ", df[column_name].unique())

In [None]:
# Collapse the "no service" levels into plain "No".
# At this point the values still contain spaces (spaces are only converted
# to underscores in a later cell), so the space-separated spellings have to
# be matched here; matching only the underscore spellings silently replaced
# nothing. The underscore spellings are kept as well so the cell still works
# on data that has already been normalized.
# NOTE(review): the exact raw spellings are presumed to be
# "No phone service" / "No internet service" - confirm against the output of
# the unique-values cell above.
df.replace(
    to_replace=[
        "No phone service",
        "No internet service",
        "No_phone_service",
        "No_internet_service",
    ],
    value="No",
    inplace=True,
)

In [None]:
# Re-inspect the distinct values of every column after the replacement.
for column_name in df.columns:
    print(column_name, " : ", df[column_name].unique())

In [None]:
# Number of rows whose TotalCharges is a single blank character.
int((df["TotalCharges"] == " ").sum())

In [None]:
# Display the rows whose TotalCharges is a single blank character.
df[df["TotalCharges"] == " "]

In [None]:
# Overwrite blank TotalCharges entries with 0 so the column can be parsed
# as a number afterwards.
blank_total_rows = df["TotalCharges"] == " "
df.loc[blank_total_rows, "TotalCharges"] = 0

In [None]:
# Convert TotalCharges to a numeric dtype.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric)

In [None]:
# Re-check the dtypes after the TotalCharges conversion.
df.info()

In [None]:
# Replace every space inside string values with an underscore
# (presumably so the one-hot column names built from these values contain
# no whitespace - confirm).
df.replace(to_replace=' ', value='_', regex=True, inplace=True)
df.head()

In [None]:
# Encode the target as integers: "Yes" -> 1 (churned), "No" -> 0.
churn_codes = {"Yes": 1, "No": 0}
df["Churn"] = df["Churn"].replace(churn_codes)
df["Churn"].head()

# Box Plots of Selected Variables
### To understand the distribution and outliers

In [None]:
sns.boxplot(x=df["gender"], y=df["TotalCharges"], hue=df["Churn"]);

In [None]:
sns.boxplot(x=df["SeniorCitizen"], y=df["TotalCharges"], hue=df["Churn"]);

In [None]:
sns.boxplot(y=df["MonthlyCharges"], x=df["Churn"]);

In [None]:
sns.boxplot(y=df["TotalCharges"], x=df["Churn"]);

# Pairplot of the Numeric Variables

In [None]:
# Pairwise scatter plots / distributions of the DataFrame's columns.
# NOTE(review): seaborn's pairplot is presumed to plot only the numeric
# columns by default, so the string-valued columns will not appear - confirm.
sns.pairplot(df);

# Separate the Independent and Dependent Variables

In [None]:
X = df.drop(columns="Churn", axis=1).copy()
X.head()

In [None]:
y = df["Churn"].copy()
y.head()

In [None]:
X.drop(columns="customerID", inplace=True)

In [None]:
cat_cols = list(X.columns[X.dtypes==object])
cat_cols

# One-Hot Encoding for the Categorical variables

In [None]:
X_encoded = pd.get_dummies(X, columns=cat_cols)

In [None]:
X_encoded.head(10)

In [None]:
y.unique()

### Checking the Class Imbalance 

In [None]:
sum(y)/len(y) * 100

In [None]:
display(X_encoded.shape, y.shape)

# Train Test Split with Stratification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, test_size=0.3, random_state=24)

In [None]:
sum(y_train)/len(y_train) * 100

In [None]:
sum(y_test)/len(y_test) * 100

# Building XGBoost Baseline Model

In [None]:
# Baseline classifier: logistic objective, otherwise library defaults.
baseline_params = {
    'objective': 'binary:logistic',
    'missing': None,
    'seed': 24,
}
clf_xgb = xgb.XGBClassifier(**baseline_params)

In [None]:
# Hold out part of the training data for early stopping. Monitoring the test
# set here would leak it into the choice of the number of boosting rounds,
# making every test-set metric reported afterwards optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=24,
)

# Stop adding trees once 'aucpr' on the validation part has not improved
# for 10 consecutive rounds.
clf_xgb.fit(X_fit,
            y_fit,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric='aucpr',
            eval_set=[(X_val, y_val)])

## Confusion Matrix

In [None]:
# Confusion matrix of the baseline model on the test set.
# display_labels must follow the sorted class order: Churn was encoded as
# "No" -> 0 and "Yes" -> 1, so class 0 (not churned) comes first.
plot_confusion_matrix(clf_xgb,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=['Not Churned', 'Churned'])

## Tweaking the parameters for handling Class Imbalance in Dataset

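`scale_pos_weight` re-weights the positive (churned) class; it is included in the search grid below to compensate for the class imbalance in the dataset.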

In [None]:
# Search space for the grid search (3 * 3 * 3 * 3 * 3 = 243 combinations).
param_grid = {
    'max_depth': [3, 4, 5],
    # Was [0, 1, 0.01, 0.05]: a mistyped 0.1. A learning rate of 0 never
    # updates the model and 1 applies no shrinkage at all.
    'learning_rate': [0.1, 0.01, 0.05],
    'gamma': [0, 0.25, 1.0],
    'reg_lambda': [0, 1.0, 10.0],
    # Weight of the positive class; values above 1 favour the minority
    # (churned) class.
    'scale_pos_weight': [1, 3, 5],
}

In [None]:
# Base estimator for the grid search. Row and column subsampling are fixed
# here and are not part of the search grid.
search_base_params = {
    'objective': 'binary:logistic',
    'seed': 24,
    'subsample': 0.9,
    'colsample_bytree': 0.5,
}
xgb_estimator = xgb.XGBClassifier(**search_base_params)

# Hyperparameter Tuning with GridSearchCV

In [None]:
# 3-fold stratified, shuffled cross-validation with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)

# Exhaustive search over param_grid, scored by ROC AUC, using all CPU cores.
clf_xgb_tuned = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_splitter,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Hold out part of the training data for early stopping. Passing the test
# set as eval_set would let it influence every candidate fitted during the
# search, so the test-set evaluation afterwards would no longer be unbiased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=24,
)

# The extra keyword arguments are forwarded to each underlying
# XGBClassifier.fit call made by the grid search.
clf_xgb_tuned.fit(X_fit,
                  y_fit,
                  verbose=True,
                  early_stopping_rounds=10,
                  eval_metric='aucpr',
                  eval_set=[(X_val, y_val)]
)

## GridSearchCV Results

In [None]:
# Estimator refitted with the best parameter combination found.
clf_xgb_tuned.best_estimator_

In [None]:
# The winning parameter combination from param_grid.
clf_xgb_tuned.best_params_

In [None]:
# Mean cross-validated score (scoring='roc_auc') of the best combination.
clf_xgb_tuned.best_score_

In [None]:
# Confusion matrix of the tuned model on the test set.
# display_labels must follow the sorted class order: Churn was encoded as
# "No" -> 0 and "Yes" -> 1, so class 0 (not churned) comes first.
plot_confusion_matrix(clf_xgb_tuned,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=['Not Churned', 'Churned'])

In [None]:
# Class predictions of the tuned model (the refitted best estimator) for the test set.
y_pred = clf_xgb_tuned.predict(X_test)

## Classification Report for the Hyperparameter Tuned Model

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts the predictions
# instead of the true labels.
print(classification_report(y_test, y_pred))

# ROC AUC Curve for the Hyperparameter Tuned Model

In [None]:
# ROC curve of the tuned model on the test set.
plot_roc_curve(
    estimator=clf_xgb_tuned,
    X=X_test,
    y=y_test,
    name='XGB Tuned ROC AUC',
);

# Precision-Recall Curve for the Hyperparameter Tuned Model

In [None]:
# Precision-recall curve of the tuned model on the test set.
plot_precision_recall_curve(
    estimator=clf_xgb_tuned,
    X=X_test,
    y=y_test,
    name='XGB Precision-Recall Curve',
);

# Visualizing the XGBoost Tree

In [None]:
# Single-tree model used only for visualizing one tree and inspecting
# feature importances.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# the grid-search result - confirm they match best_params_.
xgb_lone_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=24,
    subsample=0.9,
    colsample_bytree=0.5,
    gamma=0.25,
    learning_rate=0.05,
    max_depth=4,
    reg_lambda=10.0,
    scale_pos_weight=3,
    # Was misspelled "n_estimator", which is not the tree-count parameter,
    # so the model was not actually limited to a single tree.
    n_estimators=1,
)

In [None]:
# Fit on the training split; no eval_set or early stopping is used here.
xgb_lone_estimator.fit(X_train, y_train)

In [None]:
# Underlying Booster object, needed for the importance scores printed next.
xgb_bst = xgb_lone_estimator.get_booster()

In [None]:
# Print the per-feature score for each importance measure the booster offers.
importance_types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
for importance_type in importance_types:
    scores = xgb_bst.get_score(importance_type=importance_type)
    print('%s:  ' % importance_type, scores)

In [None]:
# Graphviz attributes for the split (condition) nodes of the drawn tree.
node_params = {
    'shape': 'box',
    'style': 'filled, rounded',
    # Was '#78cbe': only five hex digits, which is not a valid "#rrggbb"
    # colour. NOTE(review): the last digit is a best guess - adjust to taste.
    'fillcolor': '#78cbe1',
}

# Graphviz attributes for the leaf nodes of the drawn tree.
leaf_params = {
    'shape': 'box',
    'style': 'filled',
    'fillcolor': '#e48038',
}

In [None]:
# Render the first tree (index 0) of the single-tree model with the node
# styles defined above.
tree_plot_options = {
    'num_trees': 0,
    'size': "5,5",
    'condition_node_params': node_params,
    'leaf_node_params': leaf_params,
}
xgb.to_graphviz(xgb_lone_estimator, **tree_plot_options)