<a href="https://colab.research.google.com/github/samuel-haddad/EnsembleClassifiersReview/blob/main/xgboost_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
'''
Churn prediction: XGBoost Review
> Author: Samuel Haddad Simões Machado
> Date: jun/2022
> Licence: Open Source
'''
# general libs
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std

# modeling libs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# preparation lib
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# import the dataset and create the dataframe
df_churn = pd.read_csv('https://raw.githubusercontent.com/samuel-haddad/TreeClassifiersReview/main/WA_Fn-UseC_-Telco-Customer-Churn.csv', delimiter=',')

# work on a copy so the raw frame `df_churn` is left untouched
df_prep = df_churn.copy()

# Turn empty / whitespace-only TotalCharges entries into NaN, then cast the
# column to a numeric dtype.
# The result is assigned back to the frame rather than calling
# `.replace(..., inplace=True)` on the column selection: an in-place call on
# `df_prep['TotalCharges']` can act on a temporary object and leave `df_prep`
# unchanged, in which case the blanks survive and `pd.to_numeric` raises.
df_prep['TotalCharges'] = pd.to_numeric(
    df_prep['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
)

# fit the imputer on the TotalCharges column only
# NOTE(review): the imputer sees a single column, so it has no other feature
# to model TotalCharges from -- presumably this reduces to a simple fill with
# the column statistic; confirm, and consider passing tenure / MonthlyCharges
# as well. It is also fitted before the train/test split -- confirm that is
# acceptable for this review.
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df_prep[['TotalCharges']])

# impute the missing values
df_ii = imp.transform(df_prep[['TotalCharges']])

# write the imputed values back into the frame
df_prep[['TotalCharges']] = df_ii

# `df_churn_gold` is another name for the same object as `df_prep` (no copy)
df_churn_gold = df_prep

# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
# The mapped column is assigned back to the frame instead of calling
# `.replace(..., inplace=True)` on `df_churn_gold['Churn']`: an in-place call
# on a column selection can act on a temporary object and leave the frame
# unchanged, which would keep the target as strings.
# The identity entries (0 -> 0, 1 -> 1) keep this statement safe to re-run on
# an already-encoded column; `.astype(int)` fails loudly if any other label
# is present (it would have been mapped to NaN).
df_churn_gold['Churn'] = (
    df_churn_gold['Churn']
    .map({'No': 0, 'Yes': 1, 0: 0, 1: 1})
    .astype(int)
)

# one-hot encode ("dummize") every column except the identifier, the
# already-numeric columns and the target
numeric_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
df_dummies = pd.get_dummies(
    df_churn_gold.drop(['customerID', *numeric_cols, 'Churn'], axis=1)
)

# final modelling frame: dummy columns first, then numeric columns, target last
df = pd.concat([df_dummies, df_churn_gold[[*numeric_cols, 'Churn']]], axis=1)


# separate the target column from the explanatory variables
y = df['Churn']
X = df.drop(columns='Churn')

# hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# define the model & its hyperparameters
# (importance_type='weight' selects what clf.feature_importances_ reports)
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'booster': 'gbtree',
    'importance_type': 'weight',
}
clf = XGBClassifier(**xgb_params)

# fit on the training split and report the score on that same split
clf.fit(X_train, y_train)
score = clf.score(X_train, y_train)
train_score_pct = round(score*100, 2)
print('Train score: {}%'.format(train_score_pct))

# evaluate the model: 10-fold stratified CV repeated 3 times on the training split
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# report performance: mean and standard deviation of the fold accuracies
cv_mean_pct = round(n_scores.mean()*100, 2)
cv_std_pct = round(n_scores.std()*100, 2)
print('Accuracy: {}% | Std dev: {}%'.format(cv_mean_pct, cv_std_pct))

# predictions on the hold-out split; the class probabilities are computed
# once and reused by both the AUC and the log-loss figures below
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# compare train & test distribution (share of rows with Churn == 1)
y_train_dist = y_train.sum()/len(y_train)
y_test_dist = y_test.sum()/len(y_test)

# Hold-out metrics as percentages.
# The built-in round() is used (as in the train-score and CV reports) instead
# of the `.round()` method: the method only exists on NumPy scalars, so it
# raises AttributeError whenever a metric comes back as a plain Python float.
accuracy_pct = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
precision_pct = round(metrics.precision_score(y_test, y_pred)*100, 2)
recall_pct = round(metrics.recall_score(y_test, y_pred)*100, 2)
auc_pct = round(roc_auc_score(y_test, y_proba[:, 1])*100, 2)
f1_pct = round(f1_score(y_test, y_pred)*100, 2)
# negated log loss with normalize=False
neg_log_loss = -metrics.log_loss(y_test, y_proba, normalize=False)

# statistics
print('+',29*'-','MODEL STATISTICS',29*'-','+')
print('Train dist: {}%'.format(round(y_train_dist*100, 2))
        ,'| Test dist: {}%'.format(round(y_test_dist*100, 2)))
print(80*'-')
print("Accuracy: {}%".format(accuracy_pct)
        ,"| Precision: {}%".format(precision_pct)
        ,"| Recall: {}%".format(recall_pct))
print(80*'-')
print("AUC ROC: {}%".format(auc_pct)
        ,"| F1: {}%".format(f1_pct)
        ,'| LL: {}'.format(neg_log_loss))
print('+',78*'-','+')
print(clf.feature_importances_) # one value per column of X

Train score: 81.87%
Accuracy: 80.32% | Std dev: 1.56%
+ ----------------------------- MODEL STATISTICS ----------------------------- +
Train dist: 26.27% | Test dist: 27.17%
--------------------------------------------------------------------------------
Accuracy: 80.31% | Precision: 67.87% | Recall: 52.26%
--------------------------------------------------------------------------------
AUC ROC: 85.81% | F1: 59.06% | LL: -855.7245026208693
+ ------------------------------------------------------------------------------ +
[0.01190476 0.         0.00744048 0.         0.00595238 0.
 0.00892857 0.         0.02380952 0.         0.0014881  0.00446429
 0.01190476 0.         0.03720238 0.         0.00297619 0.02083333
 0.         0.0014881  0.         0.         0.00297619 0.01785714
 0.         0.         0.00297619 0.         0.         0.00595238
 0.         0.01934524 0.0297619  0.03422619 0.03125    0.02529762
 0.         0.02678571 0.00595238 0.04017857 0.00892857 0.01488095
 0.15625    