<a href="https://colab.research.google.com/github/samuel-haddad/EnsembleClassifiersReview/blob/main/randonforrest_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
Churn prediction: Randon Forrest Review
> Author: Samuel Haddad Simões Machado
> Date: apr/2022
> Licence: Open Source
'''

# general libs
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std

# modeling libs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# preparation lib
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# import the dataset and create the dataframe
df_churn = pd.read_csv('https://raw.githubusercontent.com/samuel-haddad/TreeClassifiersReview/main/WA_Fn-UseC_-Telco-Customer-Churn.csv', delimiter=',')

#copy df
df_prep = df_churn.copy()

df_prep['TotalCharges'].replace(r'^\s*$', np.nan, regex=True, inplace=True)

df_prep['TotalCharges'] = pd.to_numeric(df_prep['TotalCharges'])

#df_prep.loc[:, ['TotalCharges']].replace (r'\s+', np.nan, regex = True, inplace = True)

#imp
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df_prep[['TotalCharges']])

#transform
df_ii = imp.transform(df_prep[['TotalCharges']])

#replace
df_prep[['TotalCharges']] = df_ii
df_churn_gold = df_prep

# dummization
df_churn_gold['Churn'].replace({'No':0, 'Yes':1}, inplace=True)
df_dummies = pd.get_dummies(df_churn_gold.drop(['customerID','SeniorCitizen', 'tenure','MonthlyCharges', 'TotalCharges', 'Churn'], axis=1))
df = pd.concat([df_dummies, df_churn_gold[['SeniorCitizen', 'tenure','MonthlyCharges', 'TotalCharges', 'Churn']]], axis=1)


# extract the explanatory variables
X, y = df.drop('Churn', axis=1), df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# define the model
clf = RandomForestClassifier(oob_score = True, random_state=42, warm_start = True)

# training the model
clf.fit(X_train, y_train)
score = clf.score(X_train, y_train)
print("Train score: ", score)

# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

y_pred = clf.predict(X_test)

# compare train & test distribuition
y_train_dist = y_train.sum()/len(y_train)
y_test_dist = y_test.sum()/len(y_test)

# statistics
print('+',29*'-','MODEL STATISTICS',29*'-','+')
print('Train dist: {}%'.format(round(y_train_dist*100, 2))
        ,'| Test dist: {}%'.format(round(y_test_dist*100, 2)))
print(80*'-')
print("Accuracy: {}%".format((metrics.accuracy_score(y_test, y_pred)*100).round(2))
        ,"| Precision: {}%".format((metrics.precision_score(y_test, y_pred)*100).round(2))
        ,"| Recall: {}%".format((metrics.recall_score(y_test, y_pred)*100).round(2)))
print(80*'-')
print("AUC: {}%".format((roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])*100).round(2))
        ,"| F1: {}%".format(((f1_score(y_test, y_pred))*100).round(2))
        ,'| LL: {}'.format(-metrics.log_loss(y_test, clf.predict_proba(X_test), normalize=False)))
print('+',78*'-','+')
print(clf.feature_importances_)

Train score:  0.9983772819472616
Accuracy: 0.782 (0.018)
+ ----------------------------- MODEL STATISTICS ----------------------------- +
Train dist: 26.27% | Test dist: 27.17%
--------------------------------------------------------------------------------
Accuracy: 79.46% | Precision: 67.24% | Recall: 47.56%
--------------------------------------------------------------------------------
AUC: 83.32% | F1: 55.71% | LL: -976.1931136729795
+ ------------------------------------------------------------------------------ +
[0.01823215 0.01799257 0.01487657 0.01554457 0.01251715 0.01249316
 0.00285866 0.0027516  0.01289049 0.00329541 0.01270441 0.01163866
 0.02569041 0.00235165 0.03284959 0.00259931 0.01199469 0.01844728
 0.00212945 0.01273078 0.01349924 0.00070796 0.01269139 0.02644763
 0.00522148 0.00960451 0.01173663 0.00173479 0.01061675 0.0117992
 0.00284887 0.01190755 0.04098877 0.01046274 0.02164937 0.0149143
 0.01558283 0.01233103 0.01278832 0.0305188  0.0118992  0.01927884
 0.1416