In [38]:
import pandas as pd
import numpy as np
from utils import training_algorithms
pd.pandas.set_option('display.max_columns', None)

In [39]:
pred_df = pd.read_csv('BankChurners_modelagem.csv')

In [40]:
y = pred_df['Attrition_Flag']
X = pred_df.drop('Attrition_Flag',axis=1)

- Dividing the dataset between train and test set.

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

- Defining the models to train

In [42]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = AdaBoostClassifier()
algoritmos = [clf1,clf2,clf3]
performances = training_algorithms(X_train,y_train,algoritmos)

clf1_trained, cross_val_1 = performances[0][0], performances[0][1]
clf2_trained, cross_val_2 = performances[1][0], performances[1][1]
clf3_trained, cross_val_3 = performances[2][0], performances[2][1]

- Performances on the training set.

In [43]:
cross_val_1, cross_val_2, cross_val_3

(array([0.95134228, 0.95142379, 0.94728033, 0.94763514, 0.94798658,
        0.95549958, 0.95341098, 0.95084746, 0.95652174, 0.95725063]),
 array([0.96345515, 0.96277916, 0.9626556 , 0.96399345, 0.96381579,
        0.96880131, 0.96715928, 0.97593361, 0.9611249 , 0.97359736]),
 array([0.95805369, 0.96345515, 0.96327212, 0.96052632, 0.96080067,
        0.96511628, 0.96099585, 0.95986622, 0.94920899, 0.96758105]))

In [44]:
np.mean(cross_val_1), np.mean(cross_val_2), np.mean(cross_val_3)

(0.9519198501824879, 0.9663315607754048, 0.9608876334305073)

# Is there a difference, statistically significant, between the performances of the trained models?

In [45]:
# Shapiro-Wilk and Kolmogorov-Sminorv Test
from scipy.stats import shapiro, kstest, f_oneway, ttest_rel, friedmanchisquare, wilcoxon

In [46]:
performances_train = [cross_val_1,cross_val_2,cross_val_3]

In [47]:
print('Normality Tests for the performance of: Decision Tree, Random Forest and AdaBoost: \n')
for i in performances_train:
    a = shapiro(i)[1]
    b = kstest(i,'norm')[1]
    print('--------------------')
    print("Shapiro P-Value:", a, "KS P-Value:", b)      

Normality Tests for the performance of: Decision Tree, Random Forest and AdaBoost: 

--------------------
Shapiro P-Value: 0.34260931611061096 KS P-Value: 4.549743125515367e-08
--------------------
Shapiro P-Value: 0.0613606721162796 KS P-Value: 3.687903518225239e-08
--------------------
Shapiro P-Value: 0.15195178985595703 KS P-Value: 4.4187840587005076e-08


##### Knowing that the null hypothesis of the Shapiro-Wilk test says that if the p-value of the test is greater than a specific significance level (0.05 in our case), the distribution of the sample is not significantly different from a normal distribution.

##### Unlike Shapiro-Wilk hypothesis, the null hypothesis of the Kolmogorov-Sminorv test says that if the p-value of the test is greater than a specific significance level (0.05 in our case), than there is a significant difference between the two distributions compared (the classifier performance distribution and the normal distribution). 

##### So, based on the p-values calculated before, we can conclude that we do have enough evidences to use parametric methods (assuming that the distribution of the performance of the classifiers are normally distributed) to compare the performance between them.

##### First we are going to perform a test, ANOVA, that can check if there is a significant difference between the performance of the group of classifiers. Once we indentifies this difference is statistically significant, we are going to check which of the classifiers performances differ from each other using the T-Test for paired samples.

In [48]:
f_oneway(cross_val_1,cross_val_2,cross_val_3)

F_onewayResult(statistic=25.201749217987302, pvalue=6.685637878969156e-07)

In [49]:
# Equivalent to the ANOVA oneway but Non-Parametric Test (use this instead of f_oneway method,
# incase the data were not normally distributed)

# friedmanchisquare(cross_val_clf1,cross_val_clf2,cross_val_clf3)

#### Accordingly with ANOVA test, the performance of the classifiers does not have the same distribution, so we are going to perform the T-Test for paired samples to check wich of them have the performance different from the others.

In [50]:
ttest_rel(cross_val_1,cross_val_2)

Ttest_relResult(statistic=-8.882499149397468, pvalue=9.505879843437891e-06)

In [51]:
ttest_rel(cross_val_1,cross_val_3)

Ttest_relResult(statistic=-4.463793685954248, pvalue=0.0015686446983005984)

In [52]:
ttest_rel(cross_val_2,cross_val_3)

Ttest_relResult(statistic=3.315290114832669, pvalue=0.009007652063853903)

In [20]:
# Equivalent to the T-Test but Non-Parametric Test (use this instead of ttest_rel method,
# incase the data were not normally distributed)

# wilcoxon(cross_val_clf1,cross_val_clf2)
# wilcoxon(cross_val_clf1,cross_val_clf3)
# wilcoxon(cross_val_clf2,cross_val_clf3)

##### As we can see, accordingly with the T-Test, the performance of the algorithms are statistically differente... So, as we saw before that the mean of the Random Forest classifier is greater than the others, we are going to use Random Forest to predict future samples.

In [53]:
RF_pred = clf2_trained.predict(X_test)

In [54]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
confusion_matrix(y_test,RF_pred), f1_score(y_test,RF_pred),precision_score(y_test,RF_pred), recall_score(y_test,RF_pred), accuracy_score(y_test,RF_pred)

(array([[ 338,  158],
        [  43, 2500]], dtype=int64),
 0.9613535858488752,
 0.9405568096313017,
 0.9830908375933937,
 0.9338598223099703)

# The most important features accordingly with Random Fores are: Total_Trans_Amt, Total_Ct_Chng_Q4_Q1 and Total_Revolving_Bal.

In [55]:
pd.DataFrame([clf2.feature_importances_],columns=X_train.columns)

Unnamed: 0,Gender,Education_Level,Income_Category,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,0.014731,0.024354,0.021402,0.067857,0.036989,0.043067,0.061157,0.123519,0.090999,0.247042,0.155533,0.083175,0.003989,0.008358,0.008336,0.003167,0.002769,0.001048,0.000464,0.002046
