In [41]:
import numpy as np


import pandas as pd

# from sklearn import neighbors, metrics

from sklearn.model_selection import train_test_split


from collections import Counter

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import time

# functions

In [42]:
def get_metrics(targets: np.ndarray, predicted_targets: np.ndarray, mode: str):
    '''

    :param targets:
    :param predicted_targets:
    :param mode: influence to return: mode = "show_metrics" - print metrics to the Console;
                                        mode = "return_metrics" - return dict which consists all metrics values
                                        with their names as keys
    :return:
    {
    "accuracy": accuracy_value, "precision": precision_value,
    "recall": recall_value, "f1_score": f1_score_value,
    "confusion_matrix": confusion_matrix
    }
    '''
    # token = (actual target, predicted target)
    # 1 - positive target, 0 - negative target
    tokens = [(j, i) for i in [1, 0] for j in [1, 0]]
    # tokens = [true_pos_token,
    #           false_pos_token,
    #           false_neg_token,
    #           true_neg_token]
    c = Counter(zip(targets, predicted_targets))
    [true_pos_value,
     false_pos_value,
     false_neg_value,
     true_neg_value] = [c[token] for token in tokens]
    accuracy = (true_neg_value + true_pos_value) / len(targets)
    confusion_matrix = np.array([[true_pos_value, false_pos_value],
                                 [false_neg_value, true_neg_value]])
    precision = true_pos_value / (true_pos_value + false_pos_value)
    recall = true_pos_value / (true_pos_value + false_neg_value)
    f1_score = 2 * precision * recall / (precision + recall)
    if mode == "show_metrics":
        print(f"accuracy = {accuracy}")
        print(f"confusion_matrix = \n{confusion_matrix}")
        print(f"precision = {precision}")
        print(f"recall = {recall}")
        print(f"f1_score = {f1_score}")
    elif mode == "return_metrics":
        res_dict = {"accuracy": accuracy, "confusion_matrix": confusion_matrix,
                    "f1_score": f1_score}
        return res_dict

In [4]:
pd.read_csv("BankChurners.csv")

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.000093,0.999910
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0.000057,0.999940
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.000,0.000021,0.999980
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.760,0.000134,0.999870
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.500,0.000,0.000022,0.999980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,...,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462,0.000191,0.999810
10123,710638233,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,...,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511,0.995270,0.004729
10124,716506083,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,...,5409.0,0,5409.0,0.819,10291,60,0.818,0.000,0.997880,0.002118
10125,717406983,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,...,5281.0,0,5281.0,0.535,8395,62,0.722,0.000,0.996710,0.003294


# metrics using not normalized data

In [43]:
preproc_not_normalized_data = pd.read_csv("pre-processing_BankChurners_not_normalized.csv")
preproc_not_normalized_data

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,0,45,1,3,2.0,2.0,70.000000,1,39,5,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.000093,0.999910
1,0,51,1,3,4.0,2.0,100.000000,1,36,4,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.000,0.000021,0.999980
2,0,40,0,4,2.0,2.0,26.666667,1,34,3,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.760,0.000134,0.999870
3,0,40,1,3,1.0,2.0,70.000000,1,21,5,...,4716.0,0,4716.0,2.175,816,28,2.500,0.000,0.000022,0.999980
4,0,44,1,2,4.0,2.0,50.000000,1,36,3,...,4010.0,1247,2763.0,1.376,1088,24,0.846,0.311,0.000055,0.999940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10115,0,50,1,2,4.0,1.0,50.000000,1,40,3,...,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462,0.000191,0.999810
10116,1,41,1,2,4.0,3.0,50.000000,1,25,4,...,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511,0.995270,0.004729
10117,1,44,0,1,2.0,2.0,26.666667,1,36,5,...,5409.0,0,5409.0,0.819,10291,60,0.818,0.000,0.997880,0.002118
10118,1,30,1,2,4.0,2.0,50.000000,1,36,4,...,5281.0,0,5281.0,0.535,8395,62,0.722,0.000,0.996710,0.003294


In [8]:
x = preproc_not_normalized_data.drop(["Attrition_Flag",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"
                       ], axis=1).values
y = preproc_not_normalized_data["Attrition_Flag"].values

raw_models = {"GaussianNB": GaussianNB(),
              "RandomForestClassifier": RandomForestClassifier(),
              "KNeighborsClassifier": KNeighborsClassifier()}
print("metrics using not normalized data")
for model in raw_models.items():
    t0 = time.time()
    print(f"model: {model[0]} : \n"
          f"f1-score: {cross_val_score(model[1], X=x, y=y, cv=5, scoring='f1').mean()} : \n"
          f"5 folds cross-validation time elapsed : {time.time() - t0} : \n"
          f"accuracy : {cross_val_score(model[1], X=x, y=y, cv=5, scoring='accuracy').mean()}\n")

model: GaussianNB : 
f1-score: 0.5347711618670622 : 
5 folds cross-validation time elapsed : 0.03901314735412598
 : accuracy : 0.8451581027667985
model: RandomForestClassifier : 
f1-score: 0.6777624537273202 : 
5 folds cross-validation time elapsed : 5.229551792144775
 : accuracy : 0.9234189723320159
model: KNeighborsClassifier : 
f1-score: 0.4903925805486895 : 
5 folds cross-validation time elapsed : 1.608734130859375
 : accuracy : 0.8610671936758895


# metrics using normalized data

In [10]:
preproc_data = pd.read_csv("pre-processing_BankChurners.csv")
preproc_data

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,0,-0.165261,1.060261,0.503636,-0.893897,0.522808,0.425471,-0.263546,0.384495,0.763915,...,0.447062,-0.473285,0.489410,2.666758,-0.960392,-0.975049,3.892696,-0.776043,-0.437752,0.437762
1,0,0.583481,1.060261,0.503636,0.593115,0.522808,1.308928,-0.263546,0.008956,0.120645,...,-0.573420,-1.426735,-0.445377,8.499347,-0.741457,-1.913197,6.910076,-0.997334,-0.437950,0.437953
2,0,-0.789213,-0.943070,1.273662,-0.893897,0.522808,-0.850632,-0.263546,-0.241404,-0.522625,...,-0.584975,1.661853,-0.733868,2.991048,-0.952436,-1.913197,6.910076,1.759732,-0.437642,0.437652
3,0,-0.789213,1.060261,0.503636,-1.637403,0.522808,0.425471,-0.263546,-1.868743,0.763915,...,-0.430577,-1.426735,-0.302563,6.558239,-1.057042,-1.572052,7.621803,-0.997334,-0.437949,0.437953
4,0,-0.290051,1.060261,-0.266389,0.593115,0.522808,-0.163500,-0.263546,0.008956,-0.522625,...,-0.508271,0.103448,-0.517445,2.856699,-0.976893,-1.742625,0.572725,0.130886,-0.437857,0.437844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10115,0,0.458691,1.060261,-0.266389,0.593115,-1.135236,-0.163500,-0.263546,0.509675,-0.522625,...,-0.509042,0.844611,-0.584671,-0.261119,3.262728,2.223186,0.619605,0.678672,-0.437484,0.437488
10116,1,-0.664422,1.060261,-0.266389,0.593115,2.180853,-0.163500,-0.263546,-1.368023,0.120645,...,-0.478888,1.255686,-0.591383,0.206786,1.284945,0.176316,-0.121955,0.856430,2.286428,-2.286431
10117,1,-0.290051,-0.943070,-1.036415,-0.893897,0.522808,-0.850632,-0.263546,0.008956,0.763915,...,-0.354313,-1.426735,-0.226314,0.276276,1.734897,-0.207472,0.453393,-0.997334,2.293573,-2.293577
10118,1,-2.037116,1.060261,-0.266389,0.593115,0.522808,-0.163500,-0.263546,0.008956,0.120645,...,-0.368399,-1.426735,-0.240398,-1.039415,1.176215,-0.122186,0.044257,-0.997334,2.290370,-2.290360


In [39]:
x = preproc_data.drop(["Attrition_Flag",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"
                       ], axis=1).values
y = preproc_data["Attrition_Flag"].values

print("metrics using normalized data")
for model in raw_models.items():
    t0 = time.time()
    print(f"model: {model[0]} : \n"
          f"f1-score: {cross_val_score(model[1], X=x, y=y, cv=5, scoring='f1').mean()} : \n"
          f"5 folds cross-validation time elapsed : {time.time() - t0} : \n"
          f"accuracy : {cross_val_score(model[1], X=x, y=y, cv=5, scoring='accuracy').mean()}\n")


metrics using normalized data
model: GaussianNB : 
f1-score: 0.5252172377780926 : 
5 folds cross-validation time elapsed : 0.03270459175109863 : 
accuracy : 0.8281620553359685

model: RandomForestClassifier : 
f1-score: 0.6851329307648933 : 
5 folds cross-validation time elapsed : 5.39909553527832 : 
accuracy : 0.9268774703557312

model: KNeighborsClassifier : 
f1-score: 0.576086087824387 : 
5 folds cross-validation time elapsed : 1.6423041820526123 : 
accuracy : 0.8941699604743082



# metrics(second type) using normalized data

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
models = {"RandomForestClassifier": RandomForestClassifier(),
          "KNeighborsClassifier": KNeighborsClassifier()}
models["GaussianNB"] = GaussianNB()

for model in models.items():
    t0 = time.time()
    model[1].fit(x_train, y_train)
    print(f"model: {model[0]} training time = {time.time() - t0}")
    predicted_y = model[1].predict(x_test)
    metrics = get_metrics(targets=y_test, predicted_targets=predicted_y, mode='return_metrics')
    print(f"model: {model[0]} : \n"
          f"f1-score: {metrics['f1_score']} : \n"
          f"accuracy : {metrics['accuracy']}\n")

model: RandomForestClassifier training time = 0.9877898693084717
model: RandomForestClassifier : 
f1-score: 0.8568329718004338 : 
accuracy : 0.9565217391304348

model: KNeighborsClassifier training time = 0.0009975433349609375
model: KNeighborsClassifier : 
f1-score: 0.6257668711656442 : 
accuracy : 0.8995388669301713

model: GaussianNB training time = 0.003988027572631836
model: GaussianNB : 
f1-score: 0.6504065040650406 : 
accuracy : 0.8866930171277997



# metrics using best params

In [15]:
models = {"RandomForestClassifier": RandomForestClassifier(
    **{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 110}
),
          "KNeighborsClassifier": KNeighborsClassifier(
              **{'n_neighbors': 4, 'weights': 'distance'}
          )}

print("metrics using best params")
for model in models.items():
    t0 = time.time()
    print(f"model: {model[0]} : \n"
          f"f1-score: {cross_val_score(model[1], X=x, y=y, cv=5, scoring='f1').mean()} : \n"
          f"5 folds cross-validation time elapsed : {time.time() - t0} : \n"
          f"accuracy : {cross_val_score(model[1], X=x, y=y, cv=5, scoring='accuracy').mean()}\n")

metrics using best params
model: RandomForestClassifier : 
f1-score: 0.661319564497672 : 
5 folds cross-validation time elapsed : 7.26559042930603 : 
accuracy : 0.918379446640316

model: KNeighborsClassifier : 
f1-score: 0.5768115363620983 : 
5 folds cross-validation time elapsed : 1.4112370014190674 : 
accuracy : 0.8897233201581027



# metrics after feature selection with Pair Correlation

In [17]:
trash_features = ['Months_on_book',
 'Gender',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Total_Trans_Ct']

In [21]:
x = preproc_data.drop(["Attrition_Flag",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"
                       ] + trash_features, axis=1).values
y = preproc_data["Attrition_Flag"].values

print("metrics using normalized data")
for model in models.items():
    t0 = time.time()
    print(f"model: {model[0]} : \n"
          f"f1-score: {cross_val_score(model[1], X=x, y=y, cv=5, scoring='f1').mean()} : \n"
          f"5 folds cross-validation time elapsed : {time.time() - t0} : \n"
          f"accuracy : {cross_val_score(model[1], X=x, y=y, cv=5, scoring='accuracy').mean()}\n")

metrics using normalized data
model: RandomForestClassifier : 
f1-score: 0.5894415700322361 : 
5 folds cross-validation time elapsed : 5.650761842727661 : 
accuracy : 0.8983201581027668

model: KNeighborsClassifier : 
f1-score: 0.4468251171026364 : 
5 folds cross-validation time elapsed : 2.224774122238159 : 
accuracy : 0.8634387351778656



# metrics after feature selection with SelectKBest(k=8)

In [23]:
trash_features = ['Avg_Open_To_Buy', 'Card_Category', 'Credit_Limit', 'Customer_Age',
       'Dependent_count', 'Education_Level', 'Gender', 'Income_Category',
       'Marital_Status', 'Months_on_book']

In [27]:
x = preproc_data.drop(["Attrition_Flag",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"
                       ] + trash_features, axis=1).values
y = preproc_data["Attrition_Flag"].values

print("metrics using normalized data")
for model in models.items():
    t0 = time.time()
    print(f"model: {model[0]} : \n"
          f"f1-score: {cross_val_score(model[1], X=x, y=y, cv=5, scoring='f1').mean()} : \n"
          f"5 folds cross-validation time elapsed : {time.time() - t0} : \n"
          f"accuracy : {cross_val_score(model[1], X=x, y=y, cv=5, scoring='accuracy').mean()}\n")

metrics using normalized data
model: RandomForestClassifier : 
f1-score: 0.6498263621418523 : 
5 folds cross-validation time elapsed : 6.638484954833984 : 
accuracy : 0.9146245059288537

model: KNeighborsClassifier : 
f1-score: 0.6269448883026133 : 
5 folds cross-validation time elapsed : 0.735985517501831 : 
accuracy : 0.8963438735177865



# metrics(second type) after feature selection with SelectKBest(k=8)

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
models = {"RandomForestClassifier": RandomForestClassifier(
    **{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 110}
),
          "KNeighborsClassifier": KNeighborsClassifier(
              **{'n_neighbors': 4, 'weights': 'distance'}
          )}
models["GaussianNB"] = GaussianNB()

for model in models.items():
    t0 = time.time()
    model[1].fit(x_train, y_train)
    print(f"model: {model[0]} training time = {time.time() - t0}")
    predicted_y = model[1].predict(x_test)
    metrics = get_metrics(targets=y_test, predicted_targets=predicted_y, mode='return_metrics')
    print(f"model: {model[0]} : \n"
          f"f1-score: {metrics['f1_score']} : \n"
          f"accuracy : {metrics['accuracy']}\n")

model: RandomForestClassifier training time = 1.446361780166626
model: RandomForestClassifier : 
f1-score: 0.8546448087431695 : 
accuracy : 0.9561923583662714

model: KNeighborsClassifier training time = 0.0
model: KNeighborsClassifier : 
f1-score: 0.6476190476190476 : 
accuracy : 0.9025032938076416

model: GaussianNB training time = 0.003988504409790039
model: GaussianNB : 
f1-score: 0.6504065040650406 : 
accuracy : 0.8866930171277997



In [35]:
metrics

{'accuracy': 0.9571805006587615,
 'confusion_matrix': array([[ 410,   43],
        [  87, 2496]]),
 'f1_score': 0.8631578947368421}

# Conclusions

dataset pre-processing with correct choice of machine learning model,
that is supported by model's parameters tuning and feature selection maybe can increase metrics of your model,
reduce computation time and give you better prediction accurate