In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [2]:
data_new = pd.read_csv("BankChurnersNormalized.csv")
data_old = pd.read_csv("BankChurners.csv")

In [3]:
def show_ME(Y_test, predicted):
    print ("Accuracy Score :",accuracy_score(Y_test, predicted))
    print ("Confusion Matrix :") 
    print(confusion_matrix(Y_test, predicted))  
    print ("Report : ") 
    print (classification_report(Y_test, predicted))
    
def Random_Forest(X_train, X_test, Y_train):
    classifier =  RandomForestClassifier()
    classifier = classifier.fit(X_train, Y_train)
    predicted = classifier.predict(X_test)
    return predicted    

def feat_sel(X, Y):
    feats_sel = SelectKBest(chi2)
    fit = feats_sel.fit(X, Y)
    scores = pd.DataFrame(fit.scores_)
    columns = pd.DataFrame(data_old.columns[2:21])
    feats = pd.concat([columns, scores], axis=1)
    feats.columns = ('Features', 'Score')
    useful_feats = []
    useless_feats = []
    for feat in feats.values:
        if feat[1] > 10:
            useful_feats.append(feat[0])
        if feat[1] < 0.1:
            useless_feats.append(feat[0])
    return (useful_feats, useless_feats)

In [4]:
print("Before feature selection:\n")
X_b = data_new[data_new.columns[1:20]]
Y = data_new[data_new.columns[0]]
X_train, X_test, Y_train, Y_test = train_test_split(X_b, Y, test_size=0.5)
pred_before = Random_Forest(X_train, X_test, Y_train)
show_ME(Y_test, pred_before)

Before feature selection:

Accuracy Score : 0.9504455553140622
Confusion Matrix :
[[3799   52]
 [ 176  574]]
Report : 
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97      3851
         1.0       0.92      0.77      0.83       750

    accuracy                           0.95      4601
   macro avg       0.94      0.88      0.90      4601
weighted avg       0.95      0.95      0.95      4601



In [5]:
print("After feature selection:\n")
X_a = data_new[feat_sel(X_b, Y)[0]]
Y = data_new[data_new.columns[0]]
X_train, X_test, Y_train, Y_test = train_test_split(X_a, Y, test_size=0.5)
pred_after = Random_Forest(X_train, X_test, Y_train)
show_ME(Y_test, pred_after)

After feature selection:

Accuracy Score : 0.9556618126494241
Confusion Matrix :
[[3813   76]
 [ 128  584]]
Report : 
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97      3889
         1.0       0.88      0.82      0.85       712

    accuracy                           0.96      4601
   macro avg       0.93      0.90      0.91      4601
weighted avg       0.95      0.96      0.95      4601



In [6]:
useful_f, useless_f = feat_sel(X_b, Y)
print("Useful features: ", useful_f)
print("Useless features: ", useless_f)

Useful features:  ['Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
Useless features:  ['Marital_Status', 'Card_Category', 'Months_on_book', 'Avg_Open_To_Buy']
