In [1]:
from pandas import read_csv
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix   
from sklearn.metrics import accuracy_score,f1_score ,recall_score,precision_score,classification_report,fbeta_score
from sklearn.linear_model import LogisticRegression
import warnings
bank1 = read_csv('BankChurnfunc.csv')
bank2 = read_csv('ML.csv')

In [2]:
def accuracyscore(X,Y):
    time_1 = time.time()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)  
    warnings.simplefilter('ignore')
    RF = RandomForestClassifier()
    RF_fit = RF.fit(X_train, Y_train)
    RF_pred = RF_fit.predict(X_test)
    time_2 = time.time()
    print("Random Forests is %f percent accurate" % (accuracy_score(RF_pred, Y_test)*100))
    print("F1 score :\n{0}".format(f1_score(RF_pred,Y_test)))
    print("Recall score :\n{0}".format(recall_score(RF_pred,Y_test)))
    print("Confusion Matrix : \n {0}".format(confusion_matrix(RF_pred, Y_test)))
    print("Report :\n{0}".format(classification_report(RF_pred, Y_test)))
    print("Time : {0}".format(time_2 - time_1))
    print("=======================================================")

In [3]:
#RF with all features and with ml features
Y1 = bank1["Attrition_Flag"]
X1 = bank1.drop(["Attrition_Flag"], axis=1) 
accuracyscore(X1,Y1)

X2 = bank2.iloc[:, :].values 
accuracyscore(X2,Y1)

Random Forests is 98.251192 percent accurate
F1 score :
0.9893376413570275
Recall score :
0.98330122029544
Confusion Matrix : 
 [[ 323    7]
 [  26 1531]]
Report :
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95       330
         1.0       1.00      0.98      0.99      1557

    accuracy                           0.98      1887
   macro avg       0.96      0.98      0.97      1887
weighted avg       0.98      0.98      0.98      1887

Time : 1.5270049571990967
Random Forests is 95.866455 percent accurate
F1 score :
0.9754871150219989
Recall score :
0.9675810473815462
Confusion Matrix : 
 [[ 257   26]
 [  52 1552]]
Report :
              precision    recall  f1-score   support

         0.0       0.83      0.91      0.87       283
         1.0       0.98      0.97      0.98      1604

    accuracy                           0.96      1887
   macro avg       0.91      0.94      0.92      1887
weighted avg       0.96      0.96      0.96  

In [4]:
#Churn Predicition
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.2) 

RF = RandomForestClassifier()
RF_fit = RF.fit(X_train, Y_train)
RF_pred = RF_fit.predict(X_test)

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
# Predict the Test set results
y_pred = classifier.predict(X_test)
#Evaluate Model Results on Test Set:
acc = accuracy_score(Y_test, y_pred )
prec = precision_score(Y_test, y_pred )
rec = recall_score(Y_test, y_pred )
f1 = f1_score(Y_test, y_pred )
f2 = fbeta_score(Y_test, y_pred, beta=2.0)
results = pd.DataFrame([['RandomForestClassifier',
acc, prec, rec, f1, f2]],columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

In [5]:
lr_classifier = RandomForestClassifier()
lr_classifier.fit(X_train, Y_train)
# Predict the Test set results
y_pred = lr_classifier.predict(X_test)
#probability score
y_pred_probs = lr_classifier.predict_proba(X_test)
y_pred_probs  = y_pred_probs [:, 1]

In [6]:
train_identity = X_train['CLIENTNUM']
X_train = X_train.drop(columns = ['CLIENTNUM'])
test_identity = X_test['CLIENTNUM']
X_test = X_test.drop(columns = ['CLIENTNUM'])

In [7]:
final_results = pd.concat([test_identity, Y_test], axis = 1).dropna()
final_results['predictions'] = y_pred
final_results["propensity_to_churn(%)"] = y_pred_probs
final_results["propensity_to_churn(%)"] = final_results["propensity_to_churn(%)"]*100
final_results["propensity_to_churn(%)"] = final_results["propensity_to_churn(%)"].round(2)
final_results = final_results[['CLIENTNUM', 'Attrition_Flag', 'predictions', 'propensity_to_churn(%)']]
final_results ['Ranking'] = pd.qcut(final_results['propensity_to_churn(%)'].rank(method = 'first'),10,labels=range(10,0,-1))
final_results.sort_values(by=['propensity_to_churn(%)'],ascending=False)

Unnamed: 0,CLIENTNUM,Attrition_Flag,predictions,propensity_to_churn(%),Ranking
5863,715398858,1.0,1.0,100.0,3
1739,814776033,1.0,1.0,100.0,1
5046,771108933,1.0,1.0,100.0,3
6428,802822833,1.0,1.0,100.0,1
5194,716614683,1.0,1.0,100.0,3
...,...,...,...,...,...
7857,709102458,0.0,0.0,0.0,10
4182,712783083,0.0,0.0,0.0,10
4395,712864683,0.0,0.0,0.0,10
7815,714773508,0.0,0.0,0.0,10
