ABC bank is facing the challenge of high credit default rates. One of the strategies which the bank has come up with is to identify the risky customers (those who are likely to default) and take proactive measures to perform actions for these risky customers before they actually default. 

In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix

In [2]:
#Read the Dataset 
dataset = pd.read_excel("case_study_data.xlsx")
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,checkin_acc,duration,credit_history,purpose,amount,svaing_acc,present_emp_since,inst_rate,personal_status,other_debtors,...,property,age,inst_plans,housing,num_credits,job,dependents,telephone,foreign_worker,status
0,A11,9,A34,A43,1754,A65,A75,6,A93,A101,...,A121,101,A143,A152,3,A173,2,A192,A201,1
1,A12,72,A32,A43,8927,A61,A73,3,A92,A101,...,A121,33,A143,A152,2,A173,2,A191,A201,2
2,A14,18,A34,A46,3144,A61,A74,3,A93,A101,...,A121,74,A143,A152,2,A172,3,A191,A201,1
3,A11,63,A32,A42,11823,A61,A74,3,A93,A103,...,A122,68,A143,A153,2,A173,3,A191,A201,1
4,A11,36,A33,A40,7305,A61,A73,5,A93,A101,...,A124,80,A143,A153,3,A173,3,A191,A201,2


In [3]:
cols = [0,2,3,5,6,8,9,11,13,14,16,18,19]
df1 = df[df.columns[cols]]
df1.head()

Unnamed: 0,checkin_acc,credit_history,purpose,svaing_acc,present_emp_since,personal_status,other_debtors,property,inst_plans,housing,job,telephone,foreign_worker
0,A11,A34,A43,A65,A75,A93,A101,A121,A143,A152,A173,A192,A201
1,A12,A32,A43,A61,A73,A92,A101,A121,A143,A152,A173,A191,A201
2,A14,A34,A46,A61,A74,A93,A101,A121,A143,A152,A172,A191,A201
3,A11,A32,A42,A61,A74,A93,A103,A122,A143,A153,A173,A191,A201
4,A11,A33,A40,A61,A73,A93,A101,A124,A143,A153,A173,A191,A201


In [4]:
# Finding no. of unique labels 
for i in df.columns[cols]:
       print(i, ': ', len(df[i].unique()), ' labels')

checkin_acc :  4  labels
credit_history :  5  labels
purpose :  10  labels
svaing_acc :  5  labels
present_emp_since :  5  labels
personal_status :  4  labels
other_debtors :  3  labels
property :  4  labels
inst_plans :  3  labels
housing :  3  labels
job :  4  labels
telephone :  2  labels
foreign_worker :  2  labels


In [5]:
# Feature Engineering-->replace all the labels with its frequency count
for i in df.columns[cols]:
    df_frequency_map = df[i].value_counts().to_dict()
    print(i, ': ',df[i].value_counts().to_dict())
    df[i] = df[i].map(df_frequency_map)

checkin_acc :  {'A14': 394, 'A11': 274, 'A12': 269, 'A13': 63}
credit_history :  {'A32': 530, 'A34': 293, 'A33': 88, 'A31': 49, 'A30': 40}
purpose :  {'A43': 280, 'A40': 234, 'A42': 181, 'A41': 103, 'A49': 97, 'A46': 50, 'A45': 22, 'A44': 12, 'A410': 12, 'A48': 9}
svaing_acc :  {'A61': 603, 'A65': 183, 'A62': 103, 'A63': 63, 'A64': 48}
present_emp_since :  {'A73': 339, 'A75': 253, 'A74': 174, 'A72': 172, 'A71': 62}
personal_status :  {'A93': 548, 'A92': 310, 'A94': 92, 'A91': 50}
other_debtors :  {'A101': 907, 'A103': 52, 'A102': 41}
property :  {'A123': 332, 'A121': 282, 'A122': 232, 'A124': 154}
inst_plans :  {'A143': 814, 'A141': 139, 'A142': 47}
housing :  {'A152': 713, 'A151': 179, 'A153': 108}
job :  {'A173': 630, 'A172': 200, 'A174': 148, 'A171': 22}
telephone :  {'A191': 596, 'A192': 404}
foreign_worker :  {'A201': 963, 'A202': 37}


In [6]:
df

Unnamed: 0,checkin_acc,duration,credit_history,purpose,amount,svaing_acc,present_emp_since,inst_rate,personal_status,other_debtors,...,property,age,inst_plans,housing,num_credits,job,dependents,telephone,foreign_worker,status
0,274,9,293,280,1754,183,253,6,548,907,...,282,101,814,713,3,630,2,404,963,1
1,269,72,530,280,8927,603,339,3,310,907,...,282,33,814,713,2,630,2,596,963,2
2,394,18,293,50,3144,603,174,3,548,907,...,282,74,814,713,2,200,3,596,963,1
3,274,63,530,181,11823,603,174,3,548,52,...,232,68,814,108,2,630,3,596,963,1
4,274,36,88,234,7305,603,339,5,548,907,...,154,80,814,108,3,630,3,596,963,2
5,394,54,530,50,13583,183,339,3,548,907,...,154,53,814,108,2,200,3,404,963,1
6,394,36,530,181,4253,63,253,5,548,907,...,232,80,814,713,2,630,2,596,963,1
7,269,54,530,103,10422,603,339,3,548,907,...,332,53,814,179,2,148,2,404,963,1
8,394,18,530,280,4589,48,174,3,50,907,...,282,92,814,713,2,200,2,596,963,1
9,269,45,293,234,7851,603,62,6,92,907,...,332,42,814,713,3,148,2,596,963,2


In [7]:
# Now divide the dataset into training and testing
X =  df[df.columns[0:20]]
y = df[df.columns[20]]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.25 , random_state = 10)

In [8]:
#1---Logistic Regression
logisticRegr = LogisticRegression()
logisticRegr.fit(train_x, train_y)
predicted = logisticRegr.predict(test_x)
print("Accuracy score: ", accuracy_score(test_y, predicted))

Accuracy score:  0.7




In [9]:
#2----Decision Tree
model = tree.DecisionTreeClassifier()
model.fit(train_x, train_y)
predictions = model.predict(test_x)
print("Accuracy score: ", accuracy_score(test_y, predictions))

Accuracy score:  0.696


In [10]:
#3--------XGBClassifier
model = XGBClassifier()
model.fit(train_x, train_y)
predicted = model.predict(test_x)
print("Accuracy score: ", accuracy_score(test_y, predicted))
original=[]
prediction=[]
for i in range(90, 200):
    print("Original:",y[i])
    original.append(y[i])
    print("Predictions:", predicted[i])
    prediction.append(predicted[i])
    print("\n")
dict = {'Original_status': original, 'Predictions': prediction}     
df2 = pd.DataFrame(dict)
df2.to_csv('file2.csv') 

Accuracy score:  0.772
Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 2


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 2


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 2


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 2


Original: 1
Predictions: 1


Original: 2
Predictions: 2


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 1


Original: 2
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predictions: 1


Original: 1
Predicti

In [11]:
Accuracy_score1 = accuracy_score(test_y, predicted)
print("Accuracy_Score:", Accuracy_score1)
Precision_score = precision_score(test_y, predicted)
print("Precision_score:", Precision_score)
Recall_score = recall_score(test_y, predicted)
print("Recall_score:", Recall_score)
F1_score = f1_score(test_y, predicted)
print("F1_score:", F1_score)

Accuracy_Score: 0.772
Precision_score: 0.7839195979899497
Recall_score: 0.9176470588235294
F1_score: 0.8455284552845528


In [12]:
predicted = model.predict(test_x)
matrix = confusion_matrix(test_y, predicted)
print("Confusion_Matrix:",matrix)

Confusion_Matrix: [[156  14]
 [ 43  37]]
