### Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib
import warnings
warnings.filterwarnings("ignore")

### Importing dataset

In [2]:
dataset = pd.read_csv("bankloans.csv")

### Data preparation

In [3]:
# shows count of rows and columns
dataset.shape

(1150, 9)

In [4]:
#shows first few rows of the code
dataset.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


In [5]:
dataset.columns
#age : age of the customers
#ed : Education level
#employ : work experience
#address : adress of the customer
#income : Yearly income of the customer
#debtinc : Debt to income ratio
#creddebt : Credit to Debt ratio
#othdebt : Other debts
#default : Customer defaulted in the past
#(1=defaulted ; 0= Never defaulted)



Index(['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt',
       'othdebt', 'default'],
      dtype='object')

In [6]:
#dropping customer ID column from the dataset
dataset.drop(labels=["address","debtinc","creddebt","othdebt"],axis=1,inplace=True)
dataset.shape

(1150, 5)

In [7]:
# explore missing values
dataset.isna().sum()

age          0
ed           0
employ       0
income       0
default    450
dtype: int64

In [8]:
# deleting rows with missing values
dataset.dropna(inplace=True)


In [9]:
# explore missing values post missing value fix
dataset.isna().sum()

age        0
ed         0
employ     0
income     0
default    0
dtype: int64

In [10]:
# data summary across 0 & 1
dataset.groupby('default').mean()

Unnamed: 0_level_0,age,ed,employ,income
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,35.514507,1.659574,9.508704,47.154739
1.0,33.010929,1.901639,5.224044,41.213115


In [11]:
dataset.describe()

Unnamed: 0,age,ed,employ,income,default
count,700.0,700.0,700.0,700.0,700.0
mean,34.86,1.722857,8.388571,45.601429,0.261429
std,7.997342,0.928206,6.658039,36.814226,0.439727
min,20.0,1.0,0.0,14.0,0.0
25%,29.0,1.0,3.0,24.0,0.0
50%,34.0,1.0,7.0,34.0,0.0
75%,40.0,2.0,12.0,55.0,1.0
max,56.0,5.0,31.0,446.0,1.0


In [12]:
# count of good loans (0) and bad loans (1)
dataset['default'].value_counts()

0.0    517
1.0    183
Name: default, dtype: int64

In [13]:
dataset.corr()

Unnamed: 0,age,ed,employ,income,default
age,1.0,0.022325,0.536497,0.47871,-0.137657
ed,0.022325,1.0,-0.153621,0.23519,0.114676
employ,0.536497,-0.153621,1.0,0.619681,-0.282978
income,0.47871,0.23519,0.619681,1.0,-0.07097
default,-0.137657,0.114676,-0.282978,-0.07097,1.0


### fix imbalanced data problem

In [40]:
##### Shuffling our data
#dataset = dataset.sample(frac=1)

##### amount of default classes 183 rows
#default_df = dataset.loc[dataset["default"]==1]
#nb_default = len(default_df)
#non_default = dataset.loc[dataset["default"]==0][:nb_default]

#normal_distributed_df = pd.concat([default_df,non_default])

##### Shuffle dataframe rows
#new_df = normal_distributed_df.sample(frac=1,random_state=42)

#new_df.head()

In [41]:
##### count of good loans (0) and bad loans (1) after fixing imbalanced data problem 
#new_df['default'].value_counts()

### Train Test Split

In [42]:
y = dataset.iloc[:,-1].values
X = dataset.iloc[:, :-1].values

In [43]:
# splitting dataset into training and test (in ratio 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0,stratify=y)

In [44]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape 

((560, 4), (140, 4), (560,), (140,))

In [45]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Risk Model building

In [46]:
classifier =  LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

### Model performance

In [47]:
print(confusion_matrix(y_test,y_pred))

[[103   0]
 [ 36   1]]


In [48]:
print(accuracy_score(y_test, y_pred))

0.7428571428571429


In [49]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.74      1.00      0.85       103
         1.0       1.00      0.03      0.05        37

    accuracy                           0.74       140
   macro avg       0.87      0.51      0.45       140
weighted avg       0.81      0.74      0.64       140



In [51]:
filename= 'saved_model.sav'
saved_model=joblib.dump(classifier,filename)