In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_sas("bankloan.sas7bdat")

In [3]:
data

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17.0,12.0,176.0,9.3,11.359392,5.008608,1.0
1,27.0,1.0,10.0,6.0,31.0,17.3,1.362202,4.000798,0.0
2,40.0,1.0,15.0,14.0,55.0,5.5,0.856075,2.168925,0.0
3,41.0,1.0,15.0,14.0,120.0,2.9,2.658720,0.821280,0.0
4,24.0,2.0,2.0,0.0,28.0,17.3,1.787436,3.056564,1.0
...,...,...,...,...,...,...,...,...,...
845,34.0,1.0,12.0,15.0,32.0,2.7,0.239328,0.624672,
846,32.0,2.0,12.0,11.0,116.0,5.7,4.026708,2.585292,
847,48.0,1.0,13.0,11.0,38.0,10.8,0.722304,3.381696,
848,35.0,2.0,1.0,11.0,24.0,7.8,0.417456,1.454544,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       850 non-null    float64
 1   ed        850 non-null    float64
 2   employ    850 non-null    float64
 3   address   850 non-null    float64
 4   income    850 non-null    float64
 5   debtinc   850 non-null    float64
 6   creddebt  850 non-null    float64
 7   othdebt   850 non-null    float64
 8   default   700 non-null    float64
dtypes: float64(9)
memory usage: 59.9 KB


In [7]:
df = data[~data["default"].isnull()]

In [8]:
df

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17.0,12.0,176.0,9.3,11.359392,5.008608,1.0
1,27.0,1.0,10.0,6.0,31.0,17.3,1.362202,4.000798,0.0
2,40.0,1.0,15.0,14.0,55.0,5.5,0.856075,2.168925,0.0
3,41.0,1.0,15.0,14.0,120.0,2.9,2.658720,0.821280,0.0
4,24.0,2.0,2.0,0.0,28.0,17.3,1.787436,3.056564,1.0
...,...,...,...,...,...,...,...,...,...
695,36.0,2.0,6.0,15.0,27.0,4.6,0.262062,0.979938,1.0
696,29.0,2.0,6.0,4.0,21.0,11.5,0.369495,2.045505,0.0
697,33.0,1.0,15.0,3.0,32.0,7.6,0.491264,1.940736,0.0
698,45.0,1.0,19.0,22.0,77.0,8.4,2.302608,4.165392,0.0


In [9]:
def IQR(x):
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lf = q1 - (1.5*iqr)
    uf = q3+(1.5*iqr)
    print("LF",lf)
    print("UF",uf)

In [10]:
df["employ"] = np.where(df["employ"]>25.5,25.5,df["employ"])

df["address"] = np.where(df["address"]>25.5,25.5,df["address"])

df["income"] = np.where(df["income"]>101.5,101.5,df["income"])

df["debtinc"] = np.where(df["debtinc"]>27.812,27.812,df["debtinc"])

df["creddebt"] = np.where(df["creddebt"]>4.20,4.20,df["creddebt"])

df["othdebt"] = np.where(df["othdebt"]>8.24,8.24,df["othdebt"])

df["ed"] = np.where(df["ed"]>3.5,3.5,df["ed"])

In [11]:
df.columns

Index(['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt',
       'othdebt', 'default'],
      dtype='object')

In [12]:
X = df[['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt',
       'othdebt']]
y = df["default"]

In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=55)

In [36]:
log = LogisticRegression()

In [37]:
log.fit(X_train,y_train)

In [38]:
print("Train accuracy",log.score(X_train,y_train))
print("Test accuracy",log.score(X_test,y_test))

Train accuracy 0.8
Test accuracy 0.8285714285714286


In [41]:
sample = data[data["default"].isnull()]

In [43]:
sample.drop(columns="default",inplace = True)

In [44]:
sample

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt
700,36.0,1.0,16.0,13.0,32.0,10.9,0.544128,2.943872
701,50.0,1.0,6.0,27.0,21.0,12.9,1.316574,1.392426
702,40.0,1.0,9.0,9.0,33.0,17.0,4.880700,0.729300
703,31.0,1.0,5.0,7.0,23.0,2.0,0.046000,0.414000
704,29.0,1.0,4.0,0.0,24.0,7.8,0.866736,1.005264
...,...,...,...,...,...,...,...,...
845,34.0,1.0,12.0,15.0,32.0,2.7,0.239328,0.624672
846,32.0,2.0,12.0,11.0,116.0,5.7,4.026708,2.585292
847,48.0,1.0,13.0,11.0,38.0,10.8,0.722304,3.381696
848,35.0,2.0,1.0,11.0,24.0,7.8,0.417456,1.454544


In [46]:
sample["pre_y"] = log.predict(sample)

In [47]:
sample

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,pre_y
700,36.0,1.0,16.0,13.0,32.0,10.9,0.544128,2.943872,0.0
701,50.0,1.0,6.0,27.0,21.0,12.9,1.316574,1.392426,0.0
702,40.0,1.0,9.0,9.0,33.0,17.0,4.880700,0.729300,1.0
703,31.0,1.0,5.0,7.0,23.0,2.0,0.046000,0.414000,0.0
704,29.0,1.0,4.0,0.0,24.0,7.8,0.866736,1.005264,0.0
...,...,...,...,...,...,...,...,...,...
845,34.0,1.0,12.0,15.0,32.0,2.7,0.239328,0.624672,0.0
846,32.0,2.0,12.0,11.0,116.0,5.7,4.026708,2.585292,0.0
847,48.0,1.0,13.0,11.0,38.0,10.8,0.722304,3.381696,0.0
848,35.0,2.0,1.0,11.0,24.0,7.8,0.417456,1.454544,0.0


In [48]:
from sklearn.metrics import classification_report

In [51]:
# classification_report(actual , predicted )
print(classification_report(y_train , log.predict(X_train)))
print("========================================================")
print(classification_report(y_test , log.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87       412
         1.0       0.68      0.46      0.55       148

    accuracy                           0.80       560
   macro avg       0.75      0.69      0.71       560
weighted avg       0.79      0.80      0.79       560

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89       105
         1.0       0.72      0.51      0.60        35

    accuracy                           0.83       140
   macro avg       0.79      0.72      0.75       140
weighted avg       0.82      0.83      0.82       140



In [63]:
# X_train probability
pro_train = pd.DataFrame(log.predict_proba(X_train),columns=["Pro_0","Pro_1"])

In [64]:
copy = X_train.copy()

In [65]:
copy["pre_y"] = log.predict(copy)

In [66]:
copy["mannual_y"] = np.where(pro_train["Pro_1"]>0.5,1,0)

In [67]:
copy

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,pre_y,mannual_y
518,41.0,1.0,7.0,22.0,32.0,2.9,0.072384,0.855616,0.0,0
382,39.0,2.0,2.0,12.0,46.0,16.0,4.003840,3.356160,1.0,1
103,42.0,2.0,21.0,11.0,101.5,3.1,1.365364,2.385636,0.0,0
438,32.0,2.0,8.0,2.0,45.0,3.7,0.982350,0.682650,0.0,0
293,24.0,1.0,4.0,2.0,21.0,2.6,0.099372,0.446628,0.0,0
...,...,...,...,...,...,...,...,...,...,...
37,32.0,2.0,12.0,1.0,54.0,14.4,3.195936,4.580064,0.0,0
541,26.0,1.0,10.0,2.0,32.0,4.7,0.126336,1.377664,0.0,0
295,45.0,1.0,10.0,14.0,52.0,5.4,0.932256,1.875744,0.0,0
666,44.0,2.0,21.0,14.0,101.5,13.2,3.192024,8.240000,0.0,0


In [55]:
pro_train

Unnamed: 0,Pro_1,Pro_0
0,0.980187,0.019813
1,0.138459,0.861541
2,0.992253,0.007747
3,0.865825,0.134175
4,0.895012,0.104988
...,...,...
555,0.584059,0.415941
556,0.958455,0.041545
557,0.951249,0.048751
558,0.959377,0.040623


In [76]:
actual = y_train
pre_y = np.where(pro_train["Pro_1"]>=0.3,1,0)

In [77]:
print(classification_report(actual,pre_y))

              precision    recall  f1-score   support

         0.0       0.89      0.77      0.82       412
         1.0       0.53      0.72      0.61       148

    accuracy                           0.76       560
   macro avg       0.71      0.74      0.72       560
weighted avg       0.79      0.76      0.77       560



In [69]:
print(classification_report(actual,pre_y))

              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87       412
         1.0       0.68      0.46      0.55       148

    accuracy                           0.80       560
   macro avg       0.75      0.69      0.71       560
weighted avg       0.79      0.80      0.79       560

