In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,accuracy_score
import pandas as pd

In [2]:
df = pd.read_csv('loan_approval_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                1000 non-null   int64 
 1   Salary             1000 non-null   int64 
 2   Credit_Score       1000 non-null   int64 
 3   Loan_Amount        1000 non-null   int64 
 4   Loan_Term          1000 non-null   object
 5   Employment_Status  1000 non-null   object
 6   Residence_Type     1000 non-null   object
 7   Previous_Default   1000 non-null   object
 8   Loan_Approved      1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


In [4]:
X = df.drop('Loan_Approved',axis=1)
y = df.Loan_Approved

In [5]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,train_size=0.8,random_state=42)

In [6]:
# model = LogisticRegression()
# model.fit(xtrain,ytrain)

In [7]:
#preprocessing

def num_encode_concat(df):
    num_col = df.select_dtypes(include="number")
    obj_col = df.select_dtypes(exclude="number")
    encode = OneHotEncoder(sparse_output=False)
    obj_values = encode.fit_transform(obj_col)
    obj_col = pd.DataFrame(obj_values,columns=encode.get_feature_names_out())
    res = pd.concat([num_col.reset_index(drop=True),obj_col.reset_index(drop=True)],axis=1)
    return res

In [8]:
xtrain = num_encode_concat(xtrain)
xtest = num_encode_concat(xtest)

In [9]:
ytrain

29      No
535     No
695     No
557     No
836    Yes
      ... 
106     No
270     No
860     No
435    Yes
102    Yes
Name: Loan_Approved, Length: 800, dtype: object

In [10]:
model = LogisticRegression(max_iter=1000,penalty=None)
model.fit(xtrain,ytrain)
model.coef_

array([[-7.54042238e-03, -2.49509013e-06, -2.34758636e-04,
         1.62644196e-06, -5.64085797e-02,  8.49109746e-02,
         7.87937376e-02,  1.53683863e-01,  5.45754897e-02,
         2.24663761e-01, -1.82592774e-02, -2.40265111e-02,
         1.25940511e-01,  1.59066004e-01,  2.09230365e-01,
         5.17496258e-02]])

In [11]:
model = LogisticRegression()
model.fit(xtrain,ytrain)
model.coef_

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[-4.04650306e-03, -1.60977279e-06,  1.15249244e-04,
         4.16239279e-06, -6.90962885e-02,  2.84443242e-02,
         1.52430623e-02,  7.56232082e-02, -2.75513731e-02,
         1.44960908e-01, -6.71952287e-02, -7.26453199e-02,
         4.75419833e-02,  7.53176429e-02,  1.39744000e-01,
        -8.95296941e-02]])

In [12]:
model = LogisticRegression(max_iter=1000,penalty='l1',solver='liblinear')
model.fit(xtrain,ytrain)
model.coef_

array([[-6.19393192e-03, -2.19352167e-06, -1.09982787e-04,
         2.45615001e-06, -1.06072087e-01,  0.00000000e+00,
         0.00000000e+00,  6.02382000e-02,  3.45367189e-02,
         2.10356791e-01,  0.00000000e+00,  0.00000000e+00,
         1.10483511e-01,  1.49879252e-01,  1.49762535e-01,
         0.00000000e+00]])

In [13]:
model.score(xtrain,ytrain)

0.53875

In [14]:
model.score(xtest,ytest)

0.495

In [15]:
ypred = model.predict(xtrain)

In [16]:
ytrain.value_counts()
'''
No - 0
Yes - 1
'''

'\nNo - 0\nYes - 1\n'

In [17]:
confusion_matrix(ytrain,ypred)

array([[205, 192],
       [177, 226]])

In [18]:
precision_score(ytrain,ypred,pos_label="Yes")

0.5406698564593302

In [None]:
precision_score(ytrain,ypred,labels=['Yes','No'],pos_label='Yes')

0.5406698564593302

In [39]:
precision_score(ytrain,ypred,labels=['Yes','No'],pos_label='Yes',average='micro')



0.53875

In [38]:
precision_score(ytrain,ypred,pos_label='Yes')

0.5406698564593302

In [19]:
precision_score(ytrain,ypred,pos_label="No")

0.5366492146596858

In [20]:
recall_score(ytrain,ypred,pos_label='Yes')

0.5607940446650124

In [21]:
recall_score(ytrain,ypred,pos_label='No')

0.5163727959697733

In [22]:
confusion_matrix(ytrain,ypred,labels=['Yes','No'])

array([[226, 177],
       [192, 205]])

In [23]:
model.intercept_

array([0.21701525])

In [24]:
model.coef_

array([[-6.19393192e-03, -2.19352167e-06, -1.09982787e-04,
         2.45615001e-06, -1.06072087e-01,  0.00000000e+00,
         0.00000000e+00,  6.02382000e-02,  3.45367189e-02,
         2.10356791e-01,  0.00000000e+00,  0.00000000e+00,
         1.10483511e-01,  1.49879252e-01,  1.49762535e-01,
         0.00000000e+00]])

In [25]:
ytrain.value_counts()

Loan_Approved
Yes    403
No     397
Name: count, dtype: int64

In [26]:
f1_score(ytrain,ypred,pos_label="Yes")

0.5505481120584653

In [27]:
(0.5370138017565872+0.5404732254047323)/2


0.5387435135806597

In [28]:
f1_score(ytrain,ypred,pos_label="No")


0.5263157894736842

In [29]:
accuracy_score(ytrain,ypred)

0.53875

In [30]:
xtrain.corr()

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term_12 months,Loan_Term_24 months,Loan_Term_36 months,Loan_Term_48 months,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes
Age,1.0,0.041491,-0.027405,0.01782,0.020413,0.006698,0.003907,-0.031548,0.060324,0.023668,-0.082121,-0.096849,0.021086,0.077031,0.024481,-0.024481
Salary,0.041491,1.0,-0.026146,-0.022052,0.082948,-0.044094,-0.041489,0.001365,0.022633,0.017856,-0.039669,-0.033147,0.06397,-0.030943,-0.035479,0.035479
Credit_Score,-0.027405,-0.026146,1.0,-0.0028,-0.024819,-0.004394,0.038482,-0.00853,0.014264,-0.064383,0.049645,-0.027589,0.045489,-0.017923,-0.010338,0.010338
Loan_Amount,0.01782,-0.022052,-0.0028,1.0,-0.04186,-0.019876,0.052377,0.01075,0.004037,-0.012601,0.008503,0.026412,0.004943,-0.031805,-0.003898,0.003898
Loan_Term_12 months,0.020413,0.082948,-0.024819,-0.04186,1.0,-0.346787,-0.334237,-0.33652,0.03199,0.017179,-0.048117,-0.025462,0.056272,-0.030971,-0.020759,0.020759
Loan_Term_24 months,0.006698,-0.044094,-0.004394,-0.019876,-0.346787,1.0,-0.329895,-0.332148,-0.021678,-0.005053,0.026104,-0.017345,0.040467,-0.023253,-0.01475,0.01475
Loan_Term_36 months,0.003907,-0.041489,0.038482,0.052377,-0.334237,-0.329895,1.0,-0.320128,0.043753,-0.03828,-0.004843,0.030152,-0.065955,0.035987,0.018257,-0.018257
Loan_Term_48 months,-0.031548,0.001365,-0.00853,0.01075,-0.33652,-0.332148,-0.320128,1.0,-0.054275,0.025724,0.027484,0.013574,-0.032847,0.019386,0.017989,-0.017989
Employment_Status_Employed,0.060324,0.022633,0.014264,0.004037,0.03199,-0.021678,0.043753,-0.054275,1.0,-0.479995,-0.500422,0.007633,-0.0043,-0.003407,-0.037267,0.037267
Employment_Status_Self-Employed,0.023668,0.017856,-0.064383,-0.012601,0.017179,-0.005053,-0.03828,0.025724,-0.479995,1.0,-0.519326,0.005004,0.033111,-0.038514,0.023071,-0.023071


In [31]:
features = num_encode_concat(X)
y = y.map({"Yes":1,"No":0})

In [32]:
df = pd.concat([features,y],axis=1)

In [33]:
df

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term_12 months,Loan_Term_24 months,Loan_Term_36 months,Loan_Term_48 months,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes,Loan_Approved
0,56,136748,584,38209,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,46,25287,815,27424,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1
2,32,146593,398,42396,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1
3,60,54387,696,11370,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
4,25,28512,788,14528,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22,49241,500,41020,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
996,40,116214,423,12415,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
997,27,64569,300,28155,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1
998,61,31745,490,48884,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1


In [34]:
df.corr()['Loan_Approved']

Age                               -0.020147
Salary                            -0.073752
Credit_Score                      -0.009898
Loan_Amount                        0.037471
Loan_Term_12 months               -0.027735
Loan_Term_24 months                0.009141
Loan_Term_36 months               -0.006957
Loan_Term_48 months                0.026667
Employment_Status_Employed         0.002249
Employment_Status_Self-Employed   -0.011230
Employment_Status_Unemployed       0.008834
Residence_Type_Mortgage           -0.036869
Residence_Type_Owned               0.018644
Residence_Type_Rented              0.018742
Previous_Default_No                0.035052
Previous_Default_Yes              -0.035052
Loan_Approved                      1.000000
Name: Loan_Approved, dtype: float64

In [40]:
from sklearn.compose import ColumnTransformer

In [41]:
df

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term_12 months,Loan_Term_24 months,Loan_Term_36 months,Loan_Term_48 months,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes,Loan_Approved
0,56,136748,584,38209,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,46,25287,815,27424,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1
2,32,146593,398,42396,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1
3,60,54387,696,11370,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
4,25,28512,788,14528,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22,49241,500,41020,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
996,40,116214,423,12415,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
997,27,64569,300,28155,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1
998,61,31745,490,48884,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1
