# Loan Status Prediction -- Logistic Regression

## Data:

The data used here is taken from the analytics vidhya website. You can find the source there.

### Importing packages

In [126]:
#importing necessary packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")

### Reading the training dataset

In [76]:
# read the dataset
df_loantrain = pd.read_csv(r'C:\Users\Srividhya\Desktop\loan dataset\train_ctrUa4K.csv')
print(df_loantrain.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [77]:
print('\n\nColumn Names\n\n')
print(df_loantrain.columns)



Column Names


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [80]:
df_loantrain.shape

(614, 13)

### Data exploration and manipulation

In [82]:
df_loantrain['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [83]:
df_loantrain['Gender'].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [84]:
df_loantrain['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [85]:
df_loantrain['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [86]:
df_loantrain['Education'].value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [87]:
df_loantrain['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [88]:
df_loantrain['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [89]:
df_loantrain['Property_Area'].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [90]:
df_loantrain.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Filling the missing values

In [94]:
df_loantrain['Gender'].fillna(df_loantrain['Gender'].mode()[0],inplace=True)
df_loantrain['Married'].fillna(df_loantrain['Married'].mode()[0],inplace=True)
df_loantrain['Dependents'].fillna(df_loantrain['Dependents'].mode()[0],inplace=True)
df_loantrain['Self_Employed'].fillna(df_loantrain['Self_Employed'].mode()[0],inplace=True)
df_loantrain['Credit_History'].fillna(df_loantrain['Credit_History'].mode()[0],inplace=True)

In [95]:
df_loantrain['LoanAmount'].fillna(df_loantrain['LoanAmount'].median(),inplace=True)

In [96]:
df_loantrain['Loan_Amount_Term'].value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
36.0       2
60.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [97]:
df_loantrain['Loan_Amount_Term'].fillna(df_loantrain['Loan_Amount_Term'].mode()[0],inplace=True)

In [98]:
df_loantrain.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [99]:
df_loantrain.apply(lambda x: len(x.unique()))

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

### Splitting the training data into train-test-split

In [104]:
train2=df_loantrain.drop('Loan_ID',axis=1)

In [105]:
y = train2.Loan_Status

In [106]:
X=train2.drop('Loan_Status',1)

In [108]:
X.shape

(614, 11)

In [109]:
y.head(5)

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

In [110]:
X.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


### Converting categorical variables into numerical data

In [111]:
X = pd.get_dummies(X)
train2 = pd.get_dummies(train2)

In [112]:
X.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,128.0,360.0,1.0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1
1,4583,1508.0,128.0,360.0,1.0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0
2,3000,0.0,66.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1


### Creating new variables with the help of already existing variables

In [113]:
train2['TotalIncome'] = train2['ApplicantIncome'] + train2['CoapplicantIncome']

In [116]:
train2['TotalIncome_log'] = np.log(train2['TotalIncome']) 

#### Dropping the used variables which helped in developing new variables

In [118]:
train2 = train2.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], axis=1)

In [120]:
train2.head(10)

Unnamed: 0,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,...,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y,TotalIncome,TotalIncome_log
0,128.0,360.0,1.0,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,1,5849.0,8.674026
1,128.0,360.0,1.0,0,1,0,1,0,1,0,...,0,1,0,1,0,0,1,0,6091.0,8.714568
2,66.0,360.0,1.0,0,1,0,1,1,0,0,...,0,0,1,0,0,1,0,1,3000.0,8.006368
3,120.0,360.0,1.0,0,1,0,1,1,0,0,...,1,1,0,0,0,1,0,1,4941.0,8.505323
4,141.0,360.0,1.0,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,1,6000.0,8.699515
5,267.0,360.0,1.0,0,1,0,1,0,0,1,...,0,0,1,0,0,1,0,1,9613.0,9.170872
6,95.0,360.0,1.0,0,1,0,1,1,0,0,...,1,1,0,0,0,1,0,1,3849.0,8.255569
7,158.0,360.0,0.0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,1,0,5540.0,8.61975
8,168.0,360.0,1.0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,1,5532.0,8.618305
9,349.0,360.0,1.0,0,1,0,1,0,1,0,...,0,1,0,0,1,0,1,0,23809.0,10.077819


### Building model using Logistic Regression and K-Fold cross-validation

In [135]:
score1 = 0
i = 1
kf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
for train, test in kf.split(X,y):
    print("\n{} of K-fold {}".format(i,kf.n_splits))
    train_X, val_X = X.loc[train],X.loc[test]
    train_y, val_y = y[train], y[test]
    model1 = LogisticRegression(random_state=1)
    model1.fit(train_X, train_y)
    predictions1 = model1.predict(val_X)
    scoree = accuracy_score(val_y, predictions1)
    score1 = scoree + score1
    print('Accuracy score ',scoree)
    i+=1
print("\n Average Accuracy score: ", (score1/10))


1 of K-fold 10
Accuracy score  0.7777777777777778

2 of K-fold 10
Accuracy score  0.8095238095238095

3 of K-fold 10
Accuracy score  0.8032786885245902

4 of K-fold 10
Accuracy score  0.819672131147541

5 of K-fold 10
Accuracy score  0.7868852459016393

6 of K-fold 10
Accuracy score  0.8360655737704918

7 of K-fold 10
Accuracy score  0.8032786885245902

8 of K-fold 10
Accuracy score  0.7704918032786885

9 of K-fold 10
Accuracy score  0.819672131147541

10 of K-fold 10
Accuracy score  0.8360655737704918

 Average Accuracy score:  0.8062711423367162


#### The accuracy score of the built model is 80.62% 
With this accuracy rate, we are going to predict Loan_Status for test dataset

### Reading the test data

In [138]:
test_df = pd.read_csv(r'C:\Users\Srividhya\Desktop\loan dataset\test_lAUu6dG.csv')

In [139]:
test_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [140]:
test_df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [141]:
test_df.shape

(367, 12)

### Exploratory data analysis and data manipulation in test.csv dataset

In [143]:
test_df['Gender'].value_counts()

Male      286
Female     70
Name: Gender, dtype: int64

In [149]:
test_df['Married'].value_counts()

Yes    233
No     134
Name: Married, dtype: int64

In [150]:
test_df['Dependents'].value_counts()

0     200
2      59
1      58
3+     40
Name: Dependents, dtype: int64

In [151]:
test_df['Education'].value_counts()

Graduate        283
Not Graduate     84
Name: Education, dtype: int64

In [152]:
test_df['Self_Employed'].value_counts()

No     307
Yes     37
Name: Self_Employed, dtype: int64

In [153]:
test_df['Credit_History'].value_counts()

1.0    279
0.0     59
Name: Credit_History, dtype: int64

In [155]:
test_df['Property_Area'].value_counts()

Urban        140
Semiurban    116
Rural        111
Name: Property_Area, dtype: int64

In [156]:
test_df.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

### Filling the missing data

In [157]:
test_df['Gender'].fillna(test_df['Gender'].mode()[0],inplace=True)
test_df['Married'].fillna(test_df['Married'].mode()[0],inplace=True)
test_df['Dependents'].fillna(test_df['Dependents'].mode()[0],inplace=True)
test_df['Self_Employed'].fillna(test_df['Self_Employed'].mode()[0],inplace=True)
test_df['Credit_History'].fillna(test_df['Credit_History'].mode()[0],inplace=True)

In [158]:
test_df['LoanAmount'].fillna(test_df['LoanAmount'].median(),inplace=True)

In [159]:
test_df['Loan_Amount_Term'].value_counts()

360.0    311
180.0     22
480.0      8
300.0      7
240.0      4
84.0       3
6.0        1
120.0      1
36.0       1
350.0      1
12.0       1
60.0       1
Name: Loan_Amount_Term, dtype: int64

In [162]:
test_df['Loan_Amount_Term'].fillna(test_df['Loan_Amount_Term'].mode()[0],inplace=True)

In [163]:
test_df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [164]:
test_df.apply(lambda x: len(x.unique()))

Loan_ID              367
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      314
CoapplicantIncome    194
LoanAmount           144
Loan_Amount_Term      12
Credit_History         2
Property_Area          3
dtype: int64

In [165]:
test2=test_df.drop('Loan_ID',axis=1)

### Converting categorical variables in test data into numerical/non-categorical data

In [166]:
test2=pd.get_dummies(test2)

#### Creating new variables

In [167]:
test2['TotalIncome'] = test2['ApplicantIncome'] + test2['CoapplicantIncome']

In [168]:
test2['TotalIncome_log'] = np.log(test2['TotalIncome']) 

#### Dropping the old variables used to create new ones

In [169]:
test2 = test2.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], axis=1)

### Prediction of Loan_Status for test data using the model developed

In [170]:
pred_test = model1.predict(test2)

In [184]:
pred_test[:]

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [185]:
pred_skl = model1.predict_proba(val_X)[:,1] ## probability of prediction nearest to 1

In [186]:
pred_skl[:]

array([0.83104835, 0.85383564, 0.61754945, 0.13751318, 0.7804063 ,
       0.83286192, 0.78792725, 0.79950669, 0.20652804, 0.79438593,
       0.71226513, 0.71271287, 0.69244558, 0.76966531, 0.72003688,
       0.82189792, 0.77580627, 0.39397402, 0.59586444, 0.20466423,
       0.64514865, 0.81560574, 0.67939746, 0.72547988, 0.90678644,
       0.08552628, 0.82837151, 0.83358356, 0.83298353, 0.69837629,
       0.84226914, 0.90310745, 0.71613135, 0.80730267, 0.7437864 ,
       0.22286676, 0.14321793, 0.54697925, 0.79870308, 0.65021776,
       0.18012477, 0.90948699, 0.78158362, 0.85835145, 0.83068781,
       0.68471809, 0.21777333, 0.79680799, 0.6223995 , 0.7221497 ,
       0.66152776, 0.61810386, 0.88006767, 0.84833552, 0.77056723,
       0.70445541, 0.83583999, 0.89740714, 0.68969788, 0.6729803 ,
       0.78233296])

In [187]:
output = pd.DataFrame({'Loan_ID': test_df.Loan_ID,
                      'Loan_Status': pred_test})

In [188]:
output.to_csv('solution.csv', index=False)

## Conclusion:

This model can be improved or the accuracy rate can also be improved by building other models. 

### Thank you :)