In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

In [34]:
data=pd.read_csv('loan_dataset.csv')

In [35]:

data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [36]:
data.shape

(614, 13)

In [37]:

data.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [38]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [39]:
categorical_variables=['Gender','Married','Dependents','Education','Self_Employed','Credit_History', 'Property_Area', 'Loan_Status']
print('Categorical variable are :\n')
for i in range (len(categorical_variables)):
    print(categorical_variables[i])

Categorical variable are :

Gender
Married
Dependents
Education
Self_Employed
Credit_History
Property_Area
Loan_Status


In [40]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [41]:
categorialMissingFeats=["Gender","Married","Dependents","Self_Employed","Credit_History"]

for feat in categorialMissingFeats:
    data.loc[data[feat].isnull(),feat]=data[feat].mode()[0]



In [42]:
data.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [43]:
continousMissingFeats=["LoanAmount","Loan_Amount_Term"]

for feat in continousMissingFeats:
    data.loc[data[feat].isnull(),feat]=data[feat].median()

In [44]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [45]:
dummies1=pd.get_dummies(data['Gender'])
dummies2=pd.get_dummies(data['Married'])

In [46]:
dummies2

Unnamed: 0,No,Yes
0,True,False
1,False,True
2,False,True
3,False,True
4,True,False
...,...,...
609,True,False
610,False,True
611,False,True
612,False,True


In [47]:
dummies2=dummies2.rename(columns = {'Yes':'Married_yes','No':'NotMarried'})

In [48]:
dummies3=pd.get_dummies(data['Dependents'])
dummies3=dummies3.rename(columns = {'0': 'Dependents_0','1': 'Dependents_1','2': 'Dependents_2','3+': 'Dependents_3+'})

In [49]:
dummies4=pd.get_dummies(data['Education'])
dummies5=pd.get_dummies(data['Self_Employed'])
dummies5=dummies5.rename(columns = {'Yes':'Self_Employed_yes','No':'NotSelf_Employed'})

In [50]:
dummies6=pd.get_dummies(data['Property_Area'])
dummies6

Unnamed: 0,Rural,Semiurban,Urban
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
609,True,False,False
610,True,False,False
611,False,False,True
612,False,False,True


In [51]:
dummies=[dummies1,dummies2,dummies3,dummies4,dummies5,dummies6]
for dummy in dummies:
    data= pd.concat([data,dummy],axis=1)

In [52]:
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Dependents_1,Dependents_2,Dependents_3+,Graduate,Not Graduate,NotSelf_Employed,Self_Employed_yes,Rural,Semiurban,Urban
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,...,False,False,False,True,False,True,False,False,False,True
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,...,True,False,False,True,False,True,False,True,False,False
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,...,False,False,False,True,False,False,True,False,False,True
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,...,False,False,False,False,True,True,False,False,False,True
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,...,False,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,...,False,False,False,True,False,True,False,True,False,False
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,...,False,False,True,True,False,True,False,True,False,False
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,...,True,False,False,True,False,True,False,False,False,True
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,...,False,True,False,True,False,True,False,False,False,True


In [53]:
data=data.drop(['Loan_ID','Gender','Married','Property_Area','Dependents','Self_Employed','Education','NotMarried','NotSelf_Employed','Not Graduate','Female'] , axis=1)

In [54]:
data.loc[data['Loan_Status']=="Y",'Loan_Status']=1
data.loc[data['Loan_Status']=="N",'Loan_Status']=0

In [55]:
data['Loan_Status']

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: object

In [56]:
data.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Male,Married_yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Graduate,Self_Employed_yes,Rural,Semiurban,Urban
0,5849,0.0,128.0,360.0,1.0,1,True,False,True,False,False,False,True,False,False,False,True
1,4583,1508.0,128.0,360.0,1.0,0,True,True,False,True,False,False,True,False,True,False,False
2,3000,0.0,66.0,360.0,1.0,1,True,True,True,False,False,False,True,True,False,False,True
3,2583,2358.0,120.0,360.0,1.0,1,True,True,True,False,False,False,False,False,False,False,True
4,6000,0.0,141.0,360.0,1.0,1,True,False,True,False,False,False,True,False,False,False,True
5,5417,4196.0,267.0,360.0,1.0,1,True,True,False,False,True,False,True,True,False,False,True
6,2333,1516.0,95.0,360.0,1.0,1,True,True,True,False,False,False,False,False,False,False,True
7,3036,2504.0,158.0,360.0,0.0,0,True,True,False,False,False,True,True,False,False,True,False
8,4006,1526.0,168.0,360.0,1.0,1,True,True,False,False,True,False,True,False,False,False,True
9,12841,10968.0,349.0,360.0,1.0,0,True,True,False,True,False,False,True,False,False,True,False


In [57]:

X=data.drop(columns='Loan_Status')
Y=pd.DataFrame(data['Loan_Status'])

In [100]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.15,random_state=3)

In [101]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ApplicantIncome    614 non-null    int64  
 1   CoapplicantIncome  614 non-null    float64
 2   LoanAmount         614 non-null    float64
 3   Loan_Amount_Term   614 non-null    float64
 4   Credit_History     614 non-null    float64
 5   Loan_Status        614 non-null    object 
 6   Male               614 non-null    bool   
 7   Married_yes        614 non-null    bool   
 8   Dependents_0       614 non-null    bool   
 9   Dependents_1       614 non-null    bool   
 10  Dependents_2       614 non-null    bool   
 11  Dependents_3+      614 non-null    bool   
 12  Graduate           614 non-null    bool   
 13  Self_Employed_yes  614 non-null    bool   
 14  Rural              614 non-null    bool   
 15  Semiurban          614 non-null    bool   
 16  Urban              614 non

In [102]:
X_train.replace({True:1,False:0},inplace=True)
X_test.replace({True:1,False:0},inplace=True)
Y_train['Loan_Status']=Y_train['Loan_Status'].astype("int")
Y_test['Loan_Status']=Y_test['Loan_Status'].astype("int")

  X_train.replace({True:1,False:0},inplace=True)
  X_test.replace({True:1,False:0},inplace=True)


In [107]:
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_features=15,
    max_depth=5,
    bootstrap=True)

In [108]:
classifier.fit(X=X_train,y=Y_train)

  return fit_method(estimator, *args, **kwargs)


In [109]:
classifier.fit(X_train,Y_train)
predictions = classifier.predict(X_test)
accuracyScores = accuracy_score(predictions, Y_test)
print(accuracyScores)

  return fit_method(estimator, *args, **kwargs)


0.8709677419354839


In [110]:
X_train.shape

(521, 16)

In [111]:
filename = 'Random_Forest.sav'
pickle.dump(obj=classifier, file=open(file=filename, mode='wb'))