In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

### Several libraries are imported, including pandas, numpy, sklearn and matplotlib, which are commonly used for data analysis and machine learning tasks.

In [3]:
data = pd.read_csv(r"C:\Users\Dell\Downloads\loan_prediction.csv")

In [4]:
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### The above code reads a dataset from a CSV file loacted at the specifies file path using pandas.

In [5]:
data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [8]:
column_names = data.columns
for column in column_names:
    print(column + ' - ' + str(data[column].isnull().sum()))


Loan_ID - 0
Gender - 13
Married - 3
Dependents - 15
Education - 0
Self_Employed - 32
ApplicantIncome - 0
CoapplicantIncome - 0
LoanAmount - 22
Loan_Amount_Term - 14
Credit_History - 50
Property_Area - 0
Loan_Status - 0


In [9]:
data['Dependents'] = data['Dependents'].str.replace('3+', '3')
data['Dependents'] = pd.to_numeric(data['Dependents'], errors='coerce')


  data['Dependents'] = data['Dependents'].str.replace('3+', '3')


In [10]:
columns_to_fill = ['Gender','Married','Dependents','Self_Employed','Loan_Amount_Term','Credit_History','Education','ApplicantIncome','CoapplicantIncome','Property_Area','Loan_Status']

data = data[columns_to_fill].fillna(0)

### The above code performs  some data cleaning task: 
###                                  * It replaces 3+ in the depedant column with 3 and convertsbthe column to numeric values
###                                  * it fills missing values in various columns with zero

In [11]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Self_Employed        0
Loan_Amount_Term     0
Credit_History       0
Education            0
ApplicantIncome      0
CoapplicantIncome    0
Property_Area        0
Loan_Status          0
dtype: int64

In [12]:
data

Unnamed: 0,Gender,Married,Dependents,Self_Employed,Loan_Amount_Term,Credit_History,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,Male,No,0.0,No,360.0,1.0,Graduate,5849,0.0,Urban,Y
1,Male,Yes,1.0,No,360.0,1.0,Graduate,4583,1508.0,Rural,N
2,Male,Yes,0.0,Yes,360.0,1.0,Graduate,3000,0.0,Urban,Y
3,Male,Yes,0.0,No,360.0,1.0,Not Graduate,2583,2358.0,Urban,Y
4,Male,No,0.0,No,360.0,1.0,Graduate,6000,0.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0.0,No,360.0,1.0,Graduate,2900,0.0,Rural,Y
610,Male,Yes,0.0,No,180.0,1.0,Graduate,4106,0.0,Rural,Y
611,Male,Yes,1.0,No,360.0,1.0,Graduate,8072,240.0,Urban,Y
612,Male,Yes,2.0,No,360.0,1.0,Graduate,7583,0.0,Urban,Y


In [13]:
data['Dependents']

0      0.0
1      1.0
2      0.0
3      0.0
4      0.0
      ... 
609    0.0
610    0.0
611    1.0
612    2.0
613    0.0
Name: Dependents, Length: 614, dtype: float64

In [16]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['Gender'] = data['Gender'].astype(str)
data['Gender'] = label_encoder.fit_transform(data['Gender'])


In [17]:
data['Married'] = data['Married'].astype(str)
data['Education'] = data['Education'].astype(str)
data['Self_Employed'] = data['Self_Employed'].astype(str)
data['Property_Area'] = data['Property_Area'].astype(str)
data['Loan_Status'] = data['Loan_Status'].astype(str)

# Apply label encoding
data['Married'] = label_encoder.fit_transform(data['Married'])
data['Education'] = label_encoder.fit_transform(data['Education'])
data['Self_Employed'] = label_encoder.fit_transform(data['Self_Employed'])
data['Property_Area'] = label_encoder.fit_transform(data['Property_Area'])
data['Loan_Status'] = label_encoder.fit_transform(data['Loan_Status'])
#data['Married'] = label_encoder.fit_transform(data['Married'])


In [19]:
X = data.drop(['Loan_Status'], axis=1)
y = data['Loan_Status']

X

Unnamed: 0,Gender,Married,Dependents,Self_Employed,Loan_Amount_Term,Credit_History,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,2,1,0.0,1,360.0,1.0,0,5849,0.0,2
1,2,2,1.0,1,360.0,1.0,0,4583,1508.0,0
2,2,2,0.0,2,360.0,1.0,0,3000,0.0,2
3,2,2,0.0,1,360.0,1.0,1,2583,2358.0,2
4,2,1,0.0,1,360.0,1.0,0,6000,0.0,2
...,...,...,...,...,...,...,...,...,...,...
609,1,1,0.0,1,360.0,1.0,0,2900,0.0,0
610,2,2,0.0,1,180.0,1.0,0,4106,0.0,0
611,2,2,1.0,1,360.0,1.0,0,8072,240.0,2
612,2,2,2.0,1,360.0,1.0,0,7583,0.0,2


In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust n_estimators and other hyperparameters
rf_regressor.fit(X_train, y_train)
rf_regressor.score(X_train, y_train)

# A random forest regressor model is created and fitted to the trainning data. this model is used for regression task

0.8712876231406256

In [22]:
y_pred = rf_regressor.predict(X_test)
y_pred

array([0.84, 0.28, 0.52, 0.65, 0.95, 0.76, 0.98, 0.83, 0.87, 0.45, 0.59,
       0.78, 0.56, 0.64, 0.92, 0.29, 0.6 , 0.74, 0.96, 0.79, 0.98, 0.97,
       0.19, 1.  , 0.83, 0.28, 0.88, 0.75, 0.85, 0.57, 0.93, 0.84, 1.  ,
       0.97, 0.5 , 0.32, 0.85, 0.96, 0.94, 0.8 , 1.  , 0.77, 0.98, 1.  ,
       0.75, 0.92, 0.17, 0.41, 0.29, 0.13, 0.39, 0.99, 0.49, 0.46, 0.98,
       0.7 , 0.86, 0.74, 0.25, 0.98, 0.33, 0.31, 0.64, 0.45, 0.86, 0.97,
       0.81, 0.85, 0.96, 0.78, 0.32, 0.92, 0.91, 0.87, 0.34, 0.91, 0.93,
       0.7 , 0.39, 0.59, 0.72, 0.77, 0.99, 0.53, 0.84, 0.85, 0.98, 0.33,
       0.94, 0.5 , 0.94, 0.09, 0.42, 0.98, 0.79, 0.79, 0.31, 0.32, 0.63,
       0.88, 0.99, 0.97, 0.87, 0.46, 0.26, 0.37, 0.98, 0.48, 0.92, 0.93,
       0.64, 0.38, 0.89, 0.95, 0.12, 0.92, 0.81, 0.85, 0.4 , 0.39, 0.41,
       0.75, 0.7 ])

In [23]:
y_test

350    1
377    1
163    1
609    1
132    1
      ..
231    1
312    1
248    1
11     1
333    1
Name: Loan_Status, Length: 123, dtype: int32

In [24]:
print('Training set score: {:.4f}'.format(rf_regressor.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(rf_regressor.score(X_test, y_test)))

Training set score: 0.8713
Test set score: 0.0682


###