In [2]:
import gc
import json
import os
import warnings

import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Location of the raw loan-approval CSV, relative to the notebook's working directory.
data_path = "../datasets/loan_approval_dataset_updated.csv"

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Apply seaborn's "whitegrid" style to figures.
sns.set_style(style="whitegrid")

In [3]:
# Read the semicolon-delimited CSV, decoding its bytes as Latin-1.
df = pd.read_csv(data_path, sep=";", encoding="latin-1")

# Normalise header names: trim surrounding whitespace and lower-case them.
df = df.rename(columns=lambda name: name.strip().lower())

# Preview the first five rows.
df.head()

Unnamed: 0,loan_id,no_of_dependents,city,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Ankara,Graduate,No,9600000,29900000,12,778.0,2400000.0,17600000,22700000,8000000,Approved
1,2,0,Ankara,Not Graduate,Yes,4100000,12200000,8,417.0,2700000.0,2200000,8800000,3300000,Rejected
2,3,3,Ankara,Graduate,No,9100000,29700000,20,,7100000.0,4500000,33300000,12800000,Rejected
3,4,3,Ankara,Graduate,No,8200000,30700000,8,,18200000.0,3300000,23300000,7900000,Rejected
4,5,5,Ankara,Not Graduate,Yes,9800000,24200000,20,,12400000.0,8200000,29400000,5000000,Rejected


In [4]:
# Rewrite the two city spellings that start with "Ý" to plain ASCII names.
# NOTE(review): the "Ý" presumably comes from decoding a Turkish dotted capital I
# as Latin-1 in the read_csv call -- confirm the file's real encoding.
df["city"] = df["city"].replace({"Ýstanbul": "Istanbul", "Ýzmir": "Izmir"})

# Strip surrounding whitespace from the padded categorical columns. This covers
# the " Graduate" / " Not Graduate" / " No" / " Yes" values handled previously
# and any other padded variant, so stray spaces can never produce an extra
# category when the columns are label-encoded.
for col in ["education", "self_employed"]:
    df[col] = df[col].str.strip()

In [6]:
cat_cols = ["city", "education", "self_employed"]
target = "loan_status"

# Integer-encode every categorical feature and the target in place.
# Each fitted encoder is kept in `encoders`, keyed by column name, so the
# string <-> integer mapping (`encoders[col].classes_`) stays available for
# encoding new inputs and decoding predictions; previously every encoder was
# discarded as soon as the next loop iteration started.
# NOTE: re-running this cell on the already-encoded frame refits the encoders
# on integers; reload the data first if the original labels are needed.
encoders = {}
for col in cat_cols + [target]:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc

# Preview the encoded frame.
df.head()

Unnamed: 0,loan_id,no_of_dependents,city,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,0,9600000,29900000,12,778.0,2400000.0,17600000,22700000,8000000,0
1,2,0,0,1,1,4100000,12200000,8,417.0,2700000.0,2200000,8800000,3300000,1
2,3,3,0,0,0,9100000,29700000,20,,7100000.0,4500000,33300000,12800000,1
3,4,3,0,0,0,8200000,30700000,8,,18200000.0,3300000,23300000,7900000,1
4,5,5,0,1,1,9800000,24200000,20,,12400000.0,8200000,29400000,5000000,1


In [12]:
# Column holding the label the classifier is trained to predict.
target = "loan_status"

# Feature columns passed to the model, in this order.
input_cols = [
    "no_of_dependents",
    "city",
    "education",
    "self_employed",
    "income_annum",
    "loan_amount",
    "loan_term",
    "cibil_score",
    "residential_assets_value",
    "commercial_assets_value",
    "luxury_assets_value",
    "bank_asset_value",
]

In [52]:
# Sanity check: number of feature columns listed in `input_cols`.
len(input_cols)

12

In [39]:
# Fix the seed so the fitted forest, the pickled file and the reported
# probabilities are identical on every Restart & Run All; without it each run
# builds a different model.
model = RandomForestClassifier(random_state=42)

# Fit on every row of the frame (no hold-out split is made here).
# NOTE(review): the previewed rows show missing `cibil_score` values; confirm
# the installed scikit-learn version accepts NaN for this estimator.
model.fit(df[input_cols], df[target])

In [41]:
# Persist the fitted model. The target directory is created first so the dump
# does not fail with FileNotFoundError on a fresh checkout without `../models`.
model_path = '../models/random_forest_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['../models/random_forest_model.pkl']

In [51]:
# Export the first row's feature values as a JSON object (column name -> value).
# The "records" orientation is used so each value keeps its own column's type.
# `json` is imported in the notebook's top import cell rather than mid-notebook.
sample_record = df[input_cols].head(1).to_dict("records")[0]

with open('../datasets/sample_input.json', 'w') as json_file:
    json.dump(sample_record, json_file, indent=4)

In [8]:
# Reload the estimator from the path used by the joblib.dump cell, rebinding
# `model` to the deserialised copy.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model = joblib.load('../models/random_forest_model.pkl')

In [13]:
# Show the feature columns of the first row as a one-row DataFrame.
df[input_cols].head(1)

Unnamed: 0,no_of_dependents,city,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,0,0,0,9600000,29900000,12,778.0,2400000.0,17600000,22700000,8000000


In [15]:
# Class probabilities for the first row; columns follow `model.classes_`.
model.predict_proba(df[input_cols].head(1))

array([[0.91, 0.09]])