# Loan approval minimal model

This model aims at providing a minimal classification model with numerical-only features for a loan approval.
Let's star by important the raw data into a `Pandas` dataframe.

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [11]:
df = pd.read_csv("../data/raw/train.csv")

In [12]:
df.head(20)

Unnamed: 0,Key,Male,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,?,40.83,3.5,u,g,i,bb,0.5,f,f,0,f,s,1160,0,-
1,2,?,32.25,1.5,u,g,c,v,0.25,f,f,0,t,g,372,122,-
2,3,?,28.17,0.585,u,g,aa,v,0.04,f,f,0,f,g,260,1004,-
3,4,?,29.75,0.665,u,g,w,v,0.25,f,f,0,t,g,300,0,-
4,5,?,26.5,2.71,y,p,?,?,0.085,f,f,0,f,s,80,0,-
5,6,?,45.33,1.0,u,g,q,v,0.125,f,f,0,t,g,263,0,-
6,7,?,42.25,1.75,y,p,?,?,0.0,f,f,0,t,g,150,1,-
7,8,?,33.17,2.25,y,p,cc,v,3.5,f,f,0,t,g,200,141,-
8,9,a,38.58,5.0,u,g,cc,v,13.5,t,f,0,t,g,980,0,-
9,10,a,19.17,0.585,y,p,aa,v,0.585,t,f,0,t,g,160,0,-


At this point we will remove any rows with empty values.

In [13]:
df = df.dropna()

We can see from the dataframe above that some values in the `Age` variable are not empty, but have a string value in a otherwise numerical column.
We will also drop those.

In [14]:
df = df[pd.to_numeric(df['Age'], errors='coerce').notnull()]

We will now select only the numerical features. These are:

- `Age`
- `Debt`
- `YearsEmployed`
- `Income`

In [15]:
inputs = df[['Age', 'Debt', 'YearsEmployed', 'Income']]

In [16]:
inputs.head()

Unnamed: 0,Age,Debt,YearsEmployed,Income
0,40.83,3.5,0.5,0
1,32.25,1.5,0.25,122
2,28.17,0.585,0.04,1004
3,29.75,0.665,0.25,0
4,26.5,2.71,0.085,0


In [17]:
inputs.describe()

Unnamed: 0,Debt,YearsEmployed,Income
count,578.0,578.0,578.0
mean,4.62846,2.0691,921.179931
std,4.870476,3.199962,5072.488441
min,0.0,0.0,0.0
25%,1.0,0.165,0.0
50%,2.71,1.0,4.0
75%,7.0,2.5,363.0
max,26.335,20.0,100000.0


We can see that the `Age` variable is not being considered as a continuous numerical variable, so we'll cast the dataframe column to `float64`.

In [18]:
inputs['Age'] = inputs['Age'].astype('float64')

In [19]:
inputs.describe()

Unnamed: 0,Age,Debt,YearsEmployed,Income
count,578.0,578.0,578.0,578.0
mean,31.25827,4.62846,2.0691,921.179931
std,12.014247,4.870476,3.199962,5072.488441
min,13.75,0.0,0.0,0.0
25%,22.5,1.0,0.165,0.0
50%,28.125,2.71,1.0,4.0
75%,37.5,7.0,2.5,363.0
max,80.25,26.335,20.0,100000.0


In [20]:
inputs.head(20)

Unnamed: 0,Age,Debt,YearsEmployed,Income
0,40.83,3.5,0.5,0
1,32.25,1.5,0.25,122
2,28.17,0.585,0.04,1004
3,29.75,0.665,0.25,0
4,26.5,2.71,0.085,0
5,45.33,1.0,0.125,0
6,42.25,1.75,0.0,1
7,33.17,2.25,3.5,141
8,38.58,5.0,13.5,0
9,19.17,0.585,0.585,0


In [12]:
outputs = df[['Approved']]

In [13]:
outputs.sample(10)

Unnamed: 0,Approved
256,-
430,+
220,-
442,+
106,-
416,+
546,+
425,+
203,-
589,+


In [14]:
outputs = outputs.replace({'-':0, '+':1}).astype('int8')

In [15]:
outputs.sample(10)

Unnamed: 0,Approved
255,0
516,1
107,0
117,0
333,0
519,1
485,1
330,0
138,0
276,0


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.4, random_state=23)

In [17]:
rf = RandomForestClassifier(verbose=True, n_jobs=-1)

In [20]:
rf = rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [21]:
MSE = rf.score(X_test, y_test)*100
print(f"MSE: {MSE}%")

MSE: 75.0%


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
