- [Main code](#main-code)
- [Step by step flow](#step-by-step-flow)

### Main code

In [45]:
import sys
import torch
from torch import nn
import pandas as pd

# Fix the RNG seed so the random weight initialisation -- and therefore the
# printed losses and accuracy -- are identical on every "Restart & Run All".
torch.manual_seed(0)

# Load the loan table and keep the label plus the four features used here.
df = pd.read_csv("data/loan_data.csv")
df = df[["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]]
# One-hot encode the categorical loan_intent column.
df = pd.get_dummies(df, columns=["loan_intent"])

# Label as an (N, 1) float column so it lines up with the model's (N, 1) logits.
y = torch.tensor(df["loan_status"], dtype=torch.float32)\
    .reshape((-1, 1))

# Cast every feature column to float32 before building the tensor.
X_data = df.drop("loan_status", axis=1).astype('float32').values
X = torch.tensor(X_data, dtype=torch.float32)

# Standardise each feature column (subtract the mean, divide by the std).
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A constant column has std == 0; divide it by 1 so it becomes 0 instead of NaN.
X_std = torch.where(X_std == 0, torch.ones_like(X_std), X_std)
X = (X - X_mean) / X_std
print(X.shape)

# Derive the input width from the data instead of hard-coding 9.
num_features = X.size(1)
model = nn.Sequential(
    nn.Linear(num_features, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_entries = X.size(0)
batch_size = 32

for i in range(0, 100):
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        end = min(num_entries, start + batch_size)
        # Separate batch names so the X_data array above is not overwritten.
        X_batch = X[start:end]
        y_batch = y[start:end]

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        loss_sum += loss.item()
        optimizer.step()

    # Report the summed batch loss every 10th epoch.
    if i % 10 == 0:
        print(loss_sum)

# NOTE: accuracy is measured on the same rows the model was trained on,
# so it is not a held-out estimate.
model.eval()
with torch.no_grad():
    outputs = model(X)
    # sigmoid(logit) > 0.5 is the predicted class.
    y_pred = nn.functional.sigmoid(outputs) > 0.5
    y_pred_correct = y_pred.type(torch.float32) == y
    print(y_pred_correct.type(torch.float32).mean())

torch.Size([45000, 9])
799.7212304249406
609.4699079096317
600.6172490641475
592.9709645062685
587.2934746034443
583.5341814570129
581.183487020433
579.4011880271137
577.8830453269184
576.5402340516448
tensor(0.8185)


### Step by step flow

In [46]:
import pandas as pd

# Load the full loan dataset and preview its first rows to see the available columns.
df = pd.read_csv("./data/loan_data.csv")
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


Some suggestions:
- loan_intent is a categorical variable
- Use one-hot encoding for it

In [47]:
# This is not the complete data, just a toy example for exploring
import pandas as pd

# Two-row toy table: one categorical column and one numeric column.
toy_rows = {
    "loan_intent": ["PERSONAL", "EDUCATION"],
    "person_income": [30000, 50000],
}

# One-hot encode loan_intent into one indicator column per category, then cast
# the whole frame to float32: without that cast the values cannot be passed
# to a PyTorch tensor.
df = (
    pd.DataFrame(toy_rows)
    .pipe(pd.get_dummies, columns=["loan_intent"])
    .astype("float32")
)
df.head()

Unnamed: 0,person_income,loan_intent_EDUCATION,loan_intent_PERSONAL
0,30000.0,0.0,1.0
1,50000.0,1.0,0.0


In [48]:
# .values exposes the frame's contents as a plain NumPy array.
df.values

array([[3.e+04, 0.e+00, 1.e+00],
       [5.e+04, 1.e+00, 0.e+00]], dtype=float32)

In [49]:
import torch

# Turn the frame's NumPy array into a float32 PyTorch tensor and show it.
X = torch.tensor(df.values, dtype=torch.float32)
print(X)

tensor([[3.0000e+04, 0.0000e+00, 1.0000e+00],
        [5.0000e+04, 1.0000e+00, 0.0000e+00]])


Now let's explore the main data

In [50]:
import pandas as pd

# Reload the real loan dataset into df and preview its first rows.
df = pd.read_csv("data/loan_data.csv")
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


Let's consider some important columns here

In [51]:
import pandas as pd

# Keep only the label and the four columns this notebook works with.
selected_columns = ["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]
df = pd.read_csv("data/loan_data.csv").loc[:, selected_columns]
df.head()

Unnamed: 0,loan_status,person_income,loan_intent,loan_percent_income,credit_score
0,1,71948.0,PERSONAL,0.49,561
1,0,12282.0,EDUCATION,0.08,504
2,1,12438.0,MEDICAL,0.44,635
3,1,79753.0,MEDICAL,0.44,675
4,1,66135.0,MEDICAL,0.53,586


In [52]:
# Exploration only: summarise a few columns per loan_intent category.
# Each spec maps a column to the statistics printed for it.
by_intent = df.groupby("loan_intent")
aggregation_specs = (
    {"loan_status": ["mean"]},
    {"person_income": ["mean", "min", "max"]},
    {"credit_score": ["min", "max"]},
)
for spec in aggregation_specs:
    print(by_intent.agg(spec))

                  loan_status
                         mean
loan_intent                  
DEBTCONSOLIDATION    0.302729
EDUCATION            0.169562
HOMEIMPROVEMENT      0.263015
MEDICAL              0.278194
PERSONAL             0.201404
VENTURE              0.144264
                  person_income                    
                           mean      min        max
loan_intent                                        
DEBTCONSOLIDATION  80608.216935  10606.0  1728974.0
EDUCATION          77765.025019   8000.0  2139143.0
HOMEIMPROVEMENT    89148.745766   8037.0   994533.0
MEDICAL            73414.871900   8000.0  1661567.0
PERSONAL           83030.910222   8000.0  7200766.0
VENTURE            82571.965852   8000.0  5556399.0
                  credit_score     
                           min  max
loan_intent                        
DEBTCONSOLIDATION          390  764
EDUCATION                  418  807
HOMEIMPROVEMENT            420  768
MEDICAL                    419  784
PERSONAL  

In [53]:
# Compare mean / min / max of each numeric feature between the two loan_status groups.
summary_stats = ["mean", "min", "max"]
df.groupby("loan_status").agg(
    {column: summary_stats for column in ("person_income", "credit_score", "loan_percent_income")}
)

Unnamed: 0_level_0,person_income,person_income,person_income,credit_score,credit_score,credit_score,loan_percent_income,loan_percent_income,loan_percent_income
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max
loan_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,86157.040743,9595.0,7200766.0,632.814914,390,850,0.121783,0.0,0.66
1,59886.0969,8000.0,845636.0,631.8872,431,767,0.202521,0.0,0.62


The mean credit score barely changes with the loan status.
- If we are using a single neuron, then the credit_score column is meaningless
- If we are using a full neural network, then the credit_score column may be useful

In [54]:
# Mean of loan_status per loan_intent group, i.e. the fraction of rows with
# loan_status == 1 (assuming the column only holds 0/1 labels, as in the preview rows).
print(df.groupby("loan_intent").agg({"loan_status": ["mean"]}))

                  loan_status
                         mean
loan_intent                  
DEBTCONSOLIDATION    0.302729
EDUCATION            0.169562
HOMEIMPROVEMENT      0.263015
MEDICAL              0.278194
PERSONAL             0.201404
VENTURE              0.144264


- About 30 % of DEBTCONSOLIDATION users got their loan approved
- About 17 % of EDUCATION users got their loan approved...

Prepare the input data

In [55]:
import sys
import torch
from torch import nn
import pandas as pd

# Load the loan table, keep the label plus four features, one-hot encode loan_intent.
df = pd.read_csv("data/loan_data.csv")
df = df[["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]]
df = pd.get_dummies(df, columns=["loan_intent"])
print(df.columns)

# Label as an (N, 1) float column.
y = torch.tensor(df["loan_status"], dtype=torch.float32)\
    .reshape((-1, 1))

# Build the feature frame once and reuse it for both the preview and the tensor.
features = df.drop("loan_status", axis=1)
print(features)

# It is important to convert the data to float32 before building the tensor.
# The feature columns do not share one dtype: person_income and
# loan_percent_income hold floats (e.g. 71948.0), credit_score holds integers
# (e.g. 561), and the loan_intent_* columns hold True/False (boolean) values.
# Casting the whole frame to float32 first gives a single float32 array.
X_data = features.astype('float32').values
print(X_data.dtype)
X = torch.tensor(X_data, dtype=torch.float32)
print(X)

Index(['loan_status', 'person_income', 'loan_percent_income', 'credit_score',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE'],
      dtype='object')
       person_income  loan_percent_income  credit_score  \
0            71948.0                 0.49           561   
1            12282.0                 0.08           504   
2            12438.0                 0.44           635   
3            79753.0                 0.44           675   
4            66135.0                 0.53           586   
...              ...                  ...           ...   
44995        47971.0                 0.31           645   
44996        65800.0                 0.14           621   
44997        56942.0                 0.05           668   
44998        33164.0                 0.36           604   
44999        51609.0                 0.13           628   

       loa

Before creating the neural network, we need to know what shape the network should have.
It is based on the X and y data

In [56]:
import sys
import torch
from torch import nn
import pandas as pd

# Read the loan table, keep the label plus four features, one-hot encode loan_intent.
kept_columns = ["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]
df = pd.get_dummies(pd.read_csv("data/loan_data.csv")[kept_columns], columns=["loan_intent"])

# Label column as an (N, 1) float tensor.
y = torch.tensor(df["loan_status"], dtype=torch.float32).unsqueeze(1)

# Every remaining column, cast to float32, forms the (still unscaled) feature matrix.
X_data = df.drop(columns="loan_status").astype("float32").to_numpy()
X = torch.tensor(X_data, dtype=torch.float32)
print(X.shape)


# Basic neural network: a single linear layer.
# 9 inputs because X has 9 columns.
model = nn.Sequential(nn.Linear(9, 1))

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_entries = X.size(0)
batch_size = 32

for i in range(10):  # Decreased from 1000 to 10 while running the cell
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        # Slicing clamps at the end of the tensor, so the last batch may be shorter.
        rows = slice(start, start + batch_size)

        optimizer.zero_grad()
        loss = loss_fn(model(X[rows]), y[rows])
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    # Print the summed batch loss on every 10th epoch (only epoch 0 with 10 epochs).
    if not i % 10:
        print(loss_sum)

# Fraction of rows whose thresholded prediction equals the label.
model.eval()
with torch.no_grad():
    outputs = model(X)
    y_pred = outputs.sigmoid() > 0.5
    y_pred_correct = y_pred.float() == y
    print(y_pred_correct.float().mean())

torch.Size([45000, 9])
205712.57806427777
tensor(0.2222)


In [57]:
"""
As the output above shows, the model is not learning at all.
This is due to unscaled X data: some columns hold very large values,
while others are just 0s and 1s.
y is completely fine.
"""
# Inspect the raw feature tensor to compare the magnitudes of its columns.
print(X)

tensor([[7.1948e+04, 4.9000e-01, 5.6100e+02,  ..., 0.0000e+00, 1.0000e+00,
         0.0000e+00],
        [1.2282e+04, 8.0000e-02, 5.0400e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.2438e+04, 4.4000e-01, 6.3500e+02,  ..., 1.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [5.6942e+04, 5.0000e-02, 6.6800e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.3164e+04, 3.6000e-01, 6.0400e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.1609e+04, 1.3000e-01, 6.2800e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])


In [58]:
"""
So first normalize the data.
"""
import sys
import torch
from torch import nn
import pandas as pd

# Read the loan table, keep the label plus four features, one-hot encode loan_intent.
kept_columns = ["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]
df = pd.get_dummies(pd.read_csv("data/loan_data.csv")[kept_columns], columns=["loan_intent"])

# Label column as an (N, 1) float tensor.
y = torch.tensor(df["loan_status"], dtype=torch.float32).unsqueeze(1)

# Feature matrix: every remaining column, cast to float32.
X_data = df.drop(columns="loan_status").astype("float32").to_numpy()
X = torch.tensor(X_data, dtype=torch.float32)

# Normalise: subtract each column's mean and divide by its standard deviation.
X_mean = X.mean(dim=0)
X_std = X.std(dim=0)
X = (X - X_mean) / X_std
print(X.shape)

# Single linear layer, one input per column of X.
model = nn.Sequential(nn.Linear(9, 1))

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_entries = X.size(0)
batch_size = 32

for i in range(20):  # Changed from 10 to 20 while running
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        # Slicing clamps at the end of the tensor, so the last batch may be shorter.
        rows = slice(start, start + batch_size)

        optimizer.zero_grad()
        loss = loss_fn(model(X[rows]), y[rows])
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    # Print the summed batch loss on every 10th epoch.
    if not i % 10:
        print(loss_sum)

# Fraction of rows whose thresholded prediction equals the label.
model.eval()
with torch.no_grad():
    outputs = model(X)
    y_pred = outputs.sigmoid() > 0.5
    y_pred_correct = y_pred.float() == y
    print(y_pred_correct.float().mean())

torch.Size([45000, 9])
642.7200459763408
615.8091294839978
tensor(0.7632)


In [59]:
"""
Let's improve the neural network.
"""
import sys
import torch
from torch import nn
import pandas as pd

# Read the loan table, keep the label plus four features, one-hot encode loan_intent.
kept_columns = ["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]
df = pd.get_dummies(pd.read_csv("data/loan_data.csv")[kept_columns], columns=["loan_intent"])

# Label column as an (N, 1) float tensor.
y = torch.tensor(df["loan_status"], dtype=torch.float32).unsqueeze(1)

# Feature matrix: every remaining column, cast to float32.
X_data = df.drop(columns="loan_status").astype("float32").to_numpy()
X = torch.tensor(X_data, dtype=torch.float32)

# Normalise: subtract each column's mean and divide by its standard deviation.
X_mean = X.mean(dim=0)
X_std = X.std(dim=0)
X = (X - X_mean) / X_std
print(X.shape)

# Two hidden layers (32 and 16 units) with ReLU, ending in a single logit.
hidden_layers = [nn.Linear(9, 32), nn.ReLU(), nn.Linear(32, 16), nn.ReLU()]
model = nn.Sequential(*hidden_layers, nn.Linear(16, 1))

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_entries = X.size(0)
batch_size = 32

for i in range(20):  # Changed from 10 to 20 while running
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        # Slicing clamps at the end of the tensor, so the last batch may be shorter.
        rows = slice(start, start + batch_size)

        optimizer.zero_grad()
        loss = loss_fn(model(X[rows]), y[rows])
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    # Print the summed batch loss on every 10th epoch.
    if not i % 10:
        print(loss_sum)

# Fraction of rows whose thresholded prediction equals the label.
model.eval()
with torch.no_grad():
    outputs = model(X)
    y_pred = outputs.sigmoid() > 0.5
    y_pred_correct = y_pred.float() == y
    print(y_pred_correct.float().mean())

torch.Size([45000, 9])
490.4763981060314
544.3772693928331
tensor(0.2858)


In [60]:
"""
We can see that the loss is increasing. So we should decrease the learning rate.
"""
import sys
import torch
from torch import nn
import pandas as pd

# Read the loan table, keep the label plus four features, one-hot encode loan_intent.
kept_columns = ["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]
df = pd.get_dummies(pd.read_csv("data/loan_data.csv")[kept_columns], columns=["loan_intent"])

# Label column as an (N, 1) float tensor.
y = torch.tensor(df["loan_status"], dtype=torch.float32).unsqueeze(1)

# Feature matrix: every remaining column, cast to float32.
X_data = df.drop(columns="loan_status").astype("float32").to_numpy()
X = torch.tensor(X_data, dtype=torch.float32)

# Normalise: subtract each column's mean and divide by its standard deviation.
X_mean = X.mean(dim=0)
X_std = X.std(dim=0)
X = (X - X_mean) / X_std
print(X.shape)

# Two hidden layers (32 and 16 units) with ReLU, ending in a single logit.
hidden_layers = [nn.Linear(9, 32), nn.ReLU(), nn.Linear(32, 16), nn.ReLU()]
model = nn.Sequential(*hidden_layers, nn.Linear(16, 1))

loss_fn = torch.nn.BCEWithLogitsLoss()
# Smaller learning rate than the previous attempt (0.005 -> 0.0001).
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_entries = X.size(0)
batch_size = 32

for i in range(20):  # Changed from 10 to 20 while running
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        # Slicing clamps at the end of the tensor, so the last batch may be shorter.
        rows = slice(start, start + batch_size)

        optimizer.zero_grad()
        loss = loss_fn(model(X[rows]), y[rows])
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    # Print the summed batch loss on every 10th epoch.
    if not i % 10:
        print(loss_sum)

# Fraction of rows whose thresholded prediction equals the label.
model.eval()
with torch.no_grad():
    outputs = model(X)
    y_pred = outputs.sigmoid() > 0.5
    y_pred_correct = y_pred.float() == y
    print(y_pred_correct.float().mean())

torch.Size([45000, 9])
837.427164375782
610.6820350959897
tensor(0.8233)


In [61]:
"""
The loss is decreasing, it is a good sign.
Now increase the iterations.
"""
import sys
import torch
from torch import nn
import pandas as pd

# Fix the RNG seed so the random weight initialisation -- and therefore the
# printed losses and accuracy -- are identical on every "Restart & Run All".
torch.manual_seed(0)

# Load the loan table and keep the label plus the four features used here.
df = pd.read_csv("data/loan_data.csv")
df = df[["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]]
# One-hot encode the categorical loan_intent column.
df = pd.get_dummies(df, columns=["loan_intent"])

# Label as an (N, 1) float column so it lines up with the model's (N, 1) logits.
y = torch.tensor(df["loan_status"], dtype=torch.float32)\
    .reshape((-1, 1))

# Cast every feature column to float32 before building the tensor.
X_data = df.drop("loan_status", axis=1).astype('float32').values
X = torch.tensor(X_data, dtype=torch.float32)

# Standardise each feature column (subtract the mean, divide by the std).
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A constant column has std == 0; divide it by 1 so it becomes 0 instead of NaN.
X_std = torch.where(X_std == 0, torch.ones_like(X_std), X_std)
X = (X - X_mean) / X_std
print(X.shape)

# Derive the input width from the data instead of hard-coding 9.
num_features = X.size(1)
model = nn.Sequential(
    nn.Linear(num_features, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
)

# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_entries = X.size(0)
batch_size = 32

for i in range(0, 100):  # Increased from 20 to 100 epochs for this run
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        end = min(num_entries, start + batch_size)
        # Separate batch names so the X_data array above is not overwritten.
        X_batch = X[start:end]
        y_batch = y[start:end]

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        loss_sum += loss.item()
        optimizer.step()

    # Report the summed batch loss every 10th epoch.
    if i % 10 == 0:
        print(loss_sum)

# NOTE: accuracy is measured on the same rows the model was trained on,
# so it is not a held-out estimate.
model.eval()
with torch.no_grad():
    outputs = model(X)
    # sigmoid(logit) > 0.5 is the predicted class.
    y_pred = nn.functional.sigmoid(outputs) > 0.5
    y_pred_correct = y_pred.type(torch.float32) == y
    print(y_pred_correct.type(torch.float32).mean())

torch.Size([45000, 9])
779.9897756278515
611.3381649181247
604.3701142333448
596.5612174384296
590.4232226386666
586.702490746975
584.5695892870426
582.8881355859339
581.639404155314
580.5680832080543
tensor(0.8215)


In [62]:
import sys
import torch
from torch import nn
import pandas as pd

# Fix the RNG seed so the random weight initialisation -- and therefore the
# printed losses and accuracy -- are identical on every "Restart & Run All".
torch.manual_seed(0)

# Load the loan table and keep the label plus the four features used here.
df = pd.read_csv("data/loan_data.csv")
df = df[["loan_status", "person_income", "loan_intent", "loan_percent_income", "credit_score"]]
# One-hot encode the categorical loan_intent column.
df = pd.get_dummies(df, columns=["loan_intent"])

# Label as an (N, 1) float column so it lines up with the model's (N, 1) logits.
y = torch.tensor(df["loan_status"], dtype=torch.float32)\
    .reshape((-1, 1))

# Cast every feature column to float32 before building the tensor.
X_data = df.drop("loan_status", axis=1).astype('float32').values
X = torch.tensor(X_data, dtype=torch.float32)

# Standardise each feature column (subtract the mean, divide by the std).
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
# A constant column has std == 0; divide it by 1 so it becomes 0 instead of NaN.
X_std = torch.where(X_std == 0, torch.ones_like(X_std), X_std)
X = (X - X_mean) / X_std
print(X.shape)

# Derive the input width from the data instead of hard-coding 9.
num_features = X.size(1)
model = nn.Sequential(
    nn.Linear(num_features, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_entries = X.size(0)
batch_size = 32

for i in range(0, 100):
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        end = min(num_entries, start + batch_size)
        # Separate batch names so the X_data array above is not overwritten.
        X_batch = X[start:end]
        y_batch = y[start:end]

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        loss_sum += loss.item()
        optimizer.step()

    # Report the summed batch loss every 10th epoch.
    if i % 10 == 0:
        print(loss_sum)

# NOTE: accuracy is measured on the same rows the model was trained on,
# so it is not a held-out estimate.
model.eval()
with torch.no_grad():
    outputs = model(X)
    # sigmoid(logit) > 0.5 is the predicted class.
    y_pred = nn.functional.sigmoid(outputs) > 0.5
    y_pred_correct = y_pred.type(torch.float32) == y
    print(y_pred_correct.type(torch.float32).mean())



torch.Size([45000, 9])
758.1052347794175
612.8799675516784
604.2817144989967
595.006852183491
587.7183279767632
583.4615176804364
580.9182597510517
579.0159925296903
577.7343189157546
576.723461791873
tensor(0.8234)
