In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# Other files
from experiments.additive_models import TwoLayerAdditiveModel, train_model

# Package imports
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import lightning.pytorch as pl

# Pre-processing

In [3]:
# Load data
df = pd.read_csv("../data/heloc_dataset_v1.csv")

# define x and y
X = df.drop(['RiskPerformance'], axis=1)
y = df['RiskPerformance']

# convert y to 0 and 1
# Series.map builds the numeric column directly. Series.replace on an object
# column only became numeric through implicit downcasting, which newer pandas
# deprecates; an object-dtype y cannot be turned into a torch tensor later on.
y = y.map({'Bad': 0, 'Good': 1})
if y.isna().any():
    # map() turns every label outside the dictionary (and missing values) into NaN
    raise ValueError("RiskPerformance contains values other than 'Bad' and 'Good'")
y = y.astype('int64')

In [4]:
# Dealing with categorical variables
# One table per column, keyed by the raw code found in the data and giving the
# recoded value. Codes that share a target are merged (5 and 6 -> 2, 8 and 9 -> -8
# for MaxDelq2PublicRecLast12M; 1 and 9 -> -8 for MaxDelqEver). Values that are
# not listed are left untouched.
recodings = {
    'MaxDelq2PublicRecLast12M': {
        7: 1,
        5: 2, 6: 2,
        4: 3,
        3: 4,
        2: 5,
        1: 6,
        0: 7,
        8: -8, 9: -8,
    },
    'MaxDelqEver': {
        8: 1,
        7: 2,
        6: 3,
        5: 4,
        4: 5,
        3: 6,
        2: 7,
        1: -8, 9: -8,
    },
}

for column, raw_to_new in recodings.items():
    raw = X[column]
    # All masks are evaluated against the raw column before anything is written
    # back, so a recoded value can never be picked up by a later rule.
    X[column] = np.select(
        [raw == code for code in raw_to_new],
        list(raw_to_new.values()),
        default=raw,
    )

In [5]:
# Dummy variables for -7, -8 and -9 where necessary
for special_code in (-9, -8, -7):
    suffix = "_" + str(-special_code)
    # Walk over a snapshot of the columns, since indicator columns are added on the fly
    for feature in list(X.columns):
        is_special = X[feature] == special_code
        if is_special.any():
            # 0/1 indicator, created only for columns that actually contain the code
            X[feature + suffix] = is_special.astype(int)

In [6]:
# Display the raw frame as loaded from the CSV; the recoding and indicator
# columns above were applied to X, not to df.
df

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [287]:
# Sort columns alphabetically
# NOTE(review): presumably needed so the feature order matches the rows of the
# subscale pivot built for the mask in the next cell - confirm.
ordered_columns = sorted(X.columns)
X = X[ordered_columns]

In [288]:
# Make mask for FICO subscales
# Columns A and C of the sheet are read; the pivot below uses them as
# 'Original Column' and 'Subscale number'.
subscale_long = pd.read_excel("../data/FICO subscale mapping.xlsx", usecols="A,C")
subscale_long = subscale_long.assign(x=1)  # membership flag that becomes the cell value

# reshape wide: one row per original column, one column per subscale, 0 where absent
df = subscale_long.pivot(
    index='Original Column',
    columns='Subscale number',
    values='x',
).fillna(0)

# Transpose so that subscales index the rows and original columns index the columns.
# NOTE(review): nothing here checks that the pivot's row order matches the column
# order of X - confirm the names in the sheet line up with X.columns.
mask = torch.Tensor(df.to_numpy()).T

In [289]:
# Display the engineered feature matrix: columns are now in alphabetical order and
# include the _7/_8/_9 indicator columns created above.
X

Unnamed: 0,AverageMInFile,AverageMInFile_9,ExternalRiskEstimate,ExternalRiskEstimate_9,MSinceMostRecentDelq,MSinceMostRecentDelq_7,MSinceMostRecentDelq_8,MSinceMostRecentDelq_9,MSinceMostRecentInqexcl7days,MSinceMostRecentInqexcl7days_7,...,NumTrades90Ever2DerogPubRec_9,NumTradesOpeninLast12M,NumTradesOpeninLast12M_9,PercentInstallTrades,PercentInstallTrades_9,PercentTradesNeverDelq,PercentTradesNeverDelq_9,PercentTradesWBalance,PercentTradesWBalance_8,PercentTradesWBalance_9
0,84,0,55,0,2,0,0,0,0,0,...,0,1,0,43,0,83,0,69,0,0
1,41,0,61,0,-7,1,0,0,0,0,...,0,0,0,67,0,100,0,0,0,0
2,24,0,67,0,-7,1,0,0,0,0,...,0,4,0,44,0,100,0,86,0,0
3,73,0,66,0,76,0,0,0,0,0,...,0,3,0,57,0,93,0,91,0,0
4,132,0,81,0,-7,1,0,0,0,0,...,0,0,0,25,0,100,0,80,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,57,0,73,0,80,0,0,0,7,0,...,0,5,0,19,0,95,0,100,0,0
10455,68,0,65,0,28,0,0,0,1,0,...,0,0,0,42,0,92,0,80,0,0
10456,64,0,74,0,-7,1,0,0,3,0,...,0,1,0,33,0,100,0,56,0,0
10457,113,0,72,0,35,0,0,0,6,0,...,0,0,0,20,0,96,0,38,0,0


# Train model

In [290]:
# Fix the random state so that Restart Kernel -> Run All gives the same split
# every time. torch's global RNG is seeded as well, because nn.init routines
# draw from it; any other randomness inside train_model is not visible from here.
SEED = 0
torch.manual_seed(SEED)

# Features as a float tensor; labels get a trailing dimension of size 1
input = torch.Tensor(X.to_numpy())
labels = torch.unsqueeze(torch.Tensor(y.to_numpy()), dim=1)

# test train split
input_train, input_test, label_train, label_test = train_test_split(
    input,
    labels,
    test_size=0.2,
    random_state=SEED,
)

In [291]:
# Masked additive model trained with sep_layers=False
loss_fn = torch.nn.BCELoss()

model = TwoLayerAdditiveModel(input_dim=X.shape[1], output_dim=1, hidden_dim=10)
model.prune_weights(mask)  # FICO subscale mask

# The optimiser is created after prune_weights, so it collects the parameters
# the model exposes once pruning has been applied.
opt = torch.optim.Adam(model.parameters(), lr=0.01)

trained_model = train_model(
    model=model,
    input=input_train,
    labels=label_train,
    epochs=1000,
    optimiser=opt,
    loss_func=loss_fn,
    sep_layers=False,
    verbose=True,
)

Epoch: 0 | Loss: 0.9481233358383179
Epoch: 100 | Loss: 0.6654854416847229
Epoch: 200 | Loss: 0.631005585193634
Epoch: 300 | Loss: 0.6047910451889038
Epoch: 400 | Loss: 0.5892512798309326
Epoch: 500 | Loss: 0.5813308954238892
Epoch: 600 | Loss: 0.5753066539764404
Epoch: 700 | Loss: 0.5700642466545105
Epoch: 800 | Loss: 0.5642005801200867
Epoch: 900 | Loss: 0.5604522228240967


In [292]:
# TRAIN ACCURACY
label_pred = torch.round(model(input_train))
print(f"Train accuracy: {100*torch.sum(label_pred == label_train)/label_pred.shape[0]}")

# TEST ACCURACY
label_pred = torch.round(model(input_test))
print(f"Test accuracy: {100*torch.sum(label_pred == label_test)/label_pred.shape[0]}")

Train accuracy: 71.97322845458984
Test accuracy: 71.74951934814453


In [293]:
# Masked additive model trained with sep_layers=True
# (the training log reports "Layer: 1" and then "Layer: 2")
loss_fn = torch.nn.BCELoss()

model = TwoLayerAdditiveModel(input_dim=X.shape[1], output_dim=1, hidden_dim=10)
model.prune_weights(mask)  # FICO subscale mask

# The optimiser is created after prune_weights, so it collects the parameters
# the model exposes once pruning has been applied.
opt = torch.optim.Adam(model.parameters(), lr=0.01)

trained_model = train_model(
    model=model,
    input=input_train,
    labels=label_train,
    epochs=1000,
    optimiser=opt,
    loss_func=loss_fn,
    sep_layers=True,
    verbose=True,
)

Layer: 1 | Epoch: 0 | Loss: 0.8223185539245605
Layer: 1 | Epoch: 100 | Loss: 0.660942792892456
Layer: 1 | Epoch: 200 | Loss: 0.6328890919685364
Layer: 1 | Epoch: 300 | Loss: 0.6224381923675537
Layer: 1 | Epoch: 400 | Loss: 0.6160193085670471
Layer: 1 | Epoch: 500 | Loss: 0.6113172769546509
Layer: 1 | Epoch: 600 | Loss: 0.608805239200592
Layer: 1 | Epoch: 700 | Loss: 0.6051337718963623
Layer: 1 | Epoch: 800 | Loss: 0.602563738822937
Layer: 1 | Epoch: 900 | Loss: 0.600875973701477
Layer: 2 | Epoch: 0 | Loss: 0.5995553731918335
Layer: 2 | Epoch: 100 | Loss: 0.5710224509239197
Layer: 2 | Epoch: 200 | Loss: 0.5706514120101929
Layer: 2 | Epoch: 300 | Loss: 0.5706260800361633
Layer: 2 | Epoch: 400 | Loss: 0.5706239342689514
Layer: 2 | Epoch: 500 | Loss: 0.5706222057342529
Layer: 2 | Epoch: 600 | Loss: 0.5706204175949097
Layer: 2 | Epoch: 700 | Loss: 0.5706184506416321
Layer: 2 | Epoch: 800 | Loss: 0.5706163048744202
Layer: 2 | Epoch: 900 | Loss: 0.5706141591072083


In [294]:
# TRAIN ACCURACY
label_pred = torch.round(model(input_train))
print(f"Train accuracy: {100*torch.sum(label_pred == label_train)/label_pred.shape[0]}")

# TEST ACCURACY
label_pred = torch.round(model(input_test))
print(f"Test accuracy: {100*torch.sum(label_pred == label_test)/label_pred.shape[0]}")

Train accuracy: 71.1246566772461
Test accuracy: 70.41108703613281


In [295]:
# Same architecture without the subscale mask (prune_weights is not called),
# trained with sep_layers=False
loss_fn = torch.nn.BCELoss()

model = TwoLayerAdditiveModel(input_dim=X.shape[1], output_dim=1, hidden_dim=10)
opt = torch.optim.Adam(model.parameters(), lr=0.01)

trained_model = train_model(
    model=model,
    input=input_train,
    labels=label_train,
    epochs=1000,
    optimiser=opt,
    loss_func=loss_fn,
    sep_layers=False,
    verbose=True,
)

Epoch: 0 | Loss: 1.2320494651794434
Epoch: 100 | Loss: 0.6210504770278931
Epoch: 200 | Loss: 0.6015959978103638
Epoch: 300 | Loss: 0.5909579396247864
Epoch: 400 | Loss: 0.5756918787956238
Epoch: 500 | Loss: 0.565953254699707
Epoch: 600 | Loss: 0.5585778951644897
Epoch: 700 | Loss: 0.55474853515625
Epoch: 800 | Loss: 0.5510203838348389
Epoch: 900 | Loss: 0.5490441918373108


In [296]:
# TRAIN ACCURACY
label_pred = torch.round(model(input_train))
print(f"Train accuracy: {100*torch.sum(label_pred == label_train)/label_pred.shape[0]}")

# TEST ACCURACY
label_pred = torch.round(model(input_test))
print(f"Test accuracy: {100*torch.sum(label_pred == label_test)/label_pred.shape[0]}")

Train accuracy: 72.6066665649414
Test accuracy: 71.22370910644531


In [297]:
from torch.nn.utils import prune
import torch.nn as nn

# Generic sigmoid MLP (a plain torch.nn.Module, not a LightningModule)
class GenericMLP(nn.Module):
    """Fully connected network with a sigmoid after every linear layer.

    ``sizes`` lists the layer widths from input to output; each consecutive
    pair of widths becomes one ``Linear`` + ``Sigmoid`` pair in ``self.layers``.
    """

    def __init__(self, sizes: list):
        """Build the layer stack, then initialise every linear layer."""
        super().__init__()

        # First pass: create all Linear/Sigmoid pairs
        self.layers = nn.ModuleList()
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            self.layers.extend([nn.Linear(fan_in, fan_out, bias=True), nn.Sigmoid()])

        # Second pass: Xavier-uniform weights and zero biases. Kept as a separate
        # loop so random numbers are drawn in the same order as layer creation
        # followed by initialisation.
        linear_layers = [module for module in self.layers if isinstance(module, nn.Linear)]
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def prune_weights(self, mask):
        """Apply ``mask`` to the weight of the first linear layer via torch pruning."""
        first_linear = self.layers[0]
        prune.custom_from_mask(first_linear, name='weight', mask=mask)

    def sep_train_layer(self, layer):
        """Placeholder: currently does nothing."""
        return None

    def forward(self, x):
        """Pass ``x`` through every module in ``self.layers`` in order."""
        out = x
        for module in self.layers:
            out = module(out)
        return out


In [298]:
# Size the network from the data and the mask instead of hard-coding 58 and 10.
# GenericMLP.prune_weights masks the first linear layer's weight, and
# prune.custom_from_mask needs the mask to have that weight's shape, i.e.
# (first hidden width, number of input features).
n_features = X.shape[1]
n_subscales = mask.shape[0]

model = GenericMLP([n_features, n_subscales, 5, 1])
model.prune_weights(mask)

# The optimiser is created after pruning so it is built from the parameters
# registered on the pruned module.
opt = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.BCELoss()
trained_model = train_model(
    model=model,
    input=input_train,
    labels=label_train,
    epochs=1000,
    optimiser=opt,
    loss_func=loss_fn,
    sep_layers=False,
    verbose=True,
)

Epoch: 0 | Loss: 0.7130616307258606
Epoch: 100 | Loss: 0.6150779724121094
Epoch: 200 | Loss: 0.574833333492279
Epoch: 300 | Loss: 0.5561997294425964
Epoch: 400 | Loss: 0.5523011088371277
Epoch: 500 | Loss: 0.5510300397872925
Epoch: 600 | Loss: 0.5512269139289856
Epoch: 700 | Loss: 0.5515717267990112
Epoch: 800 | Loss: 0.5484270453453064
Epoch: 900 | Loss: 0.5469260811805725


In [299]:
# TRAIN ACCURACY
label_pred = torch.round(model(input_train))
print(f"Train accuracy: {100*torch.sum(label_pred == label_train)/label_pred.shape[0]}")

# TEST ACCURACY
label_pred = torch.round(model(input_test))
print(f"Test accuracy: {100*torch.sum(label_pred == label_test)/label_pred.shape[0]}")

Train accuracy: 72.59471893310547
Test accuracy: 72.41873931884766
