# Case2: Risk Score Prediction - MLP + Ensemble Learning with PyTorch

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm as tqdm
import argparse

# import prediction model
from models import RiskScoreEnsembleModel

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchmetrics import Accuracy

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# parse hyperparameters
parser = argparse.ArgumentParser(description='MLP_Ensemble')
parser.add_argument('--batch-size', default=32, type=int, metavar='N',
                    help='batch size for training')
parser.add_argument('--num-models', default=15, type=int, metavar='N',
                    help='number of models for ensemble learning')
parser.add_argument('--hidden-size', default=128, type=int, metavar='N',
                    help='hidden size of MLP')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
                    help='number of total epochs for training')
parser.add_argument('--lr', default=1e-3, type=float, metavar='LR',
                    help='learning rate')

# if you run argparse in jupyter notebook:
args = parser.parse_args(args=[])

# # if you run argparse in .py file:
# args = parser.parse_args()

parser.print_help()

usage: ipykernel_launcher.py [-h] [--batch-size N] [--num-models N]
                             [--hidden-size N] [--epochs N] [--lr LR]

MLP_Ensemble

optional arguments:
  -h, --help       show this help message and exit
  --batch-size N   batch size for training
  --num-models N   number of models for ensemble learning
  --hidden-size N  hidden size of MLP
  --epochs N       number of total epochs for training
  --lr LR          learning rate


## Data Preprocessing

In [3]:
# training data
df = pd.read_csv('New Risk Score ML Training Data.csv')

df_len = len(df) # number of rows
print(f"Number of rows: {df_len}")

# display the first 5 rows
df.head()

Number of rows: 2000


Unnamed: 0,Name,Risk Score,Bachelor's Degree,Master's Degree,More Than 5 Years of Work Experience,More Than 10 Years of Work Experience,More Than 15 Years of Work Experience,More Than 20 Years of Work Experience,More Than 25 Years of Work Experience,More Than 30 Years of Work Experience,...,Industry Sanctions & Other Legal Actions,Professional Licenses,LinkedIn Potential Controversies,LinkedIn Political Activity,Twitter Potential Controversies,Twitter Political Activity,Facebook Potential Controversies,Facebook Political Activity,Instagram Potential Controversies,Instagram Political Activity
0,Anthony Ivy,5,Yes,Yes,Yes,Yes,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,Blaise Carroz,5,Yes,No,Yes,Yes,Yes,Yes,No,No,...,No,No,No,No,No,No,No,No,No,No
2,Callie Thomas,1,Yes,No,Yes,Yes,Yes,Yes,No,No,...,No,No,No,No,No,No,No,No,No,No
3,Campbell Murray,3,Yes,Yes,Yes,Yes,Yes,Yes,No,No,...,No,No,No,Yes,No,No,No,No,No,No
4,Daniel Beck,3,Yes,No,Yes,Yes,Yes,No,No,No,...,No,No,No,No,No,No,No,No,No,No


In [4]:
# make the 1st column the index
df = df.set_index('Name')

# display the first 5 rows
df.head()

Unnamed: 0_level_0,Risk Score,Bachelor's Degree,Master's Degree,More Than 5 Years of Work Experience,More Than 10 Years of Work Experience,More Than 15 Years of Work Experience,More Than 20 Years of Work Experience,More Than 25 Years of Work Experience,More Than 30 Years of Work Experience,Years of Work Experience,...,Industry Sanctions & Other Legal Actions,Professional Licenses,LinkedIn Potential Controversies,LinkedIn Political Activity,Twitter Potential Controversies,Twitter Political Activity,Facebook Potential Controversies,Facebook Political Activity,Instagram Potential Controversies,Instagram Political Activity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anthony Ivy,5,Yes,Yes,Yes,Yes,No,No,No,No,13,...,No,No,No,No,No,No,No,No,No,No
Blaise Carroz,5,Yes,No,Yes,Yes,Yes,Yes,No,No,21,...,No,No,No,No,No,No,No,No,No,No
Callie Thomas,1,Yes,No,Yes,Yes,Yes,Yes,No,No,23,...,No,No,No,No,No,No,No,No,No,No
Campbell Murray,3,Yes,Yes,Yes,Yes,Yes,Yes,No,No,23,...,No,No,No,Yes,No,No,No,No,No,No
Daniel Beck,3,Yes,No,Yes,Yes,Yes,No,No,No,17,...,No,No,No,No,No,No,No,No,No,No


In [5]:
# convert Yes or No to 1 or 0 from the 2nd column to the last column
df.iloc[:, 1:] = df.iloc[:, 1:].replace({'Yes': 1, 'No': 0})

# display the first 5 rows
df.head()

Unnamed: 0_level_0,Risk Score,Bachelor's Degree,Master's Degree,More Than 5 Years of Work Experience,More Than 10 Years of Work Experience,More Than 15 Years of Work Experience,More Than 20 Years of Work Experience,More Than 25 Years of Work Experience,More Than 30 Years of Work Experience,Years of Work Experience,...,Industry Sanctions & Other Legal Actions,Professional Licenses,LinkedIn Potential Controversies,LinkedIn Political Activity,Twitter Potential Controversies,Twitter Political Activity,Facebook Potential Controversies,Facebook Political Activity,Instagram Potential Controversies,Instagram Political Activity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anthony Ivy,5,1,1,1,1,0,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
Blaise Carroz,5,1,0,1,1,1,1,0,0,21,...,0,0,0,0,0,0,0,0,0,0
Callie Thomas,1,1,0,1,1,1,1,0,0,23,...,0,0,0,0,0,0,0,0,0,0
Campbell Murray,3,1,1,1,1,1,1,0,0,23,...,0,0,0,1,0,0,0,0,0,0
Daniel Beck,3,1,0,1,1,1,0,0,0,17,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# compute correlation between risk score and features
corr = df.corr()

# first column is the correlation between risk score and features
corr = corr.iloc[0, 1:]
corr

Bachelor's Degree                          -0.018909
Master's Degree                            -0.009336
More Than 5 Years of Work Experience       -0.064224
More Than 10 Years of Work Experience      -0.067426
More Than 15 Years of Work Experience      -0.070127
More Than 20 Years of Work Experience      -0.082576
More Than 25 Years of Work Experience      -0.064235
More Than 30 Years of Work Experience      -0.057965
Years of Work Experience                   -0.094583
Bankruptcies                                0.405891
Criminal Records                            0.558716
Civil Litigation & Legal Filings            0.318342
Tax Liens                                   0.333589
Industry Sanctions & Other Legal Actions    0.180537
Professional Licenses                       0.000699
LinkedIn Potential Controversies            0.052594
LinkedIn Political Activity                 0.000503
Twitter Potential Controversies             0.147142
Twitter Political Activity                  0.

In [7]:
# feature selection
# select features with correlation > threshold
threshold = 0.001
selected_features = corr[abs(corr) > threshold].index.tolist()
selected_features

["Bachelor's Degree",
 "Master's Degree",
 'More Than 5 Years of Work Experience',
 'More Than 10 Years of Work Experience',
 'More Than 15 Years of Work Experience',
 'More Than 20 Years of Work Experience',
 'More Than 25 Years of Work Experience',
 'More Than 30 Years of Work Experience',
 'Years of Work Experience',
 'Bankruptcies',
 'Criminal Records',
 'Civil Litigation & Legal Filings',
 'Tax Liens',
 'Industry Sanctions & Other Legal Actions',
 'LinkedIn Potential Controversies',
 'Twitter Potential Controversies',
 'Twitter Political Activity',
 'Facebook Potential Controversies',
 'Facebook Political Activity',
 'Instagram Potential Controversies',
 'Instagram Political Activity']

In [8]:
# features
X = df[selected_features].values
print(f"Shape of features: {X.shape}\nNumber of Individuals: {X.shape[0]}\nNumber of Features: {X.shape[1]}")
print(f"Features:\n{X}")

Shape of features: (2000, 21)
Number of Individuals: 2000
Number of Features: 21
Features:
[[1 1 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 ...
 [1 0 1 ... 1 0 0]
 [1 0 1 ... 0 0 1]
 [1 1 1 ... 0 0 0]]


In [9]:
# ground truth labels
y = df.iloc[:, 0].values
print(f"Shape of labels: {y.shape}\nNumber of Risk Scores: {y.shape[0]}")
print(f"Labels:\n{y}")

Shape of labels: (2000,)
Number of Risk Scores: 2000
Labels:
[5 5 1 ... 4 1 3]


In [10]:
# 80/10/10 Train Val Test Split
seed = 42
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=seed)

print(f"Number of Individuals in Training Dataset: {X_train.shape[0]}")
print(f"Number of Individuals in Validation Dataset: {X_val.shape[0]}")
print(f"Number of Individuals in Test Dataset: {X_test.shape[0]}")

Number of Individuals in Training Dataset: 1600
Number of Individuals in Validation Dataset: 200
Number of Individuals in Test Dataset: 200


## Dataset & Dataloader

In [11]:
# transform for augmentation
def feature_transform(vector):
    # Gaussian noise
    noise = np.random.normal(0, 0.1, vector.shape)
    vector = vector + noise
    
    return vector

print(X[0])
print(feature_transform(X[0]))

[ 1  1  1  1  0  0  0  0 13  1  1  0  0  0  0  0  0  0  0  0  0]
[ 1.01658039e+00  9.86710960e-01  8.28088181e-01  1.02447941e+00
  1.30205288e-01 -1.73955845e-01 -1.33658976e-01 -3.93943623e-02
  1.28329611e+01  9.47590553e-01  9.41738625e-01  1.13458068e-01
 -3.46211135e-02 -2.00597218e-03  1.67602060e-02 -1.03358116e-01
 -3.59914720e-02 -3.35715222e-02  4.84725479e-02 -3.41400977e-03
  9.34241737e-02]


In [12]:
# custom dataset
class RiskScoreDataset(Dataset):
    def __init__(self, X, y, transforms=None):
        '''
        X: features
        y: labels
        '''
        self.X = X
        self.y = y
        self.transforms = transforms
    
    def __len__(self):
        X_len = len(self.X) # number of rows (individuals)
        return X_len

    def __getitem__(self, idx):
        x = self.X[idx] # features
        y = self.y[idx] # labels

        if self.transforms:
            x = self.transforms(x)
        
        x = torch.tensor(x).float() # convert to tensor (type: float)
        y = torch.tensor(y).long() # convert to tensor (type: long or int64)

        return x, y

# subtract 1 from the labels (classes) to make them zero-indexed
y_train = y_train - 1 # classes: [1,2,3,4,5] -> [0,1,2,3,4]
y_val = y_val - 1
y_test = y_test - 1

# training dataset
train_dataset = RiskScoreDataset(X_train, y_train, transforms=feature_transform)
# validation dataset
val_dataset = RiskScoreDataset(X_val, y_val)
# test dataset
test_dataset = RiskScoreDataset(X_test, y_test)

# batch size
batch_size = args.batch_size

# training dataloader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# validation dataloader
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Training

In [13]:
# hyperparameters
num_models = args.num_models # number of models for ensemble learning
num_features = X_train.shape[1]
hidden_size = args.hidden_size
num_classes = len(np.unique(y_train))
epochs = args.epochs
lr = args.lr

ensemble_model = RiskScoreEnsembleModel(num_models, num_features, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ensemble_model.parameters(), lr=lr)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ensemble_model.to(device)


RiskScoreEnsembleModel(
  (models): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=5, bias=True)
    )
    (1): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=5, bias=True)
    )
    (2): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=5, bias=True)
    )
    (3): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=5, bias=True)
    )
    (4): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=5, bias=True)
    )
    (5): Sequential(
      (0): Linear(in_features=21, out_features=128, bias=True)
      (1): ReLU()


In [14]:
# Training
for epoch in range(epochs):
    ensemble_model.train()
    loss = 0.0
    for inputs, labels in train_dataloader:
        # Inputs
        inputs, labels = inputs.to(device), labels.to(device)
        # Zero Out Gradients
        optimizer.zero_grad()
        # Forward Pass for Prediction
        outputs = ensemble_model(inputs)
        # Loss Computation
        loss = criterion(outputs.mean(dim=0), labels) # loss of the average of the outputs of the models in the ensemble
        # Backpropagation
        loss.backward()
        # Parameter Update
        optimizer.step()
        
        batch_size = inputs.shape[0]
        loss += loss.item() * batch_size

    if epoch % 10 == 0 or epoch == epochs - 1:
        train_loss = loss / len(train_dataset)
        print(f'Epoch: {epoch}\tTrain Loss: {train_loss:.4f}')

    # Evaluation on validation dataset
    ensemble_model.eval()
    val_predictions = []
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = ensemble_model(inputs)
            predictions = torch.argmax(outputs.mean(dim=0), dim=1)
            val_predictions.extend(predictions.cpu().numpy())

    if epoch % 10 == 0 or epoch == epochs - 1:
        val_accuracy = accuracy_score(y_val, val_predictions)
        print(f'Epoch: {epoch}\tValidation Accuracy: {val_accuracy:.4f}')


Epoch: 0	Train Loss: 0.0300
Epoch: 0	Validation Accuracy: 0.3400
Epoch: 10	Train Loss: 0.0140
Epoch: 10	Validation Accuracy: 0.7800
Epoch: 20	Train Loss: 0.0122
Epoch: 20	Validation Accuracy: 0.8050
Epoch: 30	Train Loss: 0.0099
Epoch: 30	Validation Accuracy: 0.8250
Epoch: 40	Train Loss: 0.0063
Epoch: 40	Validation Accuracy: 0.8550
Epoch: 50	Train Loss: 0.0079
Epoch: 50	Validation Accuracy: 0.8600
Epoch: 60	Train Loss: 0.0092
Epoch: 60	Validation Accuracy: 0.8600
Epoch: 70	Train Loss: 0.0050
Epoch: 70	Validation Accuracy: 0.8350
Epoch: 80	Train Loss: 0.0078
Epoch: 80	Validation Accuracy: 0.8600
Epoch: 90	Train Loss: 0.0052
Epoch: 90	Validation Accuracy: 0.8550
Epoch: 100	Train Loss: 0.0072
Epoch: 100	Validation Accuracy: 0.8450
Epoch: 110	Train Loss: 0.0094
Epoch: 110	Validation Accuracy: 0.8700
Epoch: 120	Train Loss: 0.0068
Epoch: 120	Validation Accuracy: 0.8600
Epoch: 130	Train Loss: 0.0042
Epoch: 130	Validation Accuracy: 0.8650
Epoch: 140	Train Loss: 0.0039
Epoch: 140	Validation Accu

In [15]:
# Evaluation on test data
ensemble_model.eval()
test_predictions = []
with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = ensemble_model(inputs)
        predictions = torch.argmax(outputs.mean(dim=0), dim=1)
        test_predictions.extend(predictions.cpu().numpy())

test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.8700
