In [40]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import StepLR

from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd
import copy
from time import time


In [None]:
# filter dplyr warnings
%load_ext rpy2.ipython
import warnings
warnings.filterwarnings('ignore')

In [5]:
%%R
library(dplyr)
library(ggplot2)
raw_data <- read.csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
nrow(raw_data)

df <- dplyr::select(raw_data, age, c_charge_degree, race, age_cat, score_text, sex, priors_count, 
                    days_b_screening_arrest, decile_score, is_recid, two_year_recid, c_jail_in, c_jail_out) %>% 
        filter(days_b_screening_arrest <= 30) %>%
        filter(days_b_screening_arrest >= -30) %>%
        filter(is_recid != -1) %>%
        filter(c_charge_degree != "O") %>%
        filter(score_text != 'N/A')
nrow(df)

df$length_of_stay <- as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in))

df <- mutate(df, crime_factor = factor(c_charge_degree)) %>%
      mutate(age_factor = as.factor(age_cat)) %>%
      within(age_factor <- relevel(age_factor, ref = 1)) %>%
      mutate(race_factor = factor(race)) %>%
      within(race_factor <- relevel(race_factor, ref = 3)) %>%
      mutate(gender_factor = factor(sex, labels= c("Female","Male"))) %>%
      within(gender_factor <- relevel(gender_factor, ref = 2)) %>%
      mutate(score_factor = factor(score_text != "Low", labels = c("LowScore","HighScore")))

df <- df %>% select('age', 'race_factor', 'gender_factor', 'score_factor', 'priors_count', 'length_of_stay', 'crime_factor')

df %>% head()

  age      race_factor gender_factor score_factor priors_count length_of_stay
1  69            Other          Male     LowScore            0              1
2  34 African-American          Male     LowScore            0             10
3  24 African-American          Male     LowScore            4              1
4  44            Other          Male     LowScore            0              1
5  41        Caucasian          Male    HighScore           14              6
6  43            Other          Male     LowScore            3              1
  crime_factor
1            F
2            F
3            F
4            M
5            F
6            F


In [6]:
compas_df_categorical = %R df
compas_df = %R df
compas_df.head()

Unnamed: 0,age,race_factor,gender_factor,score_factor,priors_count,length_of_stay,crime_factor
1,69,Other,Male,LowScore,0,1.0,F
2,34,African-American,Male,LowScore,0,10.0,F
3,24,African-American,Male,LowScore,4,1.0,F
4,44,Other,Male,LowScore,0,1.0,M
5,41,Caucasian,Male,HighScore,14,6.0,F


In [8]:
cat_features = ['race_factor','gender_factor', 'crime_factor']
numeric_features = ['age','priors_count', 'length_of_stay']
target_feature = ["score_factor"]

In [12]:
from sklearn.preprocessing import LabelEncoder 
ly = LabelEncoder()

compas_df['race_factor'] = ly.fit_transform(compas_df['race_factor'])
race_mapping = dict(zip(ly.classes_, range(0, len(ly.classes_)+1)))
compas_df['gender_factor'] = ly.fit_transform(compas_df['gender_factor'])
gender_mapping = dict(zip(ly.classes_, range(0, len(ly.classes_)+1)))
compas_df['crime_factor'] = ly.fit_transform(compas_df['crime_factor'])
crime_factor_mapping = dict(zip(ly.classes_, range(0, len(ly.classes_)+1)))

# compas_df['score_factor'] = ly.fit_transform(compas_df['score_factor'])
# score_factor_mapping = dict(zip(ly.classes_, range(0, len(ly.classes_)+1)))

compas_df['score_factor'] = compas_df['score_factor'].map({'HighScore' : 1, 'LowScore': 0})

In [157]:
x = compas_df[cat_features+numeric_features]
y = compas_df[target_feature]

X, X_test, Y, y_test = train_test_split(x, y, test_size=0.1, random_state=2137)

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.1, random_state=2137)

X_train = torch.from_numpy(X_train.to_numpy().astype('float32'))
X_val = torch.from_numpy(X_val.to_numpy().astype('float32'))
X_test = torch.from_numpy(X_test.to_numpy().astype('float32'))
y_train = torch.from_numpy(y_train.to_numpy().reshape(-1, 1).astype('float32'))
y_val = torch.from_numpy(y_val.to_numpy().reshape(-1, 1).astype('float32'))
y_test = torch.from_numpy(y_test.to_numpy().reshape(-1, 1).astype('float32'))

batch_size=1024
dataset_train = torch.utils.data.TensorDataset(X_train, y_train)
dataset_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

dataset_val = torch.utils.data.TensorDataset(X_val, y_val)
dataset_val = torch.utils.data.DataLoader(dataset_val, batch_size=32, shuffle=False)

dataset_test = torch.utils.data.TensorDataset(X_test, y_test)
dataset_test = torch.utils.data.DataLoader(dataset_test, batch_size=32, shuffle=False)


In [158]:
features, target = next(iter(dataset_val))

In [160]:
target.shape

torch.Size([32, 1])

# DNN Version

In [58]:
class DNN(nn.Module):

  '''
    binary_class:
      True - Output single probability of positive class for BCELoss
      False - Output probabilities for each class (positive/negative)
  '''
  def __init__(self, in_features, hidden=[128,128], act=nn.ReLU, binary_class=True):
    super(DNN, self).__init__()
    sizes = [in_features] + hidden
    if binary_class:
      sizes += [1]
    else:
      sizes += [2]
    
    layers = []
    for i in range(len(sizes)-1):
      layers.append(nn.Linear(sizes[i], sizes[i+1]))
      if i != len(sizes)-2:
        layers.append(act())
        # layers.append(nn.Dropout(0.3))

    for layer in layers:
      if isinstance(layer, nn.Linear):
          if isinstance(act, nn.Sigmoid):
              init.xavier_normal_(layer.weight)
              init.zeros_(layer.bias)
          elif isinstance(act, nn.ReLU) or isinstance(act, nn.LeakyReLU):
              init.kaiming_normal_(layer.weight)
              init.zeros_(layer.bias)

    self.layers = nn.Sequential(*layers)

  def forward(self, X):
    out = self.layers(X)
    return out



In [140]:
class DNN_model(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(DNN_model, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        self.relu = nn.ReLU()
                           
    def get_weights(self):
        return self.weight
    
    def forward(self,x):
        out = self.fc1(x)
        out = self.relu(out)
        out = F.sigmoid(self.fc2(out)) #sigmoid as we use BCELoss
        return out

In [141]:
compas_df.columns

Index(['age', 'race_factor', 'gender_factor', 'score_factor', 'priors_count',
       'length_of_stay', 'crime_factor'],
      dtype='object')

# Binary Classification (1 output)

In [142]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [143]:
len(dataset_train.dataset[0][0])

6

In [144]:
model = DNN_model(input_size=len(dataset_train.dataset[0][0]), hidden_size=64)

In [145]:
#model = DNN(in_features=len(dataset_train.dataset[0][0]), hidden=[128,128]).to(device)
NUM_EPOCHS = 50
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
criterion=nn.BCELoss()

In [146]:
model

DNN_model(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [169]:
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()

    running_loss = 0.0
    
    start_time = time()
    for batch_idx, (data, target) in enumerate(train_loader):   
        optimizer.zero_grad()   # .backward() accumulates gradients
        data = data.to(device)
        target = target.to(device) # all data & model on same device

        outputs = model(data)
        loss = criterion(outputs, target)
        running_loss += loss.item()
        if batch_idx % 30000 == 0:
          print("Runnning loss ", batch_idx, " : ", running_loss)

        loss.backward()
        optimizer.step()
    
    end_time = time()
    
    running_loss /= len(train_loader)
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    return running_loss

def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0

        for batch_idx, (data, target) in enumerate(test_loader):   
            data = data.to(device)
            target = target.to(device)

            outputs = model(data)
            target = target.flatten()
            outputs = outputs.flatten()
            loss = criterion(outputs, target).detach()
            running_loss += loss.item()

            pred = outputs.cpu().detach()
            pred[pred >= 0.5] = 1
            pred[pred < 0.5] = 0
            correct_predictions += (pred == target).sum().item()
            total_predictions += target.size(0)


        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Testing Loss: ', running_loss)
        print('Testing Accuracy: ', acc, '%')
        return running_loss, acc

In [170]:
n_epochs = 50
Train_loss = []
Test_loss = []
Test_acc = []

for i in range(n_epochs):
    print("epoch no:", i)
    train_loss = train_epoch(model, dataset_train, criterion, optimizer)
    test_loss, test_acc = test_model(model, dataset_val, criterion)
    scheduler.step(test_loss)
    Train_loss.append(train_loss)
    Test_loss.append(test_loss)
    Test_acc.append(test_acc)
    print('='*20)

epoch no: 0
Runnning loss  0  :  0.5306012034416199
Training Loss:  0.5127270340919494 Time:  0.04605579376220703 s
Testing Loss:  0.537648253970676
Testing Accuracy:  71.40287769784173 %
epoch no: 1
Runnning loss  0  :  0.5115630626678467
Training Loss:  0.5090706646442413 Time:  0.13956975936889648 s
Testing Loss:  0.5355472730265723
Testing Accuracy:  72.48201438848922 %
epoch no: 2
Runnning loss  0  :  0.5195796489715576
Training Loss:  0.5092786014080047 Time:  0.0521693229675293 s
Testing Loss:  0.5339545177088844
Testing Accuracy:  72.3021582733813 %
epoch no: 3
Runnning loss  0  :  0.5147454738616943
Training Loss:  0.5074390172958374 Time:  0.05163288116455078 s
Testing Loss:  0.5378566715452406
Testing Accuracy:  71.58273381294964 %
epoch no: 4
Runnning loss  0  :  0.5145432949066162
Training Loss:  0.5048848032951355 Time:  0.055703163146972656 s
Testing Loss:  0.5368463297684988
Testing Accuracy:  71.0431654676259 %
epoch no: 5
Runnning loss  0  :  0.5095219612121582
Traini

In [171]:
model.eval()
Y_pred, Y = [], []
for features, targets in dataset_test:
  y_pred = model(features).cpu().detach()
  y_pred[y_pred >= 0.5] = 1
  y_pred[y_pred < 0.5] = 0
  Y_pred.append(y_pred)
  Y.append(targets)
Y_pred = torch.cat(Y_pred)
Y = torch.cat(Y)
print("AUC of model :", roc_auc_score(np.array(Y),np.array(Y_pred)))

AUC of model : 0.7604640192126254
