In [None]:
from math import sqrt
from os import cpu_count
from pathlib import Path
from random import seed
from urllib.request import urlopen

import numpy as np
import pandas as pd
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import CSVLogger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, LabelBinarizer
from torch import FloatTensor, manual_seed, save
from torch.cuda import is_available, empty_cache
from torch.nn import Linear, ReLU, BCELoss, Sigmoid
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset


# Fixed seed so that a fresh "Restart & Run All" reproduces the same results.
# Every random number generator used by the notebook is seeded: Python's
# `random`, NumPy's global state and torch's RNG.
# TODO: remove for real randomness?
SEED = 1234
seed(SEED)
np.random.seed(SEED)
manual_seed(SEED)

# Name of the local copy of the dataset and the URL it is fetched from.
COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
ADDRESS = f"https://raw.githubusercontent.com/propublica/compas-analysis/master/{COMPAS_INPUT_FILE}"

print(f"Looking for file '{COMPAS_INPUT_FILE}' in the current directory...")
path = Path(f"./{COMPAS_INPUT_FILE}")

if not path.exists():
    print("File not found! Downloading from GitHub...")
    # Download fully *before* creating the local file: if the request fails,
    # no empty/partial file is left behind that a later run would mistake
    # for a valid cached copy. The `with` block also closes the connection.
    with urlopen(ADDRESS) as response:
        payload = response.read()
    path.write_bytes(payload)
    print(f"File '{COMPAS_INPUT_FILE}' download and saved locally.")
else:
    print("File found in current directory.")


# Release cached CUDA memory (if any) and show whether a GPU is usable;
# the bare last expression is displayed as the cell output.
empty_cache()
is_available()

In [None]:
# Columns fed to the model. Continuous columns are standardised; categorical
# columns are encoded with one LabelBinarizer each (kept in `binarizers` so
# the encodings can be reused or inverted later).
FEATURE_LIST_CONTINUOUS = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'c_length_of_stay', 'f_length_of_stay']
FEATURE_LIST_CATEGORICAL = ['sex', 'race', 'c_charge_degree']
binarizers = dict()

# Read the same file the download step checked for.
df = pd.read_csv(COMPAS_INPUT_FILE, delimiter=',')
df = df.dropna(subset=["days_b_screening_arrest"])

rows_to_keep = (
    df.days_b_screening_arrest.between(-30, 30)  # inclusive on both ends
    & (df.is_recid != -1)
    & (df.c_charge_degree != 'O')
    # pandas parses the literal "N/A" as NaN by default, and `NaN != 'N/A'`
    # is True, so the string comparison alone never drops those rows.
    & df.score_text.notna()
    & (df.score_text != 'N/A')
)
# `.copy()` so that the column assignments below act on an independent frame
# rather than on a slice of the unfiltered data.
df = df[rows_to_keep].copy()

# Derived durations, in whole days.
df['c_length_of_stay'] = (pd.to_datetime(df['c_jail_out']) - pd.to_datetime(df['c_jail_in'])).dt.days
df['f_length_of_stay'] = (pd.to_datetime(df['c_arrest_date']) - pd.to_datetime(df['c_offense_date'])).dt.days
# NOTE(review): computed as in_custody - out_custody, i.e. start minus end;
# the column is not used as a feature here - confirm the intended sign.
df['length_of_custody'] = (pd.to_datetime(df['in_custody']) - pd.to_datetime(df['out_custody'])).dt.days
df['f_length_of_stay'] = df['f_length_of_stay'].fillna(0)
df = df.reset_index(drop=True)

labels_dataset = df['two_year_recid']

# Standardise every continuous column (column-wise, zero mean / unit variance).
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split below, so test rows influence the scaling.
continuous_block = scale(df[FEATURE_LIST_CONTINUOUS].to_numpy(dtype=float))

# Encode each categorical column and remember its fitted encoder.
categorical_blocks = []
for attribute in FEATURE_LIST_CATEGORICAL:
    enc = LabelBinarizer()
    categorical_blocks.append(enc.fit_transform(df[attribute]))
    binarizers[attribute] = enc

# Final feature matrix: continuous columns first, then the encoded categoricals,
# in the order given by the two feature lists.
features_dataset = np.hstack([continuous_block, *categorical_blocks])

# Passing the seed explicitly makes this cell give the same split every time it
# is re-run, instead of depending on how far the global NumPy RNG has advanced.
features_train, features_test, labels_train, labels_test = train_test_split(
    features_dataset, labels_dataset, test_size=0.1, random_state=SEED
)

In [None]:
class ClassificatorModel(LightningModule):
    """Fully connected binary classifier trained with binary cross-entropy.

    The network is Linear -> ReLU -> Linear -> ReLU -> Linear -> ReLU ->
    Linear -> Sigmoid, so `forward` returns one value in [0, 1] per sample.

    Args:
        n_features: width of the input vectors (15 by default; presumably the
            number of columns of the feature matrix - confirm against the data).
        hidden_size: width of the first and third hidden layers.
        wide_size: width of the middle hidden layer.
    """

    def __init__(self, n_features=15, hidden_size=500, wide_size=10000):
        super().__init__()
        self.loss = BCELoss()
        self.constructor = Linear(n_features, hidden_size)
        self.activator = ReLU()
        self.reconstructor = Linear(hidden_size, wide_size)
        self.reactivator = ReLU()
        self.deconstructor = Linear(wide_size, hidden_size)
        self.deactivator = ReLU()
        self.folder = Linear(hidden_size, 1)
        self.wrapper = Sigmoid()

    def forward(self, x):
        """Run the layers in order and return the sigmoid output, shape (batch, 1)."""
        x = self.constructor(x)
        x = self.activator(x)
        x = self.reconstructor(x)
        x = self.reactivator(x)
        x = self.deconstructor(x)
        x = self.deactivator(x)
        x = self.folder(x)
        x = self.wrapper(x)
        return x

    @staticmethod
    def _unpack(batch):
        """Return `(features, labels)` from a plain or a singly nested batch.

        A batch is normally the pair `(features, labels)`. When the Trainer is
        handed a one-element list of train dataloaders, the pair arrives
        wrapped in a one-element list instead; both shapes are accepted.
        """
        if len(batch) == 1:
            batch = batch[0]
        x, y = batch
        return x, y

    def _accuracy(self, x, y):
        """Fraction of samples whose rounded prediction equals the label."""
        return (x.flatten().round() == y).float().mean()

    def score(self, batch):
        """Compute the accuracy of the model on one `(features, labels)` batch."""
        x, y = self._unpack(batch)
        x = self.forward(x)
        return self._accuracy(x, y)

    def training_step(self, batch, _):
        """Compute and log the training loss and accuracy; return the loss."""
        x, y = self._unpack(batch)
        x = self.forward(x)
        # Predictions are (batch, 1); give the labels the same shape for BCELoss.
        loss = self.loss(x, y.unsqueeze(1))
        self.log("train_loss", loss)
        self.log("train_acc", self._accuracy(x, y))
        return loss

    def validation_step(self, batch, _):
        """Compute and log the validation loss (`val_loss`) and accuracy (`val_acc`)."""
        x, y = self._unpack(batch)
        x = self.forward(x)
        loss = self.loss(x, y.unsqueeze(1))
        self.log("val_loss", loss)
        self.log("val_acc", (self._accuracy(x, y)))

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 0.001."""
        return Adam(self.parameters(), lr=0.001)

In [None]:
# Number of DataLoader worker processes. `os.cpu_count()` returns None when the
# CPU count cannot be determined, which is not a valid `num_workers`; fall back
# to 0, i.e. load the data in the main process.
cpus = cpu_count() or 0
# Batch size heuristic: square root of the number of training samples.
batch = round(sqrt(len(labels_train)))

# Convert the NumPy feature matrices and the pandas label series to float tensors.
features_train_tensor = FloatTensor(features_train)
labels_train_tensor = FloatTensor(labels_train.to_numpy())
features_test_tensor = FloatTensor(features_test)
labels_test_tensor = FloatTensor(labels_test.to_numpy())

# Pair each feature row with its label.
train_dataset = TensorDataset(features_train_tensor, labels_train_tensor)
test_dataset = TensorDataset(features_test_tensor, labels_test_tensor)

# Only the training data is shuffled; the test loader keeps the dataset order.
train_dataloader = DataLoader(train_dataset, num_workers=cpus, batch_size=batch, shuffle=True)
test_dataloader = DataLoader(test_dataset, num_workers=cpus, batch_size=batch)

In [None]:
# Upper bound on training epochs; early stopping usually ends training sooner.
epochs = 100

model = ClassificatorModel()
# Stop once the validation accuracy has not improved for 5 validation runs.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the final test accuracy is not an unbiased estimate - consider a separate split.
early_stopping = EarlyStopping(monitor="val_acc", patience=5, verbose=False, mode="max")
trainer = Trainer(max_epochs=epochs, log_every_n_steps=1, logger=CSVLogger("logs"), callbacks=[early_stopping])
# Keep the one-element-list form for the loaders: ClassificatorModel.training_step
# is written for the nested batch that a list of train dataloaders produces.
trainer.fit(model, train_dataloaders=[train_dataloader], val_dataloaders=[test_dataloader])

# Create the output directory first: `save` does not create missing parent
# directories and would otherwise fail only after the whole training run.
model_file = "./models/classifier.pth"
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
# The whole module object is pickled (not just its state_dict), so loading it
# requires the ClassificatorModel class to be importable.
save(model, model_file)