### Training a model using pytorch_utils

In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import roc_auc_score
from prediction_utils.pytorch_utils.datasets import ArrayLoaderGenerator
from prediction_utils.pytorch_utils.models import SparseLogisticRegression, SparseLogisticRegressionEmbed, FixedWidthModel

In [2]:
# Root directory of the extracted cohort/feature artifacts. The default is the
# shared scratch directory; set PREDICTION_UTILS_DATA_PATH to run the notebook
# against a copy stored elsewhere without editing this cell. An unset or empty
# variable falls back to the default.
data_path = (
    os.environ.get("PREDICTION_UTILS_DATA_PATH")
    or "/share/pi/nigam/projects/prediction_utils/scratch/"
)
merged_name = "merged_features_binary"  # sub-directory of data_path holding the features and vocab
label_col = "LOS_7" # use length of stay >= 7 days as the outcome

Let's load the relevant data: the cohort table, the feature matrix, the feature row-id map, and the vocabulary.

In [3]:
# Resolve the three artifact directories once so each read below only names a file.
cohort_dir = os.path.join(data_path, 'cohort')
sparse_dir = os.path.join(data_path, merged_name, 'features_sparse')
vocab_dir = os.path.join(data_path, merged_name, 'vocab')

# Cohort table.
cohort = pd.read_parquet(os.path.join(cohort_dir, 'cohort.parquet'))

# Feature matrix, deserialized by joblib (pickle-based: only load files you trust).
features = joblib.load(os.path.join(sparse_dir, 'features.gz'))

# Table stored alongside the features; it is merged into the cohort in a later cell.
row_id_map = pd.read_parquet(os.path.join(sparse_dir, 'features_row_id_map.parquet'))

# Vocabulary table.
vocab = pd.read_parquet(os.path.join(vocab_dir, 'vocab.parquet'))

In [4]:
# Combine the cohort with row_id_map. No `on=` is given, so pandas joins on every
# column name the two frames share, using an inner join (cohort rows without a
# match in row_id_map are dropped).
# Presumably this adds the 'features_row_id' column used as row_id_col below -- confirm.
cohort = pd.merge(cohort, row_id_map)

In [5]:
# Settings passed to both the loader generator and the models in later cells.
# The keyword form builds the same string-keyed dict, in the same key order.
config_dict = dict(
    batch_size=128,
    convert_sparse=False,
    row_id_col='features_row_id',
    input_dim=features.shape[1],  # second dimension of the loaded feature matrix
    label_col=label_col,
    fold_id='1',
    lr=1e-5,
)

In [6]:
# Build the loader generator from the feature matrix and the cohort table,
# then create the loaders that the models below are trained on.
loader_generator = ArrayLoaderGenerator(features=features, cohort=cohort, **config_dict)
loaders = loader_generator.init_loaders(sample_keys=['train'])

In [7]:
# Train a neural network
# Network-specific settings are layered on top of the shared config; a key in
# nn_config wins if it is already present in config_dict.
nn_config = dict(num_hidden=1, hidden_dim=128, drop_prob=0.5)
config_dict = dict(config_dict, **nn_config)

model = FixedWidthModel(**config_dict)
result_dict = model.train(loaders)

cpu
Epoch 0/9
----------
Phase: train:
 auc: 0.552042, auprc: 0.239652, brier: 0.225124, loss_bce: 0.642811,
 loss: 0.642811,
Phase: val:
 auc: 0.655228, auprc: 0.412079, brier: 0.212385, loss_bce: 0.616450,
 loss: 0.616450,
Best model updated
Epoch 1/9
----------
Phase: train:
 auc: 0.619583, auprc: 0.289434, brier: 0.205852, loss_bce: 0.602159,
 loss: 0.602159,
Phase: val:
 auc: 0.661629, auprc: 0.411099, brier: 0.198796, loss_bce: 0.586675,
 loss: 0.586675,
Best model updated
Epoch 2/9
----------
Phase: train:
 auc: 0.654713, auprc: 0.340008, brier: 0.193308, loss_bce: 0.573689,
 loss: 0.573689,
Phase: val:
 auc: 0.661629, auprc: 0.398055, brier: 0.191417, loss_bce: 0.570027,
 loss: 0.570027,
Best model updated
Epoch 3/9
----------
Phase: train:
 auc: 0.708804, auprc: 0.416784, brier: 0.182129, loss_bce: 0.547338,
 loss: 0.547339,
Phase: val:
 auc: 0.662518, auprc: 0.379816, brier: 0.185672, loss_bce: 0.556957,
 loss: 0.556957,
Best model updated
Epoch 4/9
----------
Phase: train:
 

In [8]:
# Training metrics table: one row per (metric, phase, epoch), value in `performance`
result_dict['performance']

Unnamed: 0,metric,phase,epoch,performance
0,auc,train,0,0.552042
1,auc,train,1,0.619583
2,auc,train,2,0.654713
3,auc,train,3,0.708804
4,auc,train,4,0.734751
...,...,...,...,...
95,loss,val,5,0.538555
96,loss,val,6,0.532790
97,loss,val,7,0.527273
98,loss,val,8,0.522720


In [9]:
# Evaluate the model
# Build prediction-time loaders from the same generator, then run prediction
# for the 'val' phase only.
loaders_predict = loader_generator.init_loaders_predict()
predict_dict = model.predict(
    loaders_predict,
    phases=['val'],
)

Evaluating on phase: val
 auc: 0.677098, auprc: 0.371910, brier: 0.168932, loss_bce: 0.519261,
 loss: 0.519260,


In [10]:
# Evaluation metrics for the predicted phase(s): one row per metric, value in `performance`
predict_dict['performance']

Unnamed: 0,metric,phase,epoch,performance
0,auc,val,0,0.677098
1,auprc,val,0,0.37191
2,brier,val,0,0.168932
3,loss,val,0,0.51926
4,loss_bce,val,0,0.519261


In [11]:
# Per-row predictions: phase, raw model output, predicted probability, label, and row_id
predict_dict['outputs']

Unnamed: 0,phase,outputs,pred_probs,labels,row_id
0,val,-0.148067,0.412136,0,12
1,val,-1.395370,0.041565,0,15
2,val,-0.154878,0.410691,0,44
3,val,-0.775641,0.126467,0,46
4,val,-0.153334,0.417386,0,97
...,...,...,...,...,...
181,val,0.419859,0.553262,0,2001
182,val,-0.170817,0.413880,1,2013
183,val,-0.441105,0.302565,1,2026
184,val,-0.424910,0.303576,0,2047


In [12]:
# Train logistic regression on sparse data, as an alternative to the FixedWidthModel
# This class uses the EmbeddingBag layer as input
# The overrides are applied on top of the existing config, so every key set by
# earlier cells is still passed to the model constructor.
# NOTE(review): `loaders` is reused from the earlier cell and is not rebuilt here;
# confirm the 'batch_size' override takes effect without re-creating the loaders.
lr_overrides = dict(lr=1e-5, batch_size=512, drop_prob=0.0)
config_dict = dict(config_dict, **lr_overrides)
model = SparseLogisticRegressionEmbed(**config_dict)
model.train(loaders)

cpu
Epoch 0/9
----------
Phase: train:
 auc: 0.548103, auprc: 0.244899, brier: 0.229063, loss_bce: 0.651033,
 loss: 0.651033,
Phase: val:
 auc: 0.559211, auprc: 0.270579, brier: 0.225542, loss_bce: 0.643936,
 loss: 0.643936,
Best model updated
Epoch 1/9
----------
Phase: train:
 auc: 0.574694, auprc: 0.267691, brier: 0.221653, loss_bce: 0.635867,
 loss: 0.635867,
Phase: val:
 auc: 0.578592, auprc: 0.256309, brier: 0.218042, loss_bce: 0.628564,
 loss: 0.628564,
Best model updated
Epoch 2/9
----------
Phase: train:
 auc: 0.591738, auprc: 0.287574, brier: 0.214673, loss_bce: 0.621206,
 loss: 0.621206,
Phase: val:
 auc: 0.574147, auprc: 0.257453, brier: 0.212040, loss_bce: 0.616050,
 loss: 0.616050,
Best model updated
Epoch 3/9
----------
Phase: train:
 auc: 0.602684, auprc: 0.303860, brier: 0.210187, loss_bce: 0.611758,
 loss: 0.611758,
Phase: val:
 auc: 0.567034, auprc: 0.261858, brier: 0.207725, loss_bce: 0.606914,
 loss: 0.606914,
Best model updated
Epoch 4/9
----------
Phase: train:
 

{'performance':    metric  phase  epoch  performance
 0     auc  train      0     0.548103
 1     auc  train      1     0.574694
 2     auc  train      2     0.591738
 3     auc  train      3     0.602684
 4     auc  train      4     0.600791
 ..    ...    ...    ...          ...
 95   loss    val      5     0.593126
 96   loss    val      6     0.587209
 97   loss    val      7     0.582192
 98   loss    val      8     0.578228
 99   loss    val      9     0.574667
 
 [100 rows x 4 columns]}