# Introduction

In this notebook I abstain using the per-diagnosis decision thresholds that maximize F1 on the training set. I want to compare the final F1 score of this pipeline with that of my MC dropout + LR method: if this baseline scores lower, it means my method was able to progress past it.

**TL;DR**: this threshold-based abstention improves f1@30 (admission level) from 0.38 to 0.40, whereas my method improves it from 0.38 up to 0.45!

# WandB

In [1]:
import wandb

# Change directory to the project root (two levels up)

In [2]:
import os

# Name of the repository root folder; this notebook is expected to sit two folders below it.
PROJECT_ROOT_NAME = 'master-thesis'

cwd = os.getcwd()
# candidate project root: the grandparent of the directory the kernel is currently in
new_cwd = os.path.dirname(os.path.dirname(cwd))

if os.path.basename(cwd) == PROJECT_ROOT_NAME:
    # protection against running this cell multiple times: the kernel is already
    # at the project root, so leave the working directory untouched
    new_cwd = cwd
    print('Working directory already changed previously as intended. Ignoring...')
elif os.path.basename(new_cwd) == PROJECT_ROOT_NAME:
    # first run in this kernel: move to the project root so relative paths resolve
    os.chdir(new_cwd)
else:
    # neither location is the project root: fail loudly instead of running the
    # rest of the notebook against the wrong folder
    raise RuntimeError(
        f"Expected '{PROJECT_ROOT_NAME}' two levels above {cwd!r}; "
        "start the kernel from the notebook's own folder."
    )

# Imports

In [3]:
# Show the value of every top-level expression in a cell, not only the last one.
# Later cells depend on this, e.g. the three consecutive bare `len(...)` calls
# each produce their own output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
import os
import json

from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
from rnn_utils import RNN, train_one_epoch, eval_model, compute_loss, outs2df, compute_metrics, get_prediction_thresholds

from Abstention.utils import plot_reliability,get_prediction_thresholds,ece

from config import Settings; settings = Settings()


import torch
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.model_selection import ParameterGrid, ParameterSampler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import wandb

# Model reproducibility

In [5]:
# Reproducibility: seed the NumPy and PyTorch (CPU + CUDA) random number generators
# with the seed stored in the project settings.
seed = settings.random_seed
np.random.seed(seed)
torch.manual_seed(seed)  # returns the torch Generator, which is echoed as this cell's output
torch.cuda.manual_seed(seed)

# cuDNN flags: request deterministic behaviour and switch off benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

<torch._C.Generator at 0x114dc45b0>

# Load dataset

In [6]:
# Identifier of the dataset variant to load; it is also the last component of its folder path.
dataset_id = 'diag_only'
# <data_base>/<model_ready_dataset_folder>/<dataset_id>; both prefixes are read from `settings`
dataset_folder = os.path.join(settings.data_base,settings.model_ready_dataset_folder,dataset_id)
print('dataset at',dataset_folder)

dataset at data/model_ready_dataset/diag_only


In [7]:
grouping = 'ccs'  # coding-scheme
batch_size = 64


def _load_diagnoses(json_name):
    """Build a DiagnosesDataset from `json_name` inside `dataset_folder`, using `grouping`."""
    return DiagnosesDataset(os.path.join(dataset_folder, json_name), grouping)


def _make_loader(subset, **loader_kwargs):
    """Wrap `subset` in a DataLoader with the notebook's batch size and a fresh MYCOLLATE(dataset)."""
    return DataLoader(
        subset,
        batch_size=batch_size,
        collate_fn=MYCOLLATE(dataset),
        **loader_kwargs,
    )


# full dataset first: the collate functions of all three loaders are built from it
dataset = _load_diagnoses('dataset.json')

# pre-computed train / validation / test subsets stored next to the full dataset
train_dataset = _load_diagnoses('train_subset.json')
val_dataset = _load_diagnoses('val_subset.json')
test_dataset = _load_diagnoses('test_subset.json')

# subset sizes, each displayed as a separate cell output
len(train_dataset)
len(val_dataset)
len(test_dataset)

# only the training loader shuffles
train_dataloader = _make_loader(train_dataset, shuffle=True)
# batch_size here is arbitrary and doesn't affect total validation speed
val_dataloader = _make_loader(val_dataset)
test_dataloader = _make_loader(test_dataset)

5249

1125

1125

# Train model

Define hyper-parameters

In [8]:
# remaining hyperparameters of best model

# --- dimensions -------------------------------------------------------------
# read from one training batch: size of axis 2 of the target sequence tensor
first_batch = next(iter(train_dataloader))
input_size = first_batch['target_sequences']['sequence'].shape[2]
n_labels = input_size  # the model emits as many labels as it has input features

# --- architecture -----------------------------------------------------------
hidden_size = 100
num_layers = 1
rnn_type = 'lstm'
model_type ='deterministic'

# --- optimisation -----------------------------------------------------------
lr = 0.01
epochs = 15
criterion = torch.nn.BCEWithLogitsLoss()

In [9]:
# Run description (seed, architecture and optimisation settings) gathered in a single mapping.
# NOTE(review): presumably meant for experiment tracking; it is not referenced again in
# the cells of this notebook.
config = dict(
    seed=seed,
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    n_labels=n_labels,
    rnn_type=rnn_type,
    lr=lr,
    optim='adam',
    epochs=epochs,
    model_type=model_type,
)

and now train

In [10]:
"""
model = RNN(input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            n_labels=n_labels,
            model=rnn_type,
           )
    
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

criterion = torch.nn.BCEWithLogitsLoss(reduction='none')

# train
for idx,epoch in enumerate(range(1,epochs+1)):
    loss = train_one_epoch(model, train_dataloader, epoch, criterion, optimizer);
    _,metrics = eval_model(model,val_dataloader,dataset, ['recall@30'])
    if idx % 5 == 0 or idx == epochs-1:
        print(f"epoch {epoch}\t| loss {loss}\t| recall@30 {metrics['recall@30_adm']}") 
"""

'\nmodel = RNN(input_size=input_size,\n            hidden_size=hidden_size,\n            num_layers=num_layers,\n            n_labels=n_labels,\n            model=rnn_type,\n           )\n    \noptimizer = torch.optim.Adam(model.parameters(), lr=lr)\n\ncriterion = torch.nn.BCEWithLogitsLoss(reduction=\'none\')\n\n# train\nfor idx,epoch in enumerate(range(1,epochs+1)):\n    loss = train_one_epoch(model, train_dataloader, epoch, criterion, optimizer);\n    _,metrics = eval_model(model,val_dataloader,dataset, [\'recall@30\'])\n    if idx % 5 == 0 or idx == epochs-1:\n        print(f"epoch {epoch}\t| loss {loss}\t| recall@30 {metrics[\'recall@30_adm\']}") \n'

or load

In [11]:
# Locate the saved model: <data_base>/<models_folder>/<model_name>/
model_name = 'pleasant-music-50'
model_folder = os.path.join(settings.data_base,settings.models_folder,model_name)
hypp_save_path = os.path.join(model_folder, 'hyper_parameters.json')
weights_save_path = os.path.join(model_folder,"weights")

# Constructor arguments implied by the hyper-parameter cell. Kept for reference only:
# the model below is built from the JSON stored next to the weights, not from this dict.
params = dict(input_size = input_size,
              hidden_size=hidden_size,
              num_layers=num_layers,
              n_labels=n_labels,
              model=rnn_type
             )

# hyperparameters the checkpoint was saved with
with open(hypp_save_path,'r') as f:
    params_loaded = json.load(f)

# weights: read the checkpoint from disk once and reuse the loaded object below
weights = torch.load(weights_save_path)

# rebuild the architecture from the stored hyper-parameters, then restore its weights;
# the result of `load_state_dict` is the cell output and reports key mismatches, if any
new_model = RNN(**params_loaded)
new_model.load_state_dict(weights)

<All keys matched successfully>

# basic performance

In [12]:
# Run the loaded model over the training split and collect its outputs as a DataFrame.
# `return_golden = True` makes `outs2df` return a second frame as well — presumably the
# ground-truth labels aligned with the outputs (verify in rnn_utils).
train_outs, train_golden = outs2df(new_model,train_dataloader,dataset,return_golden = True)

In [18]:
# Decision thresholds are fitted on the TRAIN outputs (method 'max f1') and then used when
# scoring the VALIDATION split, so the evaluation does not tune on the data it reports on.
# NOTE(review): `get_prediction_thresholds` is imported from both `rnn_utils` and
# `Abstention.utils`; the second import comes last, so that is the version called here.
decision_thresholds = get_prediction_thresholds(train_outs,train_golden,method='max f1')
loss, metric = eval_model(new_model,val_dataloader,dataset,decision_thresholds,['f1','f1@30'])
metric

metrics
f1_diag       0.192786
f1@30_diag    0.249744
f1_adm        0.271918
f1@30_adm     0.380333
dtype: float64

# Findings

The final admission-level F1 was 0.27 (f1@30: 0.38). Compared to the F1 obtained with MC dropout + LR, it is lower.

# Now abstaining based on the threshold of each diagnosis

In [40]:
model_outputs, golden = outs2df(new_model,val_dataloader,dataset,return_golden = True)
k = 30

# For every admission (row) keep only its k largest outputs. `nlargest` returns just the
# selected labels, so the assembled frame holds NaN everywhere else and lacks any column
# that never made it into a top-k.
topk_outputs = model_outputs.apply(lambda row: row.nlargest(k),axis=1)

# fix missing columns from previous operation: `reindex` adds them back (all NaN) and
# restores the original column order in one step, keeping the frame numeric
topk_outputs_all_cols = topk_outputs.reindex(columns=model_outputs.columns)

## sometimes k > (#logits>0) so we will turn all 0 logits into nan so that the following lines don't convert them to predictions
topk_outputs_all_cols = topk_outputs_all_cols.mask(topk_outputs_all_cols == 0,np.nan)
# done, continuing...

# binary prediction matrix: 1 where an output survived the top-k filter, 0 elsewhere
# (same index and columns as `model_outputs`)
topk_predictions = topk_outputs_all_cols.notna().astype(int)

In [41]:
def abstain(row,model_outputs,decision_thresholds):
    """
    Withdraw the positive predictions of one admission whose score is below threshold.

    Parameters
    ----------
    row : pd.Series
        Predictions (0 or 1) of all diagnoses for a given admission. ``row.name`` must
        be the label of that admission in ``model_outputs``.
    model_outputs : pd.DataFrame
        Model scores, with one row per admission and one column per diagnosis.
    decision_thresholds : pd.DataFrame
        Indexed by diagnosis, with a ``'threshold'`` column holding the decision
        threshold of each diagnosis. Only diagnoses predicted as 1 are looked up.

    Returns
    -------
    pd.Series
        A copy of ``row`` in which every 1 whose score is strictly below the threshold
        of its diagnosis has been turned into 0. ``row`` itself is left unchanged.
    """
    new_row = row.copy()
    admission_outputs = model_outputs.loc[row.name,:]
    # `Series.iteritems` was removed in pandas 2.0; `items` works on old and new versions
    for index,elem in row.items():
        if elem == 1:
            # explicit label-based lookups, so integer diagnosis labels are never
            # mistaken for positions
            proba_predicted = admission_outputs.loc[index]

            if decision_thresholds.loc[index,'threshold'] > proba_predicted:
                new_row.loc[index] = 0
    return new_row

In [42]:
# Apply `abstain` admission by admission (axis=1 passes one row per call): top-k
# predictions whose score is below the threshold of their diagnosis are set to 0.
top_k_predictions_abstained = topk_predictions.apply(lambda row: abstain(row, model_outputs,decision_thresholds),axis=1)

In [43]:
# Zero the score of everything outside the top-k (NaN) and of every prediction that is 0
# after abstention, so scores and binary predictions agree.
abstained_cells = top_k_predictions_abstained == 0
topk_outputs_all_cols_after_abstention = (
    topk_outputs_all_cols
    .fillna(0)
    .mask(abstained_cells, 0)
)

# metrics computed on the abstained predictions against the validation targets
metric_names = ['precision@30','recall@30','f1@30']
metrics_w_abstention = compute_metrics(
    topk_outputs_all_cols_after_abstention,
    top_k_predictions_abstained,
    golden,
    metric_names,
)

In [44]:
# Skip the first three entries and display the rest: the admission-level (`_adm`) metrics.
# NOTE(review): this positional slice assumes the three requested metrics yield three
# leading non-admission entries — confirm the ordering if the metric list changes.
metrics_w_abstention.iloc[3:]

metrics
precision@30_adm    0.333004
recall@30_adm       0.576964
f1@30_adm           0.403593
dtype: float64

# Conclusion 2 (more important)

Abstaining with the thresholds that maximize F1 on the training set does not improve F1 as much as my method does: f1@30 reaches 0.40 here versus 0.45 with MC dropout + LR.