# First ICU Prediction using CNN with Word Embeddings

## Imports & Inits

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('../')

import warnings
# Installed at notebook level on purpose: wrapping this call alone in
# `warnings.catch_warnings()` restores the previous filters as soon as the
# block exits, so nothing executed afterwards would be silenced.
warnings.simplefilter("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

import pickle
import numpy as np
import pandas as pd

from collections import OrderedDict
from functools import partial
from sklearn.metrics import *

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader

from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, Precision, Recall
from ignite.contrib.handlers.param_scheduler import CosineAnnealingScheduler

from cnn_classifier.dataset import NoteDataset
from cnn_classifier.model import NoteClassifier
from cnn_classifier.containers import ModelContainer, DataContainer
from cnn_classifier.trainer import IgniteTrainer
from utils.embeddings import PretrainedEmbeddings
from utils.plots import *
from utils.metrics import BinaryAvgMetrics, get_best_model

from utils.splits import *
from args import args
vars(args)

{'workdir': PosixPath('../data/work_dir/cnn'),
 'dataset_csv': PosixPath('../data/processed_dataset.csv'),
 'batch_size': 128,
 'min_freq': 3,
 'hidden_dim': 100,
 'dropout_p': 0.1,
 'emb_dropout': 0.1,
 'n_channels': 100,
 'lr': 0.001,
 'wd': 0.0,
 'n_epochs': 15,
 'checkpointer_save_total': 1,
 'emb_path': PosixPath('../pretrained/glove/glove.6B.50d.txt'),
 'emb_sz': 50,
 'checkpointer_prefix': 'glove50_cnn',
 'device': 'cuda:2',
 'checkpointer_name': 'epoch',
 'checkpointer_save_every': 5,
 'early_stop_patience': 10,
 'bc_threshold': 0.23,
 'cols': ['class_label', 'scispacy_note'],
 'start_seed': 127}

## Functions

In [None]:
def get_sample(df, sample_pct=0.01, with_val=True, seed=None):
  """Draw a random fraction of the rows of `df`, separately per split.

  Args:
    df: DataFrame with a 'split' column; rows labelled 'train' (and 'val'
      when `with_val` is true) are sampled.
    sample_pct: fraction of each split to keep, forwarded to
      `DataFrame.sample(frac=...)`.
    with_val: when true, the sampled 'val' rows are appended after the
      sampled 'train' rows.
    seed: `random_state` forwarded to `DataFrame.sample`; with `None` every
      call draws a different sample.

  Returns:
    A DataFrame with a fresh 0..n-1 index containing the sampled 'train'
    rows, followed by the sampled 'val' rows when requested.
  """
  def _sample_split(name):
    # Sample one split and discard the original row labels.
    rows = df.loc[df['split'] == name]
    return rows.sample(frac=sample_pct, random_state=seed).reset_index(drop=True)

  train = _sample_split('train')
  if not with_val:
    return train

  # ignore_index gives the combined frame a single unique index; concatenating
  # the two re-indexed pieces as-is would repeat the labels 0..k and make
  # label-based lookups such as `.loc[i]` return two rows.
  return pd.concat([train, _sample_split('val')], axis=0, ignore_index=True)

def convert_probs(output, thresh):
  """Turn raw model scores into hard 0/1 labels.

  `output` is a `(scores, targets)` pair. The scores are passed through a
  sigmoid and every value strictly greater than `thresh` becomes 1 (as a
  long tensor). The targets are handed back unchanged.
  """
  scores, targets = output
  probabilities = torch.sigmoid(scores)
  labels = probabilities.gt(thresh).long()
  return labels, targets

def predict_proba(clf, x_test):
  """Return sigmoid probabilities for `x_test` as a NumPy array.

  Args:
    clf: callable mapping `x_test` to a tensor of raw scores (logits),
      typically an `nn.Module`.
    x_test: input batch accepted by `clf`.

  Returns:
    `numpy.ndarray` with the same shape as `clf(x_test)`, holding
    `sigmoid(clf(x_test))`.

  If `clf` is a module currently in training mode it is switched to eval mode
  for the forward pass (so dropout does not randomise the probabilities) and
  switched back afterwards.
  """
  was_training = getattr(clf, 'training', False)
  if was_training:
    clf.eval()
  try:
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      probs = torch.sigmoid(clf(x_test))
  finally:
    if was_training:
      clf.train()
  # .cpu() is a no-op for CPU tensors and makes .numpy() valid for CUDA output.
  return probs.cpu().numpy()

## Sample Training

In [None]:
# Seed handed to set_all_splits below.
seed = 42
# Read only the columns listed in args.cols from the processed dataset.
ori_df = pd.read_csv(args.dataset_csv, usecols=args.cols)
# A copy is passed, so set_all_splits never receives ori_df itself. The two 0.1
# arguments are presumably the validation and test fractions -- TODO confirm
# against utils.splits.set_all_splits.
df = set_all_splits(ori_df.copy(), 0.1, 0.1, seed=seed)

In [None]:
# Forward the notebook seed so the sample is identical on every run;
# get_sample draws a fresh random sample whenever its seed is None.
sample_df = get_sample(df, seed=seed)
sample_df.shape

In [None]:
# Data loaders over the split DataFrame, including a test loader.
dc = DataContainer(
    df,
    NoteDataset,
    args.workdir,
    bs=args.batch_size,
    with_test=True,
    min_freq=args.min_freq,
    create_vec=True,
    weighted_sampling=True,
)
# Iterator over training batches; the sanity-check cell pulls one batch from it.
itr = iter(dc.train_dl)

# Pretrained vectors, reduced to a custom embedding matrix for the tokens
# returned by dc.get_vocab_tokens().
pe = PretrainedEmbeddings.from_file(args.emb_path)
pe.make_custom_embeddings(dc.get_vocab_tokens())

# CNN classifier initialised with the custom pretrained embeddings.
classifier = NoteClassifier(
    args.emb_sz,
    dc.get_vocab_size(),
    args.n_channels,
    args.hidden_dim,
    dc.n_classes,
    dropout_p=args.dropout_p,
    emb_dropout=args.emb_dropout,
    pretrained=pe.custom_embeddings,
)

In [None]:
# Binary classification on raw logits.
loss_fn = nn.BCEWithLogitsLoss()

# Adam with the learning rate and weight decay taken from args.
optimizer = optim.Adam(classifier.parameters(), lr=args.lr, weight_decay=args.wd)

# Halve the learning rate once the monitored quantity has stopped decreasing
# for more than one epoch.
reduce_lr = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

mc = ModelContainer(classifier, loss_fn, optimizer, reduce_lr)

In [None]:
# Sanity check: run a single training batch through the model and print its loss.
x, y = next(itr)
y_pred = classifier(x)
batch_loss = loss_fn(y_pred, y)
print(batch_loss)

In [None]:
# Only the loss is tracked during this run.
metrics = OrderedDict(loss=Loss(loss_fn))
# Output transform that thresholds sigmoid probabilities at args.bc_threshold;
# defined here but not passed to any metric in this cell.
bce_logits_wrapper = partial(convert_probs, thresh=args.bc_threshold)

ig = IgniteTrainer(mc, dc, args, metrics, log_training=True, early_stop=True)
# Keep whatever run() returns (elsewhere in this notebook the same call
# displays a checkpoint file name).
model_name = ig.run()

In [None]:
# Take the first test batch and make sure batch, targets and model all live on the CPU.
cpu_device = torch.device('cpu')
x_test, targ = next(iter(dc.test_dl))
x_test, targ = x_test.to(cpu_device), targ.to(cpu_device)
classifier = classifier.to(cpu_device)

In [None]:
# Probabilities for the test batch, then hard labels at the configured threshold.
prob = predict_proba(classifier, x_test)
above_threshold = np.greater(prob, args.bc_threshold)
pred = above_threshold.astype(np.int64)

In [None]:
# Single 12x8 figure for the threshold sweep.
fig, ax = plt.subplots(figsize=(12,8))
# plot_thresh_range presumably comes from the utils.plots star import; 0.1, 0.9
# and 25 look like lower bound, upper bound and number of thresholds -- TODO confirm.
plot_thresh_range(ax, targ, prob, 0.1, 0.9, 25)

In [None]:
# Confusion matrix for the test batch; rows are true classes, columns predicted.
cm = confusion_matrix(targ, pred)
(tn, fp), (fn, tp) = cm[0], cm[1]

total = tn + fp + fn + tp
prevalence = (fn + tp) / total
sensitivity = tp / (tp + fn)          # recall / true positive rate
specificity = tn / (tn + fp)          # true negative rate
ppv = tp / (tp + fp)                  # precision
npv = tn / (tn + fn)
f1 = (2 * ppv * sensitivity) / (ppv + sensitivity)
auroc = roc_auc_score(targ, prob)

# Row order of the summary table.
raw_scores = [
  ('sensitivity', sensitivity),
  ('specificity', specificity),
  ('ppv', ppv),
  ('npv', npv),
  ('f1', f1),
  ('auroc', auroc),
  ('prevalence', prevalence),
]
d = {label: np.round(score, 3) for label, score in raw_scores}

# One-column table: metric names as the index, rounded values under 'Value'.
metrics = pd.Series(d, name='Value').to_frame()
metrics

In [None]:
# Display the value returned by ig.run() in the sample-training cell.
model_name

In [None]:
# Raw counts on the left, normalised matrix on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
left_panel, right_panel = ax[0], ax[1]

plot_confusion_matrix(
    left_panel,
    cm,
    classes=['not imminent', 'imminent'],
    normalize=False,
    title='Confusion matrix',
)
plot_confusion_matrix(
    right_panel,
    cm,
    classes=['not imminent', 'imminent'],
    normalize=True,
    title='Normalized confusion matrix',
)
plt.show()

## Testing

In [None]:
# Rebuild the loaders for evaluation; load_vec=True is passed here where the
# training cell passed create_vec=True.
dc = DataContainer(
    df,
    NoteDataset,
    args.workdir,
    bs=args.batch_size,
    with_test=True,
    min_freq=args.min_freq,
    load_vec=True,
)

# Quick overview of the container: dataset size, batch sizes, number of batches.
for summary in (dc.get_dataset_size(), dc.get_batch_sizes(), dc.get_num_batches()):
  print(summary)

In [None]:
pe = PretrainedEmbeddings.from_file(args.emb_path)
pe.make_custom_embeddings(dc.get_vocab_tokens())

# Same constructor arguments as the training cells (emb_dropout included) so the
# model being restored is configured like the one that was trained.
classifier = NoteClassifier(args.emb_sz, dc.get_vocab_size(), args.n_channels, args.hidden_dim, dc.n_classes,
                            dropout_p=args.dropout_p, emb_dropout=args.emb_dropout,
                            pretrained=pe.custom_embeddings)

# `args` as printed at the top of this notebook has no `modelfile` entry, so
# reading it unconditionally raises AttributeError. Use it when present and
# otherwise fall back to the checkpoint name returned by the training run;
# checkpoints are read from <workdir>/models elsewhere in this notebook.
model_path = getattr(args, 'modelfile', None)
if model_path is None:
  model_path = args.workdir/'models'/model_name

# map_location='cpu': args.device is a CUDA device, while the evaluator below is
# created without a device, so the weights must be loadable without that GPU.
state_dict = torch.load(model_path, map_location='cpu')
classifier.load_state_dict(state_dict);

### Plots

In [None]:
# Read the training log and drop its last row before plotting.
log_path = args.workdir/'training_log.csv'
log = pd.read_csv(log_path).iloc[:-1]
log.head()

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

def _plot_train_val(metric, panel):
  """Plot the training and validation curves of one logged metric on `panel`."""
  return log.plot(
      x='epoch',
      y=[f'training_{metric}', f'validation_{metric}'],
      kind='line',
      title=f'Training and validation {metric}',
      ax=panel,
  )

# 2x2 grid: loss and accuracy on the top row, precision and recall below.
_plot_train_val('loss', axes[0][0])
_plot_train_val('accuracy', axes[0][1])
_plot_train_val('precision', axes[1][0])
_plot_train_val('recall', axes[1][1])

### Test set

In [None]:
loss_fn = nn.BCEWithLogitsLoss()
# Precision and Recall need hard labels, so their output is first thresholded
# at args.bc_threshold by convert_probs.
bce_logits_wrapper = partial(convert_probs, thresh=args.bc_threshold)
metrics = OrderedDict([
    ('loss', Loss(loss_fn)),
    ('sensitivity', Recall(bce_logits_wrapper)),
    ('ppv', Precision(bce_logits_wrapper)),
])

In [None]:
evaluator = create_supervised_evaluator(classifier, metrics=metrics)

@evaluator.on(Events.COMPLETED)
def log_testing_results(engine):
  """Print each metric computed by the evaluator, rounded to three decimals."""
  results = engine.state.metrics
  for name in results:
    score = results[name]
    print(f"{name} {score:0.3f}")

# Evaluate on the test loader; the handler above prints once the run completes.
evaluator.run(dc.test_dl)

### Interpretation

In [None]:
# Rank the entries of the first row of `classifier.emb.weight`, largest first.
# NOTE(review): the sort runs over the components of a single weight row. If
# `emb` is an embedding table (rows = tokens), these indices are embedding
# dimensions rather than vocabulary ids -- confirm that passing them to
# `vocab.lookup_idx` in the following cells is meaningful.
emb = classifier.emb.weight.detach()[0]
ranking = emb.sort(dim=0, descending=True)
idxs = ranking.indices.numpy().tolist()

In [None]:
# First 20 entries of the ranking, looked up in the vocabulary.
print("Influential words in positive class:")
print("--------------------------------------")
rank = 0
while rank < 20:
    token = dc.vectorizer.vocab.lookup_idx(idxs[rank])
    print(token)
    rank += 1

print("====\n\n\n")

In [None]:
# Bottom 20 words (tail of the ranking, lowest value first)
print("Influential words in negative class:")
print("--------------------------------------")
# Read the ranking from its tail instead of reversing `idxs` in place: an
# in-place reverse flips the list again on every re-run of this cell and
# changes what the positive-class cell prints afterwards.
for idx in idxs[:-21:-1]:
    print(dc.vectorizer.vocab.lookup_idx(idx))

print("====\n\n\n")

## Metrics

In [None]:
# run this if preds.pkl is not generated
# Each preds_<k>.pkl (k = 1..4) stores three consecutive pickles, read here as
# targets, predictions and probabilities; they are concatenated in file order.
targs, preds, probs = [], [], []

for part in range(1, 5):
  with open(args.workdir/f'preds_{part}.pkl', 'rb') as f:
    part_targs = pickle.load(f)
    part_preds = pickle.load(f)
    part_probs = pickle.load(f)

  targs.extend(part_targs)
  preds.extend(part_preds)
  probs.extend(part_probs)

# Write the merged lists back as three consecutive pickles in the same order.
with open(args.workdir/'preds.pkl', 'wb') as f:
  for merged in (targs, preds, probs):
    pickle.dump(merged, f)

Metric definitions, taken from [this guide](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/):

1. Prevalence: `(fn + tp) / total`
2. Sensitivity: AKA recall, true positive rate `tp / (tp + fn)`
3. Specificity: AKA true negative rate `tn / (tn + fp)`
4. Positive Predictive Value (PPV): AKA precision `tp / (tp + fp)`
5. Negative Predictive Value (NPV): `tn / (tn + fn)`

In [None]:
# preds.pkl holds three consecutive pickles: targets, predictions, probabilities.
with open(args.workdir/'preds.pkl', 'rb') as f:
  targs, preds, probs = [pickle.load(f) for _ in range(3)]

# One checkpoint file name per seed, for 100 consecutive seeds starting at args.start_seed.
first_seed = args.start_seed
seed_range = range(first_seed, first_seed + 100)
fnames = [f'glove50_cnn_{seed}_epoch_15.pth' for seed in seed_range]

In [None]:
# Wrap the targets, predictions and probabilities loaded from preds.pkl.
bam = BinaryAvgMetrics(targs, preds, probs)
# Display the averaged metrics (their exact contents are defined in utils.metrics).
bam.get_avg_metrics()

In [None]:
# Presumably picks the best-scoring run and maps it to its checkpoint name in
# `fnames` -- confirm against utils.metrics.get_best_model.
get_best_model(bam, fnames)

In [None]:
# Same summary with conf=0.95 -- presumably adds 95% confidence intervals;
# confirm in utils.metrics.
bam.get_avg_metrics(conf=0.95)

In [None]:
# Single panel for the ROC plot built from the targets and probabilities held by `bam`.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
plot_mean_roc(ax, bam.targs, bam.probs)

In [None]:
# Averaged confusion matrix from `bam`: raw values on the left, normalised on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
left_panel, right_panel = ax[0], ax[1]

plot_confusion_matrix(
    left_panel,
    bam.cm_avg,
    classes=['not imminent', 'imminent'],
    normalize=False,
    title='Confusion Matrix Over Runs',
)
plot_confusion_matrix(
    right_panel,
    bam.cm_avg,
    classes=['not imminent', 'imminent'],
    normalize=True,
    title='Normalized Confusion Matrix Over Runs',
)
plt.show()

## Full Data

In [4]:
# Reload the dataset and label every row as training data: this section fits
# the model on everything, with no held-out test loader (with_test=False).
df = pd.read_csv(args.dataset_csv, usecols=args.cols)
df['split'] = 'train'
dc = DataContainer(
    df,
    NoteDataset,
    args.workdir,
    bs=args.batch_size,
    with_test=False,
    min_freq=args.min_freq,
    create_vec=True,
    weighted_sampling=True,
)

# Custom embedding matrix for the tokens returned by dc.get_vocab_tokens().
pe = PretrainedEmbeddings.from_file(args.emb_path)
pe.make_custom_embeddings(dc.get_vocab_tokens())

classifier = NoteClassifier(
    args.emb_sz,
    dc.get_vocab_size(),
    args.n_channels,
    args.hidden_dim,
    dc.n_classes,
    dropout_p=args.dropout_p,
    emb_dropout=args.emb_dropout,
    pretrained=pe.custom_embeddings,
)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.lr, weight_decay=args.wd)
# Halve the learning rate once the monitored quantity has stopped decreasing
# for more than one epoch.
reduce_lr = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

mc = ModelContainer(classifier, loss_fn, optimizer, reduce_lr)



In [5]:
# Train on the full dataset, tracking only the loss; training logging and
# early stopping are both switched off for this run.
metrics = OrderedDict(loss=Loss(loss_fn))
ig = IgniteTrainer(
    mc,
    dc,
    args,
    metrics,
    log_training=False,
    early_stop=False,
)
ig.run()

Epoch [1/15]: [397/397] 100%|██████████, loss=5.93e-01 [01:56<00:00]


Training loss 0.581 
Validation loss 0.486 


Epoch [2/15]: [397/397] 100%|██████████, loss=4.78e-01 [01:55<00:00]


Training loss 0.486 
Validation loss 0.000 


Epoch [3/15]: [397/397] 100%|██████████, loss=3.89e-01 [01:55<00:00]


Training loss 0.374 
Validation loss 0.000 


Epoch [4/15]: [397/397] 100%|██████████, loss=3.32e-01 [01:55<00:00]


Training loss 0.304 
Validation loss 0.000 


Epoch [5/15]: [397/397] 100%|██████████, loss=2.80e-01 [01:55<00:00]


Training loss 0.252 
Validation loss 0.000 


Epoch [6/15]: [397/397] 100%|██████████, loss=2.38e-01 [01:55<00:00]


Training loss 0.236 
Validation loss 0.000 


Epoch [7/15]: [397/397] 100%|██████████, loss=2.10e-01 [01:55<00:00]


Training loss 0.206 
Validation loss 0.000 


Epoch [8/15]: [397/397] 100%|██████████, loss=1.97e-01 [01:55<00:00]


Training loss 0.174 
Validation loss 0.000 


Epoch [9/15]: [397/397] 100%|██████████, loss=1.89e-01 [01:55<00:00]


Training loss 0.166 
Validation loss 0.000 


Epoch [10/15]: [397/397] 100%|██████████, loss=1.78e-01 [01:55<00:00]


Training loss 0.159 
Validation loss 0.000 


Epoch [11/15]: [397/397] 100%|██████████, loss=1.77e-01 [01:55<00:00]


Training loss 0.153 
Validation loss 0.000 


Epoch [12/15]: [397/397] 100%|██████████, loss=1.77e-01 [01:54<00:00]


Training loss 0.150 
Validation loss 0.000 


Epoch [13/15]: [397/397] 100%|██████████, loss=1.70e-01 [01:55<00:00]


Training loss 0.147 
Validation loss 0.000 


Epoch [14/15]: [397/397] 100%|██████████, loss=1.68e-01 [01:54<00:00]


Training loss 0.149 
Validation loss 0.000 


Epoch [15/15]: [397/397] 100%|██████████, loss=1.68e-01 [01:54<00:00]


Training loss 0.140 
Validation loss 0.000 


'glove50_cnn_epoch_4.pth'

In [6]:
# Save the training dataset's vectorizer, passing the work directory as the target.
dc.train_ds.save_vectorizer(args.workdir)

In [None]:
# Wrap the full DataFrame with the fitted vectorizer and serve it as one batch
# (batch size equals the dataset length).
test_ds = NoteDataset.load_data_and_vectorizer(df, dc.vectorizer)
n_examples = len(test_ds)
test_dl = DataLoader(test_ds, batch_size=n_examples)

In [7]:
# Load the epoch-15 checkpoint, mapping its tensors onto the CPU whatever
# device they were saved from.
state_dict = torch.load(args.workdir/'models/glove50_cnn_epoch_15.pth', map_location='cpu')

In [9]:
# Bundle the weights and the vectorizer into one file as two consecutive
# pickles: state dict first, vectorizer second. Readers must call pickle.load
# twice, in that order.
export_path = args.workdir/'full_data_model.pkl'
with open(export_path, 'wb') as f:
  for payload in (state_dict, dc.train_ds.vectorizer):
    pickle.dump(payload, f)

In [None]:
# Copy the loaded weights into the classifier; the trailing ';' keeps the
# notebook from echoing the call's return value.
classifier.load_state_dict(state_dict);

In [None]:
# Pull the first batch from test_dl (the loader was built with a batch size
# equal to the dataset length).
x, y = next(iter(test_dl))

In [None]:
# Keep the batch and the model on the same (CPU) device for inference.
cpu_device = torch.device('cpu')
x = x.to(cpu_device)
classifier = classifier.to(cpu_device)

In [None]:
# Ask PyTorch to release GPU memory it is caching but no longer using.
torch.cuda.empty_cache()

In [None]:
# Raw model outputs for the batch (predict_proba applies a sigmoid to the same
# outputs, i.e. treats them as logits). Note this rebinds the notebook-level `pred`.
pred = classifier(x)

In [None]:
# Inline version of predict_proba for the current batch. `clf` only exists as
# predict_proba's parameter name, so the notebook-level model and batch are
# used here instead.
torch.sigmoid(classifier(x)).detach().numpy()

In [None]:
# Sigmoid probabilities for the batch, as a NumPy array.
prob = predict_proba(classifier, x)

In [None]:
# Inspect the shape of the probability array.
prob.shape