# Model Performance Results for First ICU Prediction

## Imports & Inits

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Add the parent directory to the import path so modules located there can be
# imported (presumably the project-local `utils` and `args` — confirm layout).
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import numpy as np
# Print numpy arrays with two decimal places.
np.set_printoptions(precision=2)

from utils.metrics import BinaryAvgMetrics
# NOTE(review): the star import hides where names come from. The helpers used
# below (plot_mean_roc, plot_confusion_matrix, plot_prob) presumably come from
# here — consider importing them explicitly.
from utils.plots import *

import pandas as pd
import pickle
from pathlib import Path

from args import args
# Display the run configuration as the cell output.
vars(args)

{'path': PosixPath('../data'),
 'workdir': PosixPath('../data/work_dir'),
 'dataset_csv': PosixPath('../data/processed_dataset.csv'),
 'temporal_pkl': PosixPath('../data/temporal_notes.pkl'),
 'min_freq': 3,
 'bc_threshold': {'lr': 0.47, 'rf': 0.32, 'cnn': 0.23}}

## 100-Run Performance Results

In [3]:
# Models to evaluate: one entry per key of the configured threshold mapping.
models = list(args.bc_threshold.keys())

# Maps model name -> BinaryAvgMetrics built from that model's saved predictions.
bams = {}

for model in models:
  # Each preds.pkl holds three consecutive pickled objects, read in the order
  # targets, predictions, probabilities.
  # NOTE(review): pickle.load can execute arbitrary code; only load prediction
  # files that were produced by this project.
  with open(args.workdir/model/'preds.pkl', 'rb') as f:
    targs = pickle.load(f)
    preds = pickle.load(f)
    probs = pickle.load(f)

  bams[model] = BinaryAvgMetrics(targs, preds, probs)

# Maps model name -> list of averaged metric values, in the row order returned
# by get_avg_metrics().
all_metrics = {}

for key, bam in bams.items():
  # Build the averaged-metrics table once per model rather than once per row
  # access, then read its 'Value' column directly.
  avg_metrics = bam.get_avg_metrics()
  all_metrics[key] = avg_metrics['Value'].tolist()

# One column per model, one row per metric.
# NOTE(review): the row labels are hard-coded and assume get_avg_metrics()
# returns exactly these six metrics in this order — confirm against utils.metrics.
metrics = pd.DataFrame(all_metrics, index=['sensitivity', 'specificity', 'ppv', 'auroc', 'npv', 'f1'])
metrics

Unnamed: 0,lr,rf,cnn
sensitivity,0.894,0.822,0.808
specificity,0.62,0.732,0.748
ppv,0.422,0.487,0.504
auroc,0.843,0.857,0.86
npv,0.95,0.93,0.928
f1,0.573,0.612,0.621


### Model Plots

In [None]:
# Choose which model's results the following cells in this section display.
# NOTE(review): this reuses the name `model` left over from the loading loop above.
model = 'cnn'
# Show the averaged-metrics table for the selected model as the cell output.
bams[model].get_avg_metrics()

In [None]:
# Draw the ROC plot for the currently selected model on a single axes, using
# the targets and probabilities stored on its BinaryAvgMetrics object.
selected = bams[model]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
plot_mean_roc(ax, selected.targs, selected.probs)

In [None]:
# Side-by-side confusion matrices for the selected model: raw values on the
# left, normalized on the right. Both panels use the same averaged matrix.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# (normalize flag, panel title) for each axes, left to right.
panel_specs = (
  (False, 'Confusion Matrix Over Runs'),
  (True, 'Normalized Confusion Matrix Over Runs'),
)

for panel_ax, (normalize, title) in zip(ax, panel_specs):
  plot_confusion_matrix(panel_ax, bams[model].cm_avg, classes=['not imminent', 'imminent'],
                        normalize=normalize, title=title)
plt.show()

## Temporal Performance Analysis

In [None]:
# Load the processed dataset, parsing the four timestamp columns as datetimes.
timestamp_cols = ['admittime', 'dischtime', 'intime', 'charttime']
df = pd.read_csv(args.dataset_csv, parse_dates=timestamp_cols)

# Offset of each row's chart time from its `intime`, stored as a timedelta column.
df['relative_charttime'] = df['charttime'].sub(df['intime'])

In [None]:
# Plot produced by the plot_prob helper (presumably from utils.plots) from the
# dataset frame and the per-model thresholds in the config.
# NOTE(review): the meaning and units of cutoff=20 and interval=12 are not
# visible in this notebook — confirm against the helper and name them as constants.
fig, ax = plt.subplots(figsize=(12, 8))
plot_prob(ax, df, args.bc_threshold, cutoff=20, interval=12)

### Logistic Regression Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Restore the logistic-regression classifier and its vocabulary from full_run.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts that
# were produced by this project.
with open(args.workdir/'lr'/'full_run.pkl', 'rb') as f:
  # The file holds two consecutive pickled objects: classifier, then vocabulary.
  clf = pickle.load(f)
  vocabulary = pickle.load(f)

# Rebuild the vectorizer around the saved vocabulary.
# min_df and ngram_range are deliberately not passed: scikit-learn ignores
# min_df when a vocabulary is supplied, and ignores ngram_range (with a warning
# in recent versions) when the analyzer is a callable, so they had no effect.
vectorizer = TfidfVectorizer(analyzer=str.split, sublinear_tf=True, vocabulary=vocabulary)

In [None]:
# Pre-allocate a zero-filled array with one row per entry of `period` and four
# columns; the evaluation loop below fills one row per period.
# NOTE(review): `period` is not defined in any cell visible above — presumably
# it is loaded from args.temporal_pkl; confirm and add the loading cell so the
# notebook survives Restart & Run All.
# NOTE(review): this rebinds `metrics`, which an earlier cell assigned to the
# per-model results DataFrame.
metrics = np.zeros((len(period), 4))

In [None]:
import pdb

In [None]:
# Evaluate the logistic-regression model separately on each temporal period and
# store that period's metrics in row i of `metrics`.
# NOTE(review): `period` and `temporal_notes` are not defined in any visible
# cell — presumably loaded from args.temporal_pkl; confirm and add that cell.
lr_threshold = args.bc_threshold['lr']

for i in range(len(period)):
  label = period[i]
  # Kept under its own name so the dataset frame `df` loaded earlier for
  # plot_prob is not overwritten by this loop.
  period_notes = temporal_notes[i][['scispacy_note', 'class_label']]
  # NOTE(review): fit_transform re-estimates the IDF weights on each period's
  # notes, so feature values may differ from those the classifier was trained
  # on. The vectorizer is built from the vocabulary alone, so using the
  # training-time IDF would need the fitted vectorizer — confirm against the
  # training code.
  prob = clf.predict_proba(vectorizer.fit_transform(period_notes['scispacy_note']))[:, 1]
  # Binarize the positive-class probability with the configured LR threshold.
  y_pred = (prob > lr_threshold).astype(np.int64)
  cm = confusion_matrix(period_notes['class_label'].to_numpy(), y_pred)
  print(label)
  print(cm)
  metrics[i] = get_metrics(cm)