# Dataset Preparation for First ICU Prediction

## Imports & Inits

In [1]:
# Load IPython's autoreload extension and set it to mode 2 so that imported
# modules (e.g. the local `utils` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
np.set_printoptions(precision=2)

from utils.metrics import BinaryAvgMetrics
from utils.plots import *

import pandas as pd
import pickle
from pathlib import Path

In [3]:
# Root data directory and the working directory nested beneath it.
path = Path('./data')
workdir = path.joinpath('work_dir')

## 100 Run Performance Results

In [None]:
models = ['lr', 'rf']

# One BinaryAvgMetrics object per model, built from the three objects pickled
# (in this order) in <workdir>/<model>/preds.pkl: targets, predictions, probabilities.
bams = {}

for model in models:
  with open(workdir/model/'preds.pkl', 'rb') as f:
    targs = pickle.load(f)
    preds = pickle.load(f)
    probs = pickle.load(f)

  bams[model] = BinaryAvgMetrics(targs, preds, probs)

# Collect the 'Value' column of each model's averaged metrics.
# get_avg_metrics() is called once per model instead of once per row.
all_metrics = {}

for key, bam in bams.items():
  avg_metrics = bam.get_avg_metrics()
  all_metrics[key] = avg_metrics['Value'].tolist()

# NOTE(review): the index labels are hard-coded; this assumes get_avg_metrics()
# returns its rows in exactly this order -- TODO confirm against utils.metrics.
metrics = pd.DataFrame(all_metrics, index=['sensitivity', 'specificity', 'ppv', 'auroc', 'npv', 'f1'])
metrics

In [None]:
# Select the model whose results are inspected in the following cells
# ('lr' is one of the keys loaded into `bams`).
model = 'lr'
# Display the averaged metrics for the selected model.
bams[model].get_avg_metrics()

In [None]:
# Single 10x8 axes for the ROC plot of the model currently selected in `model`.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# plot_mean_roc is presumably provided by the `utils.plots` star import; it is
# given the axes plus the stored targets and probabilities for this model.
plot_mean_roc(ax, bams[model].targs, bams[model].probs)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: raw averaged confusion matrix; right panel: the same matrix normalized.
class_names = ('not imminent', 'imminent')
panels = (
  (False, 'Confusion Matrix Over Runs'),
  (True, 'Normalized Confusion Matrix Over Runs'),
)

for axis, (normalize, title) in zip(ax, panels):
  plot_confusion_matrix(axis, bams[model].cm_avg, classes=list(class_names),
                        normalize=normalize, title=title)
plt.show()

## Subset data for Temporal Analysis

In [None]:
# Load the processed dataset, parsing the four timestamp columns as datetimes.
timestamp_cols = ['admittime', 'dischtime', 'intime', 'charttime']
df = pd.read_csv(path/'processed_dataset.csv', parse_dates=timestamp_cols)

# Hours between `intime` and `charttime` (negative when charttime precedes intime).
# Timedelta.value is in nanoseconds; int() truncates toward zero.
ns_per_hour = 1e9*60*60
elapsed = df['charttime'] - df['intime']
df['relative_charttime'] = elapsed.apply(lambda delta: int(delta.value/ns_per_hour))
df['relative_charttime'].describe()

In [None]:
# All values are in hours, on the same scale as `relative_charttime`
# (charttime minus intime, so negative values lie before `intime`).
# Two windows are binned: [e1, s1] and [e2, s2], each in `step`-hour periods.
e1 = -480
s1 = -120
e2 = -72
s2 = -24
step = 6


def _bin_window(notes, start, stop, width):
  """Split `notes` into consecutive `width`-hour periods covering [start, stop].

  Interior periods are half-open (lo ≤ t < hi) so that a note whose
  `relative_charttime` falls exactly on a boundary is assigned to one period
  only. The final period keeps its upper bound (lo ≤ t ≤ hi), so the periods
  together still cover the closed window [start, stop].

  Returns a list of (period_label, DataFrame copy) tuples in ascending order.
  """
  hours = notes['relative_charttime']
  binned = []
  for lo in range(start, stop, width):
    hi = lo + width
    if hi < stop:
      period = f'{lo} ≤ t < {hi}'
      in_period = (hours >= lo) & (hours < hi)
    else:
      # last period of the window: closed on both ends
      period = f'{lo} ≤ t ≤ {hi}'
      in_period = (hours >= lo) & (hours <= hi)
    binned.append((period, notes.loc[in_period].copy()))
  return binned


temporal_notes = _bin_window(df, e1, s1, step) + _bin_window(df, e2, s2, step)

In [None]:
# Recompute the hour offset of each note relative to `intime`.
# Timedelta.value is in nanoseconds; int() truncates toward zero.
elapsed = df['charttime'] - df['intime']
df['relative_charttime'] = elapsed.apply(lambda delta: int(delta.value/(1e9*60*60)))
df['relative_charttime'].describe()

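The summed length of all per-period subsets and the length of the selection taken directly over the two full windows should not be different; the check below prints their difference.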

In [None]:
# Rebuild the per-period subsets for both windows, [e1, s1] and [e2, s2].
# Interior periods are half-open (i ≤ t < i+step) so that a note whose
# relative_charttime sits exactly on a boundary is counted in one period only;
# the last period of each window stays closed so the window's upper bound is kept.
temporal_notes = []

for window_start, window_end in ((e1, s1), (e2, s2)):
  for i in range(window_start, window_end, step):
    upper = i + step
    hours = df['relative_charttime']
    if upper < window_end:
      period = f'{i} ≤ t < {upper}'
      in_period = (hours >= i) & (hours < upper)
    else:
      period = f'{i} ≤ t ≤ {upper}'
      in_period = (hours >= i) & (hours <= upper)
    tmp = df.loc[in_period].copy()
    temporal_notes.append((period, tmp))

In [None]:
# Total number of rows across all per-period subsets.
s = sum(len(notes) for _, notes in temporal_notes)

# Rows selected directly over the two closed windows [e1, s1] and [e2, s2].
hours = df['relative_charttime']
in_windows = hours.between(e1, s1) | hours.between(e2, s2)
tmp = df.loc[in_windows].copy()

# Difference between the two counts.
print(s - len(tmp))

In [None]:
# Persist the per-period subsets. The context manager closes (and therefore
# flushes) the file before any later cell reads it back.
with open(path/'temporal_notes.pkl', 'wb') as f:
  pickle.dump(temporal_notes, f)

## Temporal Performance Analysis

In [4]:
# Reload the per-period subsets; the context manager closes the file handle.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles produced by this project.
with open(path/'temporal_notes.pkl', 'rb') as f:
  temporal_notes = pickle.load(f)

### Logistic Regression Model

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [6]:
# full_run.pkl holds two pickled objects in sequence: the classifier first,
# then the transformer.
full_run_path = workdir/'lr'/'full_run.pkl'
with open(full_run_path, 'rb') as fh:
  clf = pickle.load(fh)
  transformer = pickle.load(fh)

In [7]:
# Take the first (label, notes) pair and keep only the note text and the label column.
label, first_period_notes = temporal_notes[0]
df = first_period_notes[['scispacy_note', 'class_label']]

In [9]:
# TF-IDF vectorizer restricted to the vocabulary of the loaded transformer.
# NOTE(review): per the scikit-learn docs, `min_df` is ignored when `vocabulary`
# is given and `ngram_range` is ignored when `analyzer` is a callable -- confirm
# these arguments are only kept to mirror the training configuration.
vectorizer = TfidfVectorizer(
  min_df=3,
  analyzer=str.split,
  sublinear_tf=True,
  ngram_range=(2,2),
  vocabulary=transformer.vocabulary_,
)

In [10]:
# Vectorize with the transformer that was loaded alongside `clf`, using
# transform() only. Calling fit_transform() on a new vectorizer would re-learn
# the IDF weights from this evaluation subset, so the features would be scaled
# differently from the ones the classifier was trained on.
# NOTE(review): assumes `transformer` is the fitted vectorizer used in training
# (it exposes `vocabulary_`) -- confirm.
x = transformer.transform(df['scispacy_note'])

In [11]:
# Keep the second column of predict_proba's output.
# NOTE(review): presumably the probability of the positive ('imminent') class,
# i.e. clf.classes_[1] -- confirm.
prob = clf.predict_proba(x)[:, 1]