# Imminent ICU Admission Classifier with Logistic Regression

## Imports & Inits

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../')

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import pickle
import scipy

import pandas as pd
import numpy as np
np.set_printoptions(precision=4)

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline

from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scikitplot.metrics import *

from utils.data_utils import set_group_splits
from utils.metrics import BinaryAvgMetrics, get_best_model
from utils.plots import *

In [None]:
# Project-level configuration object (paths such as mm_csv, vectordir, figdir and
# workdir are read from it below). vars(args) exposes its attributes as a dict,
# which the notebook displays as this cell's output.
from args import args
vars(args)

In [None]:
# Names of the structured feature columns, stored as a pickled list.
# The file is opened in a context manager so the handle is closed right after loading.
# NOTE(review): pickle can execute arbitrary code on load — only point
# args.str_cols_pkl at a trusted, locally generated file.
with open(args.str_cols_pkl, 'rb') as f:
  str_cols = pickle.load(f)

# Columns to read from the multimodal CSV: admission id, structured features,
# the free-text note and the target label.
cols = ['hadm_id'] + str_cols + ['note', 'imi_adm_label']

In [None]:
# Read only the columns we need, then drop rows whose label is -1 and
# renumber the index from zero.
mm_notes_vitals = pd.read_csv(args.mm_csv, usecols=cols)
keep_rows = mm_notes_vitals['imi_adm_label'].ne(-1)
mm_notes_vitals = mm_notes_vitals.loc[keep_rows].reset_index(drop=True)

# Independent per-modality copies over the same rows: notes-only and structured-only.
note_cols = ['hadm_id', 'note', 'imi_adm_label']
vital_cols = ['hadm_id'] + str_cols + ['imi_adm_label']
notes_common = mm_notes_vitals[note_cols].copy().reset_index(drop=True)
vitals_common = mm_notes_vitals[vital_cols].copy().reset_index(drop=True)

# Sanity check: print the three shapes followed by the three distinct-admission counts.
frames = (vitals_common, notes_common, mm_notes_vitals)
print(*[frame.shape for frame in frames], *[frame['hadm_id'].nunique() for frame in frames])

In [None]:
seed = 643  # passed to set_group_splits in each section below to make the train/test split
save = False  # when True, the plotting cells also write their figures under args.figdir

## Unstructured Data Classifier Dev

In [None]:
# Split the notes-only frame into train/test using hadm_id as the grouping column
# (presumably all rows of one admission land in the same split — confirm in utils.data_utils).
df = set_group_splits(notes_common.copy(), group_col='hadm_id', seed=seed)

split_col = df['split']
train_df = df[split_col.eq('train')]
test_df = df[split_col.eq('test')]

# Row counts per label, ordered by label value; index 1 is taken as the positive class.
train_counts = train_df.groupby(['imi_adm_label']).size().to_numpy()
train_pos_frac = train_counts[1] / train_counts.sum()
print(f"Prevalence of positive class in training set:{train_pos_frac*100:0.1f}%")

test_counts = test_df.groupby(['imi_adm_label']).size().to_numpy()
test_pos_frac = test_counts[1] / test_counts.sum()
print(f"Prevalence of positive class in test set:{test_pos_frac*100:0.1f}%")

In [None]:
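# One-time feature extraction, kept for reference (intentionally commented out).
# It fits a unigram+bigram TF-IDF vectorizer on the training notes and pickles the
# vectorizer together with the transformed train/test matrices; the next cell loads
# that cache instead of refitting.
# vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=60_000)
# x_note_train = vectorizer.fit_transform(train_df['note'])
# x_note_test = vectorizer.transform(test_df['note'])

# with open(args.vectordir/f'bigram_643.pkl', 'wb') as f:
#   pickle.dump(vectorizer, f)
#   pickle.dump(x_note_train, f)
#   pickle.dump(x_note_test, f)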

In [None]:
# Load the cached TF-IDF vectorizer and the train/test note matrices, in the order
# they were pickled. The cache file is named after the split seed, so changing
# `seed` fails loudly (missing file) instead of silently pairing these matrices
# with a different train/test split. For seed = 643 the path is unchanged.
# NOTE(review): pickle can execute arbitrary code on load — only use trusted caches.
with open(args.vectordir/f'bigram_{seed}.pkl', 'rb') as f:
  vectorizer = pickle.load(f)
  x_note_train = pickle.load(f)
  x_note_test = pickle.load(f)

# Targets for the current split; the shapes are displayed as a quick row-count check.
y_train, y_test = train_df['imi_adm_label'], test_df['imi_adm_label']
x_note_train.shape, x_note_test.shape, y_train.shape, y_test.shape

In [None]:
# Class-weighted logistic regression trained on the note features.
clf = LogisticRegression(class_weight='balanced').fit(x_note_train, y_train)

# Full class-probability matrix on the test split; column 1 is used as the positive class.
prob = clf.predict_proba(x_note_test)
pos_prob = prob[:, 1]

# Human-readable class names, indexed by the integer label.
labels = ['Delayed', 'Imminent']
label_test = list(map(labels.__getitem__, y_test))

In [None]:
# ROC plot for the notes-only classifier on the test split.
fig, ax = plt.subplots(figsize=(10,8))
# plot_roc is given the string labels and the full probability matrix `prob`
# (star-imported; presumably scikit-plot's plot_roc — confirm it is not shadowed by utils.plots).
plot_roc(label_test, prob, title='', ax=ax)
# Relabel the axes in clinical terms (assumes the helper plots FPR on x and TPR on y).
ax.set_xlabel('1 - Specificity')
ax.set_ylabel('Sensitivity')

In [None]:
# Draw the project's threshold-range plot for the notes-only classifier
# (presumably metrics evaluated at n_vals thresholds between lower and upper —
# confirm in utils.plots).
fig, ax = plt.subplots(figsize=(20, 10))
plot_thresh_range(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100)

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  # The notes-only figure gets its own file name: the multimodal and structured
  # sections save under 'lr_notes_vital_*' and would otherwise overwrite it.
  fig.savefig(args.figdir/'lr_notes_metrics_vary.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Threshold-selection plot for the notes-only classifier. The first element of the
# helper's return value is reported as the optimum (Youden) threshold.
fig, ax = plt.subplots(figsize=(20, 10))
best_threshold = plot_thresh_metric(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100, show_f1=False)
# Annotate near the top of the axes (95% of the upper y-limit).
ax.text(0.71, ax.get_ylim()[1]*0.95, f'Optimum Threshold = {best_threshold[0]}', fontsize=12, color='b')
print(f"Best Youden Threshold = {best_threshold[0]}")

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  # The notes-only figure gets its own file name: the multimodal and structured
  # sections save under 'lr_notes_vital_*' and would otherwise overwrite it.
  fig.savefig(args.figdir/'lr_notes_threshold_guide.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Operating point for the notes-only model (hand-set; presumably chosen from the
# threshold plot above — confirm).
threshold = 0.41
pred = np.greater(pos_prob, threshold).astype(np.int64)
label_preds = list(map(labels.__getitem__, pred))

# Unpack the 2x2 confusion matrix row by row (assumes rows = true class,
# columns = predicted class, as in scikit-learn).
cm = confusion_matrix(y_test, pred)
(tn, fp), (fn, tp) = cm

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
f1 = (2 * ppv * sensitivity) / (ppv + sensitivity)
auroc = roc_auc_score(y_test, pos_prob)

# Scores are rounded to three decimals; the threshold is reported as-is.
scores = (
  ('sensitivity', sensitivity),
  ('specificity', specificity),
  ('ppv', ppv),
  ('npv', npv),
  ('f1', f1),
  ('auroc', auroc),
)
d = {name: np.round(value, 3) for name, value in scores}
d['threshold'] = threshold

# One-column summary table, displayed as the cell output.
metrics = pd.DataFrame(list(d.values()), index=list(d), columns=['Value'])
metrics

In [None]:
# Confusion matrix of raw counts (normalize=False) comparing the true string labels
# with the thresholded notes-only predictions from the previous cell.
fig, ax = plt.subplots(figsize=(11, 8))
plot_confusion_matrix(label_test, label_preds, x_tick_rotation=45, ax=ax, normalize=False)

## Multimodal Classifier Dev

In [None]:
# Split the multimodal frame into train/test using hadm_id as the grouping column
# (presumably all rows of one admission land in the same split — confirm in utils.data_utils).
df = set_group_splits(mm_notes_vitals.copy(), group_col='hadm_id', seed=seed)

split_col = df['split']
train_df = df[split_col.eq('train')]
test_df = df[split_col.eq('test')]

# Row counts per label, ordered by label value; index 1 is taken as the positive class.
train_counts = train_df.groupby(['imi_adm_label']).size().to_numpy()
train_pos_frac = train_counts[1] / train_counts.sum()
print(f"Prevalence of positive class in training set:{train_pos_frac*100:0.1f}%")

test_counts = test_df.groupby(['imi_adm_label']).size().to_numpy()
test_pos_frac = test_counts[1] / test_counts.sum()
print(f"Prevalence of positive class in test set:{test_pos_frac*100:0.1f}%")

In [None]:
# Load the cached TF-IDF vectorizer and the train/test note matrices, in the order
# they were pickled. The cache file is named after the split seed, so changing
# `seed` fails loudly (missing file) instead of silently pairing these matrices
# with a different train/test split. For seed = 643 the path is unchanged.
# NOTE(review): the cached rows must line up with this section's train_df/test_df;
# this assumes set_group_splits gives the same partition for the same seed and
# admissions — TODO confirm.
# NOTE(review): pickle can execute arbitrary code on load — only use trusted caches.
with open(args.vectordir/f'bigram_{seed}.pkl', 'rb') as f:
  vectorizer = pickle.load(f)
  x_note_train = pickle.load(f)
  x_note_test = pickle.load(f)

# Targets for the current split; the shapes are displayed as a quick row-count check.
y_train, y_test = train_df['imi_adm_label'], test_df['imi_adm_label']
x_note_train.shape, x_note_test.shape, y_train.shape, y_test.shape

In [None]:
# Structured features as plain NumPy arrays, one per split; shapes shown as cell output.
x_vitals_train = train_df[str_cols].to_numpy()
x_vitals_test = test_df[str_cols].to_numpy()
x_vitals_train.shape, x_vitals_test.shape

In [None]:
# Column-wise concatenation of the structured features (first) and the note
# features (second), returned directly in CSR format.
train_blocks = (x_vitals_train, x_note_train)
test_blocks = (x_vitals_test, x_note_test)
x_train = scipy.sparse.hstack(train_blocks, format='csr')
x_test = scipy.sparse.hstack(test_blocks, format='csr')
x_train.shape, x_test.shape

In [None]:
# Class-weighted logistic regression trained on the combined structured + note features.
clf = LogisticRegression(class_weight='balanced').fit(x_train, y_train)

# Full class-probability matrix on the test split; column 1 is used as the positive class.
prob = clf.predict_proba(x_test)
pos_prob = prob[:, 1]

# Human-readable class names, indexed by the integer label.
labels = ['Delayed', 'Imminent']
label_test = list(map(labels.__getitem__, y_test))

In [None]:
# ROC plot for the multimodal classifier on the test split.
fig, ax = plt.subplots(figsize=(10,8))
# plot_roc is given the string labels and the full probability matrix `prob`
# (star-imported; presumably scikit-plot's plot_roc — confirm it is not shadowed by utils.plots).
plot_roc(label_test, prob, title='', ax=ax)
# Relabel the axes in clinical terms (assumes the helper plots FPR on x and TPR on y).
ax.set_xlabel('1 - Specificity')
ax.set_ylabel('Sensitivity')

In [None]:
# Draw the project's threshold-range plot for the multimodal classifier
# (presumably metrics evaluated at n_vals thresholds between lower and upper —
# confirm in utils.plots).
fig, ax = plt.subplots(figsize=(20, 10))
plot_thresh_range(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100)

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  fig.savefig(args.figdir/'lr_notes_vital_metrics_vary.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Threshold-selection plot for the multimodal classifier. The first element of the
# helper's return value is reported as the optimum (Youden) threshold.
fig, ax = plt.subplots(figsize=(20, 10))
best_threshold = plot_thresh_metric(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100, show_f1=False)
# Annotate near the top of the axes (95% of the upper y-limit).
ax.text(0.71, ax.get_ylim()[1]*0.95, f'Optimum Threshold = {best_threshold[0]}', fontsize=12, color='b')
print(f"Best Youden Threshold = {best_threshold[0]}")

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  fig.savefig(args.figdir/'lr_notes_vital_threshold_guide.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Operating point for the multimodal model (hand-set; presumably chosen from the
# threshold plot above — confirm).
threshold = 0.45
pred = np.greater(pos_prob, threshold).astype(np.int64)
label_preds = list(map(labels.__getitem__, pred))

# Unpack the 2x2 confusion matrix row by row (assumes rows = true class,
# columns = predicted class, as in scikit-learn).
cm = confusion_matrix(y_test, pred)
(tn, fp), (fn, tp) = cm

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
f1 = (2 * ppv * sensitivity) / (ppv + sensitivity)
auroc = roc_auc_score(y_test, pos_prob)

# Scores are rounded to three decimals; the threshold is reported as-is.
scores = (
  ('sensitivity', sensitivity),
  ('specificity', specificity),
  ('ppv', ppv),
  ('npv', npv),
  ('f1', f1),
  ('auroc', auroc),
)
d = {name: np.round(value, 3) for name, value in scores}
d['threshold'] = threshold

# One-column summary table, displayed as the cell output.
metrics = pd.DataFrame(list(d.values()), index=list(d), columns=['Value'])
metrics

In [None]:
# Confusion matrix of raw counts (normalize=False) comparing the true string labels
# with the thresholded multimodal predictions from the previous cell.
fig, ax = plt.subplots(figsize=(11, 8))
plot_confusion_matrix(label_test, label_preds, x_tick_rotation=45, ax=ax, normalize=False)

## Structured Classifier Dev

In [None]:
# Split the structured-only frame into train/test using hadm_id as the grouping column
# (presumably all rows of one admission land in the same split — confirm in utils.data_utils).
df = set_group_splits(vitals_common.copy(), group_col='hadm_id', seed=seed)

split_col = df['split']
train_df = df[split_col.eq('train')]
test_df = df[split_col.eq('test')]

# Row counts per label, ordered by label value; index 1 is taken as the positive class.
train_counts = train_df.groupby(['imi_adm_label']).size().to_numpy()
train_pos_frac = train_counts[1] / train_counts.sum()
print(f"Prevalence of positive class in training set:{train_pos_frac*100:0.1f}%")

test_counts = test_df.groupby(['imi_adm_label']).size().to_numpy()
test_pos_frac = test_counts[1] / test_counts.sum()
print(f"Prevalence of positive class in test set:{test_pos_frac*100:0.1f}%")

In [None]:
# Targets and structured feature matrices for each split; shapes shown as cell output.
y_train = train_df['imi_adm_label']
y_test = test_df['imi_adm_label']
x_vitals_train = train_df[str_cols].to_numpy()
x_vitals_test = test_df[str_cols].to_numpy()
x_vitals_train.shape, x_vitals_test.shape

In [None]:
# Class-weighted logistic regression trained on the structured features only.
clf = LogisticRegression(class_weight='balanced').fit(x_vitals_train, y_train)

# Full class-probability matrix on the test split; column 1 is used as the positive class.
prob = clf.predict_proba(x_vitals_test)
pos_prob = prob[:, 1]

# Human-readable class names, indexed by the integer label.
labels = ['Delayed', 'Imminent']
label_test = list(map(labels.__getitem__, y_test))

In [None]:
# ROC plot for the structured-only classifier on the test split.
fig, ax = plt.subplots(figsize=(10,8))
# plot_roc is given the string labels and the full probability matrix `prob`
# (star-imported; presumably scikit-plot's plot_roc — confirm it is not shadowed by utils.plots).
plot_roc(label_test, prob, title='', ax=ax)
# Relabel the axes in clinical terms (assumes the helper plots FPR on x and TPR on y).
ax.set_xlabel('1 - Specificity')
ax.set_ylabel('Sensitivity')

In [None]:
# Draw the project's threshold-range plot for the structured-only classifier
# (presumably metrics evaluated at n_vals thresholds between lower and upper —
# confirm in utils.plots).
fig, ax = plt.subplots(figsize=(20, 10))
plot_thresh_range(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100)

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  # The structured-only figure gets its own file name: saving under
  # 'lr_notes_vital_*' would overwrite the multimodal section's figure.
  fig.savefig(args.figdir/'lr_vital_metrics_vary.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Threshold-selection plot for the structured-only classifier. The first element of
# the helper's return value is reported as the optimum (Youden) threshold.
fig, ax = plt.subplots(figsize=(20, 10))
best_threshold = plot_thresh_metric(ax, y_test, pos_prob, lower=0.1, upper=0.81, n_vals=100, show_f1=False)
# Annotate near the top of the axes (95% of the upper y-limit).
ax.text(0.71, ax.get_ylim()[1]*0.95, f'Optimum Threshold = {best_threshold[0]}', fontsize=12, color='b')
print(f"Best Youden Threshold = {best_threshold[0]}")

if save:
  # The savefig keyword is `bbox_inches`; the previous `box_inches` spelling is not
  # a savefig option, so the tight bounding box was never applied.
  # The structured-only figure gets its own file name: saving under
  # 'lr_notes_vital_*' would overwrite the multimodal section's figure.
  fig.savefig(args.figdir/'lr_vital_threshold_guide.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Operating point for the structured-only model (hand-set; presumably chosen from
# the threshold plot above — confirm).
threshold = 0.53
pred = np.greater(pos_prob, threshold).astype(np.int64)
label_preds = list(map(labels.__getitem__, pred))

# Unpack the 2x2 confusion matrix row by row (assumes rows = true class,
# columns = predicted class, as in scikit-learn).
cm = confusion_matrix(y_test, pred)
(tn, fp), (fn, tp) = cm

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
f1 = (2 * ppv * sensitivity) / (ppv + sensitivity)
auroc = roc_auc_score(y_test, pos_prob)

# Scores are rounded to three decimals; the threshold is reported as-is.
scores = (
  ('sensitivity', sensitivity),
  ('specificity', specificity),
  ('ppv', ppv),
  ('npv', npv),
  ('f1', f1),
  ('auroc', auroc),
)
d = {name: np.round(value, 3) for name, value in scores}
d['threshold'] = threshold

# One-column summary table, displayed as the cell output.
metrics = pd.DataFrame(list(d.values()), index=list(d), columns=['Value'])
metrics

## Metrics

In [None]:
# Read the three objects pickled back-to-back in preds.pkl, in file order:
# targets, class-probability arrays, predictions
# (presumably one entry per training run — confirm against the script that wrote the file).
with open(args.workdir/'preds.pkl', 'rb') as f:
  targs, probs, preds = (pickle.load(f) for _ in range(3))

# Only column 1 (positive class) of each probability array is handed to the metrics helper.
pos_probs = [run_prob[:, 1] for run_prob in probs]
bam = BinaryAvgMetrics(targs, preds, pos_probs)
bam.get_avg_metrics(defn=True)

In [None]:
# Averaged metrics again, this time requested with conf=0.95
# (presumably a 95% confidence level — confirm in utils.metrics).
bam.get_avg_metrics(conf=0.95)

In [None]:
# Plot the averaged confusion matrix held by the metrics object, with named classes.
class_names = ['Delayed', 'Imminent']
fig, ax = plt.subplots(figsize=(11, 8))
plot_cm(ax, bam.cm_avg, class_names)

# Optionally write the figure to the figure directory.
if save:
  cm_path = args.figdir/'mean_cm.pdf'
  fig.savefig(cm_path, dpi=300)

In [None]:
# Plot the mean ROC from the stored targets and the full probability arrays.
fig, ax = plt.subplots(figsize=(10, 8))
plot_mean_roc(ax, bam.targs, probs)

# Optionally write the figure to the figure directory.
if save:
  roc_path = args.figdir/'mean_roc.pdf'
  fig.savefig(roc_path, dpi=300)