# Imports

In [None]:
import pandas as pd
import string
import numpy as np
import nltk
import matplotlib.pyplot as plt
import pickle
import os
from itertools import permutations
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional

# Data Pre-processing Functions

In [None]:
def flatten(lst):
    '''Recursively flatten an arbitrarily nested list into one flat list.

    Only ``list`` instances are descended into; tuples, strings and any other
    objects are kept as leaf items. The input is not modified and a new list
    is always returned.
    '''
    flat = []
    for item in lst:
        if isinstance(item, list):
            # extend in place: sum(..., []) would re-copy the accumulated
            # list on every concatenation, which is quadratic in the output size
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat

def pre_process(data):
    '''Return a cleaned copy of ``data``.

    The 'Data' column (raw report text) is turned into a list of sentences,
    each sentence being a list of lemmatized, lower-cased tokens:

    * the text is split into sentences with ``sent_tokenize`` and further
      split at '<br />' tags,
    * punctuation and digits are replaced by spaces and '-' is removed,
    * tokens of length 2 or less are dropped and the rest are lemmatized,
    * sentences left without tokens are removed.

    The 'Critical_Finding' column is label encoded: 'None' becomes 0 and both
    'Complete Critical Finding' and 'Complete Physician Decline' become 1;
    any other value is left unchanged.

    The input DataFrame is not modified; the cleaned copy is returned.
    '''
    lemmatizer = WordNetLemmatizer()

    # characters to replace in data
    to_replace_with_space = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~1234567890'
    to_replace_with_empty_string = '-'

    # Build the translation table once instead of once per sentence. The third
    # argument of str.maketrans lists the characters to delete; the two
    # character sets are disjoint, so one pass equals the two chained passes.
    char_table = str.maketrans(
        to_replace_with_space,
        ' ' * len(to_replace_with_space),
        to_replace_with_empty_string,
    )

    def clean_document(text):
        # sentence tokenization and split at br tags
        fragments = flatten([sentence.split('<br />') for sentence in sent_tokenize(text)])

        sentences = []
        for fragment in fragments:
            # convert to lowercase, replace punctuations and strip
            cleaned = fragment.lower().translate(char_table).strip()
            # word tokenize, remove small words and lemmatize
            tokens = [lemmatizer.lemmatize(w) for w in word_tokenize(cleaned) if len(w) > 2]
            # keep only sentences that still contain at least one token
            if tokens:
                sentences.append(tokens)
        return sentences

    # work on a copy so the caller's DataFrame is left untouched
    data = data.copy()
    data['Data'] = data['Data'].apply(clean_document)

    # Label Encode Critical_Finding column
    data['Critical_Finding'] = data['Critical_Finding'].replace(['None', 'Complete Critical Finding', 'Complete Physician Decline'], [0, 1, 1])

    return data
    

# Read Data

In [47]:
# Load the critical and non-critical report samples.
data_critical = pd.read_csv('./critical-findings-sample-data-20180601-20180901.csv')
data_non_critical = pd.read_csv('./non-critical-findings-sample-data-20180601-20180901.csv')

# Stack both samples and shuffle the rows. The shuffle is seeded so that the
# row order -- and therefore every train/test split derived from it, including
# the split a pickled model was trained on -- is the same on every run.
data = pd.concat([data_critical, data_non_critical], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [48]:
# Clean the report text and label-encode the target; `data` is rebound to the
# DataFrame returned by pre_process.
data = pre_process(data)
# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,Modality,Critical_Finding,Category,Data
0,CR,1,Other,"[[chest, view], [indication, picc, line, place..."
1,CT,1,Acute Vascular Event,"[[study, brain, without, contrast], [reason, f..."
2,US,0,,"[[study, renal, ultrasound, complete], [reason..."
3,CTA,1,Acute Vascular Event,"[[examination, cta, head, with, contrast], [in..."
4,CR,0,,"[[study, xray, chest], [reason, for, exam, fem..."


# Convert to Embedding Functions

In [315]:
def to_glove(cleaned_data):
    '''Convert cleaned data to summed GloVe document embeddings.

    Every document in ``cleaned_data['Data']`` (a nested list of tokens) is
    represented by the sum of the 300-dimensional vectors of its tokens.
    Tokens missing from the embedding dictionary contribute a zero vector, and
    a document without any token yields an all-zero vector.

    Returns a DataFrame with columns 'dim_0' ... 'dim_299' that shares the
    index of ``cleaned_data``. ``cleaned_data`` itself is not modified.
    '''
    dim = 300

    # NOTE: unpickling can execute arbitrary code; only load an embeddings
    # file that comes from a trusted source.
    with open('GloVe_Embeddings_Dict.pickle', 'rb') as handle:
        glove_embeddings = pickle.load(handle)

    oov_vector = np.zeros(dim)

    def embed(sentences):
        # Start from a zero vector (not the int 0 that sum() starts from) so
        # that an empty document still produces a vector of length `dim`.
        total = np.zeros(dim)
        for word in flatten(sentences):
            total += glove_embeddings[word] if word in glove_embeddings else oov_vector
        return total

    # Build the whole feature matrix at once instead of adding the columns one
    # by one with a separate pass over the data for each dimension.
    matrix = np.array([embed(doc) for doc in cleaned_data['Data']]).reshape(len(cleaned_data), dim)
    dim_columns = ['dim_' + str(i) for i in range(dim)]
    return pd.DataFrame(matrix, index=cleaned_data.index, columns=dim_columns)


In [313]:
def to_pubmed(cleaned_data):
    '''Convert cleaned data to summed PubMed document embeddings.

    Every document in ``cleaned_data['Data']`` (a nested list of tokens) is
    represented by the sum of the 200-dimensional vectors of its tokens.
    Tokens missing from the embedding dictionary contribute a zero vector, and
    a document without any token yields an all-zero vector.

    Returns a DataFrame with columns 'dim_0' ... 'dim_199' that shares the
    index of ``cleaned_data``. ``cleaned_data`` itself is not modified.
    '''
    dim = 200

    # NOTE: unpickling can execute arbitrary code; only load an embeddings
    # file that comes from a trusted source.
    with open('PubMed_Embeddings_Dict.pickle', 'rb') as handle:
        pubmed_embeddings = pickle.load(handle)

    oov_vector = np.zeros(dim)

    def embed(sentences):
        # Start from a zero vector (not the int 0 that sum() starts from) so
        # that an empty document still produces a vector of length `dim`.
        total = np.zeros(dim)
        for word in flatten(sentences):
            total += pubmed_embeddings[word] if word in pubmed_embeddings else oov_vector
        return total

    # Build the whole feature matrix at once instead of adding the columns one
    # by one with a separate pass over the data for each dimension.
    matrix = np.array([embed(doc) for doc in cleaned_data['Data']]).reshape(len(cleaned_data), dim)
    dim_columns = ['dim_' + str(i) for i in range(dim)]
    return pd.DataFrame(matrix, index=cleaned_data.index, columns=dim_columns)


In [337]:
def to_bioasq(cleaned_data):
    '''Convert cleaned data to summed BioASQ document embeddings.

    Every document in ``cleaned_data['Data']`` (a nested list of tokens) is
    represented by the sum of the 200-dimensional vectors of its tokens.
    Tokens missing from the embedding dictionary contribute a zero vector, and
    a document without any token yields an all-zero vector.

    Returns a DataFrame with columns 'dim_0' ... 'dim_199' that shares the
    index of ``cleaned_data``. ``cleaned_data`` itself is not modified.
    '''
    dim = 200

    # NOTE: unpickling can execute arbitrary code; only load an embeddings
    # file that comes from a trusted source.
    with open('BioASQ_Embeddings_Dict.pickle', 'rb') as handle:
        bioasq_embeddings = pickle.load(handle)

    oov_vector = np.zeros(dim)

    def embed(sentences):
        # Start from a zero vector (not the int 0 that sum() starts from) so
        # that an empty document still produces a vector of length `dim`.
        total = np.zeros(dim)
        for word in flatten(sentences):
            total += bioasq_embeddings[word] if word in bioasq_embeddings else oov_vector
        return total

    # Build the whole feature matrix at once instead of adding the columns one
    # by one with a separate pass over the data for each dimension.
    matrix = np.array([embed(doc) for doc in cleaned_data['Data']]).reshape(len(cleaned_data), dim)
    dim_columns = ['dim_' + str(i) for i in range(dim)]
    return pd.DataFrame(matrix, index=cleaned_data.index, columns=dim_columns)


In [286]:
def to_tfidf(cleaned_data, gram):
    '''Vectorize the cleaned documents with TF-IDF.

    Each document's nested token list is flattened and joined with spaces
    before being handed to a ``TfidfVectorizer`` whose n-gram range is
    ``(1, gram)``. The vectorizer is fitted on all rows of ``cleaned_data``
    and the resulting document-term matrix is returned.
    '''
    documents = cleaned_data['Data'].apply(lambda sentences: ' '.join(flatten(sentences)))
    vectorizer = TfidfVectorizer(ngram_range=(1, gram))
    return vectorizer.fit_transform(documents)

In [42]:
def to_bow(cleaned_data):
    '''Vectorize the cleaned documents as bag-of-words counts.

    Each document's nested token list is flattened and joined with spaces
    before being handed to a default ``CountVectorizer``. The vectorizer is
    fitted on all rows of ``cleaned_data`` and the resulting document-term
    matrix is returned.
    '''
    documents = cleaned_data['Data'].apply(lambda sentences: ' '.join(flatten(sentences)))
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(documents)

## Initialize embeddings dictionary and models dictionary

In [338]:
# Feature matrices keyed by embedding name. Every entry is computed eagerly
# from the cleaned `data` frame when this cell runs; run_sklearn and
# get_data_for_keras look the matrices up by these keys.
embeddings = {
    "BOW": to_bow(data),  # bag-of-words counts
    "TFIDF-1Gram": to_tfidf(data, 1),  # TF-IDF over unigrams
    "TFIDF-2Gram": to_tfidf(data, 2),  # TF-IDF over 1- and 2-grams
    "TFIDF-3Gram": to_tfidf(data, 3),  # TF-IDF over 1- to 3-grams
    "GloVe": to_glove(data),  # 300 'dim_*' columns
    "PubMed": to_pubmed(data),  # 200 'dim_*' columns
    "BioASQ": to_bioasq(data)  # 200 'dim_*' columns
}

In [136]:
# Scikit-learn estimators keyed by the name used in result tables and pickle
# file names. run_sklearn fits these instances in place, so an entry holds
# the most recently trained model for that estimator.
sklearn_models = {
    "LogisticRegression": LogisticRegression(solver='lbfgs', max_iter=1000),
    "MLPClassifier": MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(1000, 300, 10), random_state=1),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "ExtraTreeClassifier": ExtraTreeClassifier(),
    "SVC": SVC(kernel='linear'),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=2),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier(n_estimators=100),
    # The loss is left at its default (binomial deviance / log-loss): the
    # spelling "deviance" was removed in scikit-learn 1.3, while the default
    # selects the same loss on both older and newer releases.
    "GradientBoostingClassifier": GradientBoostingClassifier(n_estimators=100),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
}

## Generic function to run Sklearn Models with Embedding

In [287]:
def run_sklearn(model, embedding, pickle_model=False, use_pickle=False):
    '''Train (or load) a scikit-learn model on one embedding and score it.

    model: key into ``sklearn_models``.
    embedding: key into ``embeddings``.
    pickle_model: when a model is trained, also save it to
        '<model>_<embedding>.pickle'.
    use_pickle: when that pickle file exists, load it instead of training.

    The features are split 80/20 with a fixed ``random_state``. Returns the
    accuracy on the test split as a percentage.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings[embedding], data['Critical_Finding'], test_size=0.2, random_state=4)
    pickle_path = model + '_' + embedding + '.pickle'

    if use_pickle and os.path.exists(pickle_path):
        # NOTE(review): a cached model is scored on the current split; if the
        # rows of `data` were ordered differently when it was trained, some
        # test rows may have been seen during training -- verify before
        # comparing cached and fresh accuracies.
        with open(pickle_path, 'rb') as handle:
            classifier = pickle.load(handle)
    else:
        # the shared estimator instance is fitted in place
        classifier = sklearn_models[model]
        classifier.fit(X_train, y_train)
        if pickle_model:
            with open(pickle_path, 'wb') as handle:
                pickle.dump(classifier, handle)

    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred) * 100

# Keras Deep Learning Models

## Model Generator Functions

In [161]:
def generate_dense_model(train_data_shape):
    '''Build and compile a fully connected binary classifier.

    train_data_shape: shape of the 3-D training array; only its last entry
        (index 2) is used, as the number of input features.

    The network stacks Dense layers of 1000, 600, 200 and 30 ReLU units and a
    single sigmoid output unit, and is compiled with the Adam optimizer,
    binary cross-entropy loss and the accuracy metric.
    '''
    n_features = train_data_shape[2]

    layers = [
        Dense(1000, activation='relu', input_shape=(n_features,)),
        Dense(600, activation='relu'),
        Dense(200, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [192]:
def generate_lstm_model(train_data_shape):
    '''Build and compile an LSTM binary classifier.

    train_data_shape: shape of the 3-D training array; only its last entry
        (index 2) is used, as the number of features per time step. The
        number of time steps is left unspecified.

    The network is a 100-unit LSTM followed by Dense layers of 80 and 40 ReLU
    units and a single sigmoid output unit, compiled with the Adam optimizer,
    the poisson loss and the accuracy metric.
    '''
    n_features = train_data_shape[2]

    layers = [
        LSTM(100, input_shape=(None, n_features)),
        Dense(80, activation='relu'),
        Dense(40, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)

    # NOTE(review): generate_dense_model uses 'binary_crossentropy' for the
    # same sigmoid output; confirm that 'poisson' is intended here.
    network.compile(optimizer='adam', loss='poisson', metrics=['accuracy'])
    return network

In [234]:
def generate_bilstm_model(train_data_shape):
    '''Build and compile a bidirectional LSTM binary classifier.

    train_data_shape: shape of the 3-D training array; only its last entry
        (index 2) is used, as the number of features per time step. The
        number of time steps is left unspecified.

    The network is a bidirectional 100-unit LSTM followed by Dense layers of
    80 and 40 ReLU units and a single sigmoid output unit, compiled with the
    Adam optimizer, the poisson loss and the accuracy metric.
    '''
    # create model
    bilstm_model = Sequential()

    # add model layers
    # The input shape is declared on the Bidirectional wrapper, which is the
    # layer Sequential inspects. When it is passed to the wrapped LSTM the
    # model does not know its input shape and stays unbuilt until first fit.
    bilstm_model.add(Bidirectional(LSTM(100), input_shape=(None, train_data_shape[2])))
    bilstm_model.add(Dense(80, activation='relu'))
    bilstm_model.add(Dense(40, activation='relu'))
    bilstm_model.add(Dense(1, activation='sigmoid'))

    # compile model
    # NOTE(review): generate_dense_model uses 'binary_crossentropy' for the
    # same sigmoid output; confirm that 'poisson' is intended here.
    bilstm_model.compile(optimizer='adam', loss='poisson', metrics=['accuracy'])

    return bilstm_model

## Initialize Keras Models Dictionary

In [236]:
# Keras model factories keyed by the name used in result tables and pickle
# file names. Each value takes the shape of the 3-D training array and returns
# a freshly built, compiled model. The lambdas resolve the generator function
# by name when called, so re-defining a generator later takes effect without
# rebuilding this dictionary.
keras_models = {
    "DenseNet": lambda shape: generate_dense_model(shape),
    "LSTM": lambda shape: generate_lstm_model(shape),
    "BiLSTM": lambda shape: generate_bilstm_model(shape),
}

In [245]:
def get_data_for_keras(embedding):
    '''Return train/test arrays for the Keras models for one embedding.

    embedding: key into ``embeddings``.

    The features are split 70/30 with a fixed ``random_state`` (note that
    run_sklearn uses a different 80/20 split). Each feature matrix is
    converted to a dense array and reshaped to
    (n_samples, 1, n_features), i.e. every sample becomes a sequence of
    length one.

    Returns ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
    '''
    # local import: the notebook does not import scipy at the top level
    from scipy import sparse

    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings[embedding], data['Critical_Finding'], test_size=0.3, random_state=42)

    def as_sequences(features):
        # Vectorizer output is a scipy sparse matrix; the word-embedding
        # features are a pandas DataFrame. Detect sparsity with issparse
        # rather than by slicing the type's string representation.
        dense = features.toarray() if sparse.issparse(features) else features.values
        # Reshape for Keras Models
        return dense.reshape((dense.shape[0], 1, dense.shape[1]))

    return as_sequences(X_train), as_sequences(X_test), y_train.values, y_test.values

In [252]:
def run_keras(model, embedding, pickle_model=False, use_pickle=False):
    '''Train (or load) a Keras model on one embedding and score it.

    model: key into ``keras_models``.
    embedding: key into ``embeddings``.
    pickle_model: when a model is trained, also save it to
        '<model>_<embedding>.pickle'.
    use_pickle: when that pickle file exists, load it instead of training.

    Returns the accuracy on the test split as a percentage.
    '''
    X_train, X_test, y_train, y_test = get_data_for_keras(embedding)

    # The model generators read the feature count from the 3-D shape, so
    # capture it before any reshaping.
    input_shape = X_train.shape
    if model == "DenseNet":
        # the dense network consumes 2-D (n_samples, n_features) input
        X_train = X_train.reshape(X_train.shape[0], X_train.shape[2])
        X_test = X_test.reshape(X_test.shape[0], X_test.shape[2])

    pickle_path = model + '_' + embedding + '.pickle'
    freshly_trained = False

    if use_pickle and os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as handle:
            model_obj = pickle.load(handle)
    else:
        model_obj = keras_models[model](input_shape)
        model_obj.fit(X_train, y_train, validation_split=0.3, epochs=300, batch_size=1000, verbose=0)
        freshly_trained = True

    # evaluate returns [loss, accuracy] because the models are compiled with
    # metrics=['accuracy']
    accuracy = model_obj.evaluate(X_test, y_test)[1] * 100

    if freshly_trained and pickle_model:
        # NOTE(review): pickling Keras models may not work on every Keras
        # version; model.save() is the usual alternative -- TODO confirm.
        with open(pickle_path, 'wb') as handle:
            pickle.dump(model_obj, handle)

    return accuracy

In [328]:
def run_all(skip_old_models=False):
    '''Run every sklearn and Keras model against every embedding.

    skip_old_models: when True, a (model, embedding) pair whose
        '<model>_<embedding>.pickle' file already exists is skipped and gets
        no row in the result.

    Pairs are processed grouped by embedding name, sklearn models first and
    Keras models afterwards. Trained models are pickled and existing pickles
    are reused. Returns a DataFrame with the columns 'Model', 'Embedding'
    and 'Accuracy'.
    '''
    def pairs_by_embedding(model_names):
        # enumerate model-major, then stable-sort so pairs are grouped by
        # embedding name while keeping the model order inside each group
        pairs = [(name, emb) for name in model_names for emb in embeddings]
        pairs.sort(key=lambda pair: pair[1])
        return pairs

    sklearn_jobs = pairs_by_embedding(sklearn_models)
    keras_jobs = pairs_by_embedding(keras_models)

    results_df = pd.DataFrame(columns=['Model', 'Embedding', 'Accuracy'])
    # row label of the last sklearn result written; Keras rows continue after it
    last_sklearn_row = 0

    for row, (model_name, embedding_name) in enumerate(sklearn_jobs):
        print('Model: {}\tEmbedding: {}'.format(model_name, embedding_name))
        if skip_old_models and os.path.exists(model_name + '_' + embedding_name + '.pickle'):
            print('Skipping.....\n')
        else:
            score = run_sklearn(model_name, embedding_name, pickle_model=True, use_pickle=True)
            results_df.loc[row] = [model_name, embedding_name, score]
            last_sklearn_row = row

    for row, (model_name, embedding_name) in enumerate(keras_jobs, start=last_sklearn_row + 1):
        print('Model: {}\tEmbedding: {}'.format(model_name, embedding_name))
        if skip_old_models and os.path.exists(model_name + '_' + embedding_name + '.pickle'):
            print('Skipping.....\n')
        else:
            score = run_keras(model_name, embedding_name, pickle_model=True, use_pickle=True)
            results_df.loc[row] = [model_name, embedding_name, score]

    return results_df

In [341]:
# Evaluate every model/embedding pair (existing model pickles are reused) and
# keep the accuracy table.
results_df = run_all()

Model: LogisticRegression	Embedding: BOW
Model: MLPClassifier	Embedding: BOW
Model: DecisionTreeClassifier	Embedding: BOW
Model: ExtraTreeClassifier	Embedding: BOW
Model: SVC	Embedding: BOW
Model: KNeighborsClassifier	Embedding: BOW
Model: AdaBoostClassifier	Embedding: BOW
Model: BaggingClassifier	Embedding: BOW
Model: ExtraTreesClassifier	Embedding: BOW
Model: GradientBoostingClassifier	Embedding: BOW
Model: RandomForestClassifier	Embedding: BOW
Model: LogisticRegression	Embedding: BioASQ
Model: MLPClassifier	Embedding: BioASQ
Model: DecisionTreeClassifier	Embedding: BioASQ
Model: ExtraTreeClassifier	Embedding: BioASQ
Model: SVC	Embedding: BioASQ
Model: KNeighborsClassifier	Embedding: BioASQ
Model: AdaBoostClassifier	Embedding: BioASQ
Model: BaggingClassifier	Embedding: BioASQ
Model: ExtraTreesClassifier	Embedding: BioASQ
Model: GradientBoostingClassifier	Embedding: BioASQ
Model: RandomForestClassifier	Embedding: BioASQ
Model: LogisticRegression	Embedding: GloVe
Model: MLPClassifier	E

In [343]:
# Persist the accuracy table; the context manager makes sure the file is
# flushed and closed once the dump is done.
with open('results.pickle', 'wb') as results_file:
    pickle.dump(results_df, results_file)

In [350]:
# Temporarily lift pandas' row/column display limits so the whole accuracy
# table is printed instead of a truncated view.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results_df)

                         Model    Embedding   Accuracy
0           LogisticRegression          BOW  95.000000
1                MLPClassifier          BOW  96.000000
2       DecisionTreeClassifier          BOW  92.000000
3          ExtraTreeClassifier          BOW  90.500000
4                          SVC          BOW  94.000000
5         KNeighborsClassifier          BOW  86.500000
6           AdaBoostClassifier          BOW  87.000000
7            BaggingClassifier          BOW  92.500000
8         ExtraTreesClassifier          BOW  97.000000
9   GradientBoostingClassifier          BOW  92.000000
10      RandomForestClassifier          BOW  97.000000
11          LogisticRegression       BioASQ  93.000000
12               MLPClassifier       BioASQ  90.500000
13      DecisionTreeClassifier       BioASQ  87.000000
14         ExtraTreeClassifier       BioASQ  91.000000
15                         SVC       BioASQ  93.000000
16        KNeighborsClassifier       BioASQ  88.000000
17        