# Machine Learning

## Import Libraries

In [27]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import color maps
import matplotlib.cm as cm
# import tokenizer
#from nltk import word_tokenize
import string
# import model related libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, _forest
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, fbeta_score, classification_report, silhouette_samples, silhouette_score
from sklearn.model_selection import learning_curve
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# import a function that convert items into a callable object
from operator import itemgetter
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

pd.options.display.max_colwidth = 200
%matplotlib inline
%run functions.ipynb # import my functions from functions notebook

Stored 'my_stop_words' (list)
sucessfully ran function nobtebook!


## Load Data

In [28]:
# read in doctor's discharge notes from NOTEEVENTS.csv
df_notes = pd.read_csv("../data/mimic-iii-clinical-database-1.4/NOTEEVENTS.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
# load master dataframe
master = pd.read_csv('../data/master.csv')
master.drop('Unnamed: 0',axis=1,inplace=True)
master.columns

Index(['SUBJECT_ID', 'HADM_ID', 'READMISSION_30DAYS', 'DISCHARGE_LOCATION',
       'INSURANCE', 'MARITAL_STATUS', 'GENDER', 'AGE', 'ETHNICITY_GRP',
       'CURR_SERVICE', 'TEXT_CL', 'AGE_boxcox_lambda_opt', 'NUM_PRESCRIPTION',
       'LOS', 'HLOS_CL', 'LOS_RATIO', 'KD', 'HP', 'PUL', 'UT', 'HIV', 'DB',
       'MBD', 'TB', 'GA', 'HM', 'HEP', 'HO', 'FR', 'TX', 'LA', 'AF', 'CB',
       'PNE', 'HF', 'SP', 'WMCC', 'WCC', 'WOCCMCC', 'WOMCC', 'WCCMCC',
       'DRG_SEVERITY', 'DRG_MORTALITY', 'TEXT', 'NUM_PRESCRIPTION_LOG',
       'LOS_LOG', 'LOS_boxcox_lambda_opt', 'HLOS_CL_LOG',
       'HLOS_CL_boxcox_lambda_opt', 'LOS_RATIO_LOG',
       'LOS_RATIO_boxcox_lambda_opt'],
      dtype='object')

Since this project is not only NLP focused, I will only adopt simple NLP techniques that are appropriate for this project.

In [45]:
X_train, X_test, y_train, y_test = train_test_split(master['TEXT_CL'],master.READMISSION_30DAYS,
                                        test_size=0.3,train_size=0.7, random_state = 0)

In [35]:
from sklearn.pipeline import Pipeline
# Pipeline & Gridsearch setup: TFIDF pipeline setup
tvc_pipe = Pipeline([
 ('tvec', TfidfVectorizer()),
 ('mb', MultinomialNB())
])

# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
    'tvec__min_df':[5, 10],
    'tvec__max_features':[1000, 3000],
    'tvec__stop_words': [None, my_stop_words],
    'mb__alpha': np.linspace(0.5, 1.5, 6),
    'mb__fit_prior': [True, False], 
}

# Setting up GridSearch for TFIDFVectorizer
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 3, verbose =1, n_jobs = -1)

# Fitting TVC GS
tvc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 72.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [36]:
print(tvc_gs.score(x_train, y_train))
print(tvc_gs.score(x_test, y_test))

0.7313843508168529
0.7308463698355395


In [37]:
tvc_gs.best_params_

{'mb__alpha': 0.5,
 'mb__fit_prior': True,
 'tvec__max_features': 1000,
 'tvec__min_df': 5,
 'tvec__stop_words': None}

In [52]:
# TfidfVectorizer is used instead of CountVectorizer because the former performs better with the return frequency ratio instead of count only
# set min_df to 5 to avoid using words that only appeared once or twice
vect = TfidfVectorizer(min_df=5, max_features = 3000, lowercase=True, tokenizer=tokenizer_better,
                       stop_words=my_stop_words, use_idf=True)

In [53]:
# extract word features from TEXT
feature_words = vect.fit_transform(master['TEXT_CL'].values)

In [54]:
feature_words.shape

(8308, 3000)

## Term frequency Analysis

In [None]:
# create sparse matrix containing terms and document frequencies that are predictive of the positive and negative class
neg_doc_matrix = vect.transform(
    master[master['READMISSION_30DAYS'] == 0].TEXT_CL)
pos_doc_matrix = vect.transform(
    master[master['READMISSION_30DAYS'] == 1].TEXT_CL)
neg_tf = np.sum(neg_doc_matrix, axis=0)
pos_tf = np.sum(pos_doc_matrix, axis=0)
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))
# combine the spare matrices into a dataframe to derive the total doc_frequencies for each term
term_freq_df = pd.DataFrame(
    [neg, pos], columns=vect.get_feature_names()).transpose()
term_freq_df.columns = ['negative', 'positive']
term_freq_df['total'] = term_freq_df['negative'] + term_freq_df['positive']
term_freq_df.sort_values(by='total', ascending=False).iloc[:10]

In [None]:
# create a series from the sparse matrix
d = pd.Series(term_freq_df.total,
              index=term_freq_df.index).sort_values(ascending=False)
# plot term frequency for first 50 words
ax = d[:50].plot(kind='bar', figsize=(10, 6), width=.8,
                 fontsize=14, rot=90, color='b')
ax.title.set_size(18)
plt.ylabel('count')
plt.show()
# plot term frequency for 50th to 100th words
ax = d[50:100].plot(kind='bar', figsize=(10, 6), width=.8,
                    fontsize=14, rot=90, color='g')
ax.title.set_size(18)
plt.ylabel('count')
plt.show()

## Building NLP Model

In [55]:
X_train_w, X_test_w, y_train, y_test = train_test_split(feature_words,master.READMISSION_30DAYS,
                                        test_size=0.3,train_size=0.7, random_state = 0)

In [56]:
# build and evaluate a Multinomial Naive Bayes model using only word features from discharge notes
model = MultinomialNB(alpha = 0.5, fit_prior = True)
clf = model.fit(X_train_w, y_train)

y_train_preds = clf.predict_proba(X_train_w)[:,1]
y_valid_preds = clf.predict_proba(X_test_w)[:,1]

auc_train = roc_auc_score(y_train, y_train_preds)
auc_valid = roc_auc_score(y_test, y_valid_preds)

print ("AUC on training data: ","{0:.3%}".format(auc_train))
print ("AUC on test data: ","{0:.3%}".format(auc_valid))

AUC on training data:  69.554%
AUC on test data:  63.707%


## What are the strongly predictive features?

Below is a neat trick to identify strongly predictive features (i.e. words). 

* first, create a data set such that each row has exactly one feature. This is represented by the identity matrix.
* use the trained classifier to make predictions on this matrix
* sort the rows by predicted probabilities, and pick the top and bottom $K$ rows

In [57]:
# plot the top 10 words that are most predictive of the positive and negative class
words = np.array(vect.get_feature_names())

x = np.eye(X_test_w.shape[1])
probs = clf.predict_proba(x)[:, 1]
ind = np.argsort(probs) # returned index of probs sorted in ascending order

wk_words = words[ind[:10]]
strg_words = words[ind[-10:]]

wk_prob = probs[ind[:10]]
strg_prob = probs[ind[-10:]]

zipped = zip(strg_words, strg_prob)
strg_zipped = sorted(zipped, key = lambda x: x[1], reverse=True)

print("High probability readmission words\tP(readmit<30 | word)")
for w, p in strg_zipped:
    print("{:>20}".format(w), "{:.4f}".format(p))
    
#zipped = zip(wk_words, wk_prob)
#wk_zipped = sorted(zipped, key = lambda x: x[1], reverse=True)
    
print("Low probability readmission words\tP(readmit<30 | word)")
for w, p in zip(wk_words, wk_prob):
    print("{:>20}".format(w), "{:.4f}".format(p))

High probability readmission words	P(readmit<30 | word)
            subdural 0.5704
                 sdh 0.5614
          herniation 0.4865
        neurosurgery 0.4390
             lifting 0.4362
                 peg 0.4314
                  br 0.4312
               trach 0.4302
           occipital 0.4241
        transitional 0.4236
Low probability readmission words	P(readmit<30 | word)
             suicide 0.1262
               crohn 0.1321
         nephrostomy 0.1421
         phosphatase 0.1485
            alkaline 0.1515
                ptca 0.1562
     catheterization 0.1569
             norvasc 0.1655
           verapamil 0.1689
             flovent 0.1699
