In [1]:
# Standard python helper libraries.
import os, sys, re, json, time, wget, csv, string, time, random
import itertools, collections
from importlib import reload
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
utils.require_package("wget")      # for fetching dataset

from keras.models import Sequential
from keras.layers import GaussianNoise, LSTM, Bidirectional, Dropout, Dense, Embedding, MaxPool1D, GlobalMaxPool1D, Conv1D
from keras.optimizers import Adam

from pymagnitude import *

[nltk_data] Downloading package punkt to /home/renzeer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/renzeer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


# INTRODUCTION
## Phenotype Classification of Electronic Health Records

Electronic Health Record (EHR) data is a rapidly growing source of unstructured biomedical data. This data is extremely rich, often capturing a patient’s phenotype. In a clinical context, phenotype refers to the specific medical condition or disease of a patient. These records captures this data in higher detail compared to structured encodings such as the International Classification of Diseases (ICD) or National Drug Codes (NDC). Traditional methods for extracting phenotypes from this data typically relies on manual review or processing the data through rule-based expert systems. Both approaches are time intensive, rely heavily on human expertise, and scale poorly with increased volume. This project proposes an automated approach to identifying phenotypes in EHR data through word vector clustering and machine learning. An automated approach would greatly reduce time and operation costs, with the potential of even outperforming industry standards. 

The data for this project is provided by nlplab, who have induced a biomedical corpus using word2vec. This corpus contains over 5 billion words pulled from biomedical scientific literature and Wikipedia.


# DATA EXPLORATION
## Word Embedding

The foundation of this project is based on word embedding models, an approach that converts words into number vectors based on co-occurence. These vectors help capture word meanings and context in a format suitable for machine learning. 

Typically these vectors are trained on extremely large corpora, which can take a lot of time and resources. Thankfully, the word embedding space is quite mature and there exists pre-trained models, ready to use out of the box. One such model is Standford's GloVe vectors, which is trained on a corpus of 6B tokens from Wikipedia and Gigaword. These vectors are available at https://nlp.stanford.edu/projects/glove/. We will go through some exercises to explore word vectors.


In [2]:
import glove_helper; reload(glove_helper)
hands = glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [9]:
def find_nn_cos(v, Wv, k=10):
    """Find nearest neighbors of a given word, by cosine similarity.
    
    Returns two parallel lists: indices of nearest neighbors, and 
    their cosine similarities. Both lists are in descending order, 
    and inclusive: so nns[0] should be the index of the input word, 
    nns[1] should be the index of the first nearest neighbor, and so on.

    Args:
      v: (d-dimensional vector) word vector of interest
      Wv: (V x d matrix) word embeddings
      k: (int) number of neighbors to return
    
    Returns (nns, ds), where:
      nns: (k-dimensional vector of int), row indices of nearest neighbors, 
        which may include the given word.
      similarities: (k-dimensional vector of float), cosine similarity of each 
        neighbor in nns.
    """

    v_norm = np.linalg.norm(v)
    Wv_norm = np.linalg.norm(Wv, axis=1)

    dot = np.dot(v, Wv.T)

    cos_sim = dot / (v_norm * Wv_norm)

    nns = np.flipud(np.argsort(cos_sim)[-k:])
    ds = np.flipud(np.sort(cos_sim)[-k:])
    
    return [nns, ds]


def show_nns(hands, word, k=10):
    """Helper function to print neighbors of a given word."""
    word = word.lower()
    print("Nearest neighbors for '{:s}'".format(word))
    v = hands.get_vector(word)
    for i, sim in zip(*find_nn_cos(v, hands.W, k)):
        target_word = hands.vocab.id_to_word[i]
        print("{:.03f} : '{:s}'".format(sim, target_word))
    print("")

In [10]:
show_nns(hands, "diabetes")
show_nns(hands, "cancer")
show_nns(hands, "depression")

Nearest neighbors for 'diabetes'
1.000 : 'diabetes'
0.848 : 'hypertension'
0.799 : 'obesity'
0.780 : 'arthritis'
0.779 : 'cancer'
0.774 : 'alzheimer'
0.765 : 'asthma'
0.756 : 'cardiovascular'
0.733 : 'disease'
0.730 : 'epilepsy'

Nearest neighbors for 'cancer'
1.000 : 'cancer'
0.821 : 'breast'
0.807 : 'prostate'
0.785 : 'disease'
0.779 : 'diabetes'
0.766 : 'cancers'
0.751 : 'patients'
0.749 : 'leukemia'
0.744 : 'alzheimer'
0.732 : 'lung'

Nearest neighbors for 'depression'
1.000 : 'depression'
0.706 : 'illness'
0.690 : 'anxiety'
0.679 : 'severe'
0.672 : 'onset'
0.670 : 'schizophrenia'
0.668 : 'disorder'
0.666 : 'alcoholism'
0.643 : 'psychosis'
0.641 : 'mental'



The results we see make sense and showcase the capability of word embeddings. However, we do run into a few issues. For one, 
loading the file into our workspace requires careful memory management. This can become a problem when dealing with larger models or when we want to tweak our models and reload the data. Another issue is that we have to build our own help functions for performing calculations on the word vectors. Not inherently an issue, but these calculations are fairly standard and it is always a good idea to work smarter, not harder.

As an alternative, we can look at third-party packages that offer fast and simple support for word vector operations. The package we will use for this project is Magnitude (https://github.com/plasticityai/magnitude). This package offers "lazy-loading for faster cold starts in development, LRU memory caching for performance in production, multiple key queries, direct featurization to the inputs for a neural network, performant similiarity calculations, and other nice to have features for edge cases like handling out-of-vocabulary keys or misspelled keys and concatenating multiple vector models together." These are all great features that we can leverage for this project.

## Working with Word Vectors - Magnitude

Going through a few simple comparisons and exercises, we can see the difference between working with the raw text file versus working with the magnitude file:
  - The zip file is ~4 times larger than the magnitude file. This is even more impressive consdering the text file still needs to be unpackaged.  
  - Load times are extremely quick for the magnitude file, far outperforming the standard file.  
  - Querying from the standard file outperforms the magnitude file, but querying from the magnitude file is simpler and offers additional functionality.  
  
While the increased query times is not ideal, especially when it comes to training, the portability and the increased functionality just makes life so much easier.

In [25]:
print('Standard Text File:')
print('\tFile Size: ', os.stat('data/glove/glove.6B.zip').st_size)

start = time.time()
glove_vectors_txt = glove_helper.Hands(ndim=100, quiet=True)
end = time.time()
print('\tFile Load Time: ', end - start)

start = time.time()
glove_vectors_txt.get_vector('diabetes')
glove_vectors_txt.get_vector('cancer')
glove_vectors_txt.get_vector('hypertension')
end = time.time()
print('\tQuery Time: ', end - start)

print('\tHandling out-of-vocabulary words:')
try:
    print('\t\t', glove_vectors_txt.get_vector('wordnotfoundinvocab'))
except AssertionError:
    print('\t\tWord not found in vocabulary')

print('\nMagnitude File:')
print('\tFile Size: ', os.stat('data/glove-lemmatized.6B.100d.magnitude').st_size)

start = time.time()
glove_vectors_mag = Magnitude("data/glove-lemmatized.6B.100d.magnitude")
end = time.time()
print('\tFile Load Time: ', end - start)

start = time.time()
glove_vectors_mag.query("diabetes")
glove_vectors_mag.query("cancer")
glove_vectors_mag.query("hypertension")
end = time.time()
print('\tQuery Time: ', end - start)

print('\tHandling out-of-vocabulary words:')
try:
    print('\t\t', glove_vectors_mag.query('wordnotfoundinvocab'))
except AssertionError:
    print('\t\tWord not found in vocabulary')


Standard Text File:
	File Size:  862182613
	File Load Time:  18.083816289901733
	Query Time:  0.00012183189392089844
	Handling out-of-vocabulary words:
		Word not found in vocabulary

Magnitude File:
	File Size:  266366976
	File Load Time:  0.002287149429321289
	Query Time:  0.006619930267333984
	Handling out-of-vocabulary words:
		 [-0.04397694  0.08708267  0.05870734 -0.04722567 -0.03879925  0.21312321
  0.02859145 -0.03979973 -0.02670808  0.02556176 -0.07791763  0.0055145
 -0.03020298  0.06430179 -0.00551911  0.16249717 -0.06189246 -0.12206172
 -0.02767706 -0.05265569  0.13255737  0.02846519  0.0451067   0.11242716
  0.01290785 -0.04876954 -0.04612697 -0.03764525 -0.00251381  0.11269477
  0.11309229  0.09421328 -0.13763386 -0.02501031  0.01126506  0.06448203
  0.06115726 -0.12342421  0.02004041 -0.0443186  -0.02901474 -0.01431345
  0.05068584 -0.02549015 -0.08328359 -0.07138098  0.0835982  -0.03470181
 -0.00475797 -0.07226969  0.20147627 -0.02546141  0.16691468  0.15587942
 -0.10204

## Corpus Selection - Biomedical Text

-- Talk about importance of base corpora  
-- Reference paper that compares medical coprora to general corpora  
-- Show case actual examples by showing NN of GloVe vs medical  

With a framework that allows more freedom in corpus selection, we can move into much more larger word embedding models. The GloVe model we have been previously working with is actually on the smaller side. Of course, a larger corpus offers more data to train on, thus better capturing word contexts and meanings. However, another determininig factor in corpus selection is the source of the text. In general, these pre-trained models are based on general topic sources such as Wikipedia and Gigaword. However, since we know the domain we are working in, it may make sense to pull from relevant text sources. 

A Comparison of Word Embeddings for the Biomedical Natural Language Processing (https://arxiv.org/pdf/1802.00400.pdf) explores this idea. The paper concluded that "word embeddings trained on EHR and MedLit can capture the semantics of medical terms better and find semantically relevant medical terms closer to human experts’ judgments than those trained on GloVe and Google News." 

We can test these results ourselves by comparing GloVe against a biomedical based word embedding that was trained on text from PubMed and PubMed Central.

In [30]:
print('GloVe length: ', len(glove_vectors_mag))
print('GloVe dimensions: ', glove_vectors_mag.dim)

print('\nNearest Neighbor examples:')
print('10 NN for diabetes:\n', glove_vectors_mag.most_similar("diabetes", topn = 10))
print('10 NN for cancer:\n', glove_vectors_mag.most_similar("cancer", topn = 10))
print('10 NN for hyperlipidemia:\n', glove_vectors_mag.most_similar("hyperlipidemia", topn = 10))
print('10 NN for e119:\n', glove_vectors_mag.most_similar("e119", topn = 10))

GloVe length:  336951
GloVe dimensions:  100

Nearest Neighbor examples:
10 NN for diabetes:
 [('diabetic', 0.7566893059521317), ('diabetis', 0.7465886369685321), ('obesity', 0.619293770727618), ('hypertension', 0.6162751523182464), ('cardiovascular', 0.5791516470463346), ('asthma', 0.5689611839459698), ('arthriti', 0.5554265183541941), ('mellitu', 0.5439654800171492), ('allergy', 0.5393456576654493), ('alzheimer', 0.5297674264739546)]
10 NN for cancer:
 [('breast', 0.8210739), ('prostate', 0.8065967), ('disease', 0.78536785), ('diabetis', 0.7788438), ('patient', 0.75117147), ('leukemia', 0.7485109), ('alzheimer', 0.744444), ('lung', 0.73171055), ('diseasis', 0.729254), ('heart', 0.7241202)]
10 NN for hyperlipidemia:
 [('dyslipidemia', 0.6900931), ('hypercholesterolemia', 0.67991346), ('insulin-dependent', 0.6547221), ('insipidu', 0.61982), ('hyperglycemia', 0.6196113), ('vaginismu', 0.61709344), ('metformin', 0.6067523), ('pre-eclampsia', 0.60390294), ('prediabetis', 0.6029445), ('pol

In [3]:
med_vectors = Magnitude("data/wikipedia-pubmed-and-PMC-w2v.magnitude", pad_to_length=30)
print('Medical length: ', len(med_vectors))
print('Medical dimensions: ', med_vectors.dim)

# print('\nNearest Neighbor examples:')
# print('10 NN for diabetes:\n', med_vectors.most_similar("diabetes", topn = 10))
# print('10 NN for cancer:\n', med_vectors.most_similar("cancer", topn = 10))
# print('10 NN for hyperlipidemia:\n', med_vectors.most_similar("hyperlipidemia", topn = 10))
# print('10 NN for e119:\n', med_vectors.most_similar("e119", topn = 10))

Medical length:  5443656
Medical dimensions:  200


## Training Data - Labeled Electronic Health Record Text

-- Refer back to goal of project  
-- Talk about difficulty of getting medical data (HIPPA)  
-- Reference MTsamples as source of data  
-- Show raw data unprocessed  
-- Briefly talk about transformations  

The goal of this project is to classify Eletronic Health Record (EHR) text. This of course means that we need to get our hands on some EHR data. This can be particularly difficult due to the strict rules and guidelines around healthcare data. The Health Insurance Portability and Accountability Act of 1996, or HIPAA, outlines a set of rules that help protect the privacy of our health information. These rules are vital for building a healthcare system where we can trust our healthcare providers and caregivers, so it is important that we adhere to the standards set by HIPAA. 

For this project, we will be using a dataset provided by MTSamples.com. They provide ~5,000 transcribed medical reports covering 40 specialty types. All of the notes have been de-identified of protected health information, making them HIPAA compliant. Below we will explore a few rows of the raw data.

In [4]:
ehr_notes = []
with open('data/ehr_samples.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ehr_notes.append([row['Specialty'], row['Note']])
        
print('EHR Sentence Example:\n')
print(ehr_notes[0])
print(ehr_notes[1])

EHR Sentence Example:

['Bariatrics', 'PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor.  He exercises three times a week at home and does cardio.  He has difficulty walking two blocks or five flights of stairs.  Difficulty with snoring.  He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling.  He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago.  ,SOCIAL HISTORY:, He is currently single.  He has about ten drinks a year.  He had smoked significantly up until several months ago.  He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes.  Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:,  He is allergic to Penicillin.,M

## Text Processing - Pre-Processing the EHR Notes

-- Talk about how we need to manage our scope. Mention how ML on larger text scales  
-- For simplicity, going to limit ourselves to sentences. Possibly moving on to more text if we see promising results  
-- Talk about the standard set of NLTK functions  
-- Show a new sentence  

With the EHR data now loaded, we could technically start applying Machine Learning operations as is. However, as with a lot of text-based data, there are a few characteristics that are less than ideal for this project. The first obstacle is managing our text length. As our input text grows, so does the number of variables and the number of operations. Depending on our algorithm, these values can scale exponentially, causing runtime and resource usage to explode out of hand. To help manage the scope of our input text, we will be breaking up our notes into sentences. This should give us enough context to learn the more complex relationships between our words while minimizing runtime. Of course, if we find that runtime performance is not an issue, we can try further expanding our input text.

Another pre-processing step we can take is to apply basic natural language cleanup techniques that standardize the text and remove non-essential information. Thankfully, python has a package called the Natural Language Toolkit (NLTK) that provides a lot of these transformations as built-in functions. The operations we will use for this project are converting all text to lowercase, removing punctation, filtering out stop words, and removing blanks.

After all of the pre-processing, we can take a look at what the EHR notes now look like.

In [5]:
ehr_sentences = []
for record in ehr_notes:
    sent_text = nltk.sent_tokenize(record[1])
    for sent in sent_text:
        tokens = word_tokenize(sent)

        # convert to lower case
        tokens = [w.lower() for w in tokens]

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]

        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]

#         # stem words
#         porter = PorterStemmer()
#         tokens = [porter.stem(word) for word in tokens]

        # remove blanks
        tokens = [w for w in tokens if w != '']

        ehr_sentences.append([record[0], ' '.join(tokens)])

random.Random(4).shuffle(ehr_sentences)

In [6]:
print(ehr_sentences[:10])

specialties = ['Allergy', 'Autopsy', 'Bariatrics', 'Cardiovascular', 'Chart', 'Chiropractic', 'Consult'
               , 'Cosmetic', 'Dentistry', 'Dermatology', 'Diet', 'Discharge', 'Emergency', 'Endocrinology'
               , 'Gastroenterology', 'General', 'Gynecology', 'Hospice', 'IME', 'Letters', 'Nephrology', 'Neurology'
               , 'Neurosurgery', 'Office Notes', 'Oncology', 'Ophthalmology', 'Orthopedic', 'Otolaryngology'
               , 'Pain Management', 'Pathology', 'Pediatrics', 'Podiatry', 'Psychiatry', 'Radiology', 'Rehab'
               , 'Rheumatology', 'Sleep', 'Speech', 'Surgery', 'Urology']

[['Psychiatry', 'exhusband died 1980 acute pancreatitis secondary alcohol abuse'], ['Gynecology', 'patient taken post anesthesia care unit stable condition'], ['Consult', 'send pertussis pcr'], ['Discharge', 'admission diagnoses 1'], ['Surgery', 'time removed 12 mm broach proceeded implanting polyethylene liner within acetabulum'], ['General', 'peripheral vascular disease status post recent last week pta right lower extremity social history negative smoking drinking current home medications novolog 20 units meal lantus 30 units bedtime crestor 10 mg daily micardis 80 mg daily imdur 30 mg daily amlodipine 10 mg daily coreg 125 mg bid lasix 20 mg daily ecotrin 325 mg daily calcitriol 05 mcg daily review systems patient denies complaints states right hand left foot swollen painful came emergency room'], ['Surgery', 'estimated blood loss less 15 ml'], ['Surgery', 'base tumor fulgurated periphery normal mucosa surrounding base bladder tumor'], ['Cardiovascular', 'focal areas consolidation s

# METHODS AND APPROACHES
## Naive Nearest Neighbor

-- Talk about distance vs similarity  
-- Talk about fundametal co-occurence principle of word to vector  
-- How those vectors repsenet context or meaning  
-- If a sentence is more similar to our category, we can simply label it as such  
-- SHow some good examples but emphasize the bad examples  


The first method we will explore will be to just leverage the word embedding space with no Machine Learning at all. We mentioned earlier that the word vectors capture context and meaning. Additionally position of these vectors in relation to eachother also convey word relationships. At the core of it, vectors clustered together are more similar in context and meaning. Using this principle, we can use our categories as anchors in our word embedding, calculate a similarity score for a sentence, and identify which category is the nearest neighbor to our sentence. 

This is a very naive approach but it will be a good exercise and can at least set a baseline for performance. 

In [139]:
print('Similarity between diabetes and mellitus: ', med_vectors.similarity("diabetes", "mellitus"))
print('Similarity between diabetes and breast: ', med_vectors.similarity("diabetes", "breast"))

print('\nSimilarity between cancer and mellitus: ', med_vectors.similarity("cancer", "mellitus"))
print('Similarity between cancer and breast: ', med_vectors.similarity("cancer", "breast"))

Similarity between diabetes and mellitus:  0.80347604
Similarity between diabetes and breast:  0.26328182

Similarity between cancer and mellitus:  0.13384798
Similarity between cancer and breast:  0.7488326


In [140]:
nn_results = []
for i, ehr_sent in enumerate(ehr_sentences[0:2000]):
#     print(ehr_sent)
    
    most_similar_specialty = []
    
    for specialty in specialties:
        spec_similarity_sum = 0
        for token in ehr_sent[1].split(' '):
#             print('\t', token, med_vectors.similarity(specialty, token))
            
            spec_similarity_sum += med_vectors.similarity(specialty, token)
        
        spec_similarity = spec_similarity_sum / len(ehr_sent[1].split(' '))
        
#         print(specialty, spec_similarity)

        if not most_similar_specialty:
            most_similar_specialty = [i, ehr_sent[0], specialty, spec_similarity]
        elif spec_similarity > most_similar_specialty[3]:
            most_similar_specialty = [i, ehr_sent[0], specialty, spec_similarity]
        
    nn_results.append(most_similar_specialty)

    
correct_results = [result for result in nn_results if result[1] == result[2]]
print('# of Correct Classifications: ', len(correct_results))
print('Accuracy: ', len(correct_results) / len(nn_results))

# of Correct Classifications:  98
Accuracy:  0.049


In [109]:
print('Example of correct classification:')

correct_example = correct_results[0]
example_sentence = ehr_sentences[correct_example[0]]
print('\tSentence: ', example_sentence)

print('\n\tTrue category:', correct_example[1])
print('\tPredicted category:', correct_example[2])

print('\n\tTrue/Predicted Similarities:')
for token in example_sentence[1].split(' '):
    print('\t\t', token, med_vectors.similarity(correct_example[1], token))
    spec_similarity_sum += med_vectors.similarity(correct_example[1], token)

spec_similarity = spec_similarity_sum / len(example_sentence.split(' '))
print('\t\tAverage similarity: ', spec_similarity)


Example of correct classification:
	Sentence:  ['Orthopedic', 'exposed vertebral bodies c2c3 c4c5 bridged plate']

	True category: Orthopedic
	Predicted category: Orthopedic

	Similarities:
		 exposed -0.03660483
		 vertebral 0.266623
		 bodies 0.068785824
		 c2c3 0.031137193347711197
		 c4c5 0.09221873311276046
		 bridged 0.057441555
		 plate 0.039083302
		Average similarity:  0.1750587690063761


In [113]:
print('Example of incorrect classification:')

incorrect_example = nn_results[0]
example_sentence = ehr_sentences[incorrect_example[0]]
print('\tSentence: ', example_sentence)

print('\n\tTrue category:', incorrect_example[1])
print('\tPredicted category:', incorrect_example[2])

print('\n\tTrue Similarities:')
for token in example_sentence[1].split(' '):
    print('\t\t', token, med_vectors.similarity(incorrect_example[1], token))
    spec_similarity_sum += med_vectors.similarity(incorrect_example[1], token)

spec_similarity = spec_similarity_sum / len(example_sentence[1].split(' '))
print('\t\tAverage similarity: ', spec_similarity)

print('\n\tPredicted Similarities:')
for token in example_sentence[1].split(' '):
    print('\t\t', token, med_vectors.similarity(incorrect_example[2], token))
    spec_similarity_sum += med_vectors.similarity(incorrect_example[2], token)

spec_similarity = spec_similarity_sum / len(example_sentence[1].split(' '))
print('\t\tAverage similarity: ', spec_similarity)

Example of incorrect classification:
	Sentence:  ['Neurology', 'see velocity measurements left carotid eca measurement 0938 msecond']

	True category: Neurology
	Predicted category: Endocrinology

	True Similarities:
		 see 0.034260437
		 velocity 0.07255719
		 measurements 0.015290075
		 left 0.07127067
		 carotid 0.07629804
		 eca 0.094461195
		 measurement -0.020491015
		 0938 0.10253124
		 msecond 0.030581191
		Average similarity:  0.48662400427592156

	Predicted Similarities:
		 see -0.006658733
		 velocity 0.05569666
		 measurements 0.05710432
		 left 0.05892444
		 carotid 0.04782131
		 eca 0.15068905
		 measurement 0.05482881
		 0938 0.14380054
		 msecond 0.061377887
		Average similarity:  0.5559111470957501


So as we can see, the results are pretty terrible with an accuracy of 5%. Looking at an example the classifier got right, it relied on words that are exclusively and very distinctly related. However, these strong signals are not always present in our sentences. Looking at an incorrect example, we see how the signals are being drowned out or offset by the other words. This emphasizes the need for some type of model that can learn and weigh the words that provide strong signals for particular categories.

# METHODS AND APPROACHES
## Neural Networks

A neural network will allow us to build a model that can take in the word vectors as inputs and learn the complex relationships between those vectors to better classify the target sentence. This is a more holistic approach that tries to capture meaning from the entire sentence rather than token by token. 

## Defining our Training and Test Data

Before we can start building our neural networks, we first have to define our datasets. Specifically, we have to break up our EHR data so that we have records that we can train on and records that are exclusively used to test on. Maintaining a separate set for testing ensures we avoid overfitting our data.

We will use some built-in functions provided by Magnitude that helps encode our classes/categories. We then partition our data into our train and test sets. For each set we have both data and labels. Initially, we will be making these partitions small to make iterating through model development much quicker. However, once the models are developed, we will expand our datasets to include all of our data. To ensure we defined our data correctly, we can print a few lines from the two sets.

In [12]:
add_intent, intent_to_int, int_to_intent = MagnitudeUtils.class_encoding()

x_train = [ehr_sent[1].split(' ') for ehr_sent in ehr_sentences[:130000]]
x_test = [ehr_sent[1].split(' ') for ehr_sent in ehr_sentences[130001:]]

y_train = [add_intent(ehr_sent[0]) for ehr_sent in ehr_sentences[:130000]]
y_test = [add_intent(ehr_sent[0]) for ehr_sent in ehr_sentences[130001:]]

y_train = list(np.array(y_train).reshape(len(y_train)))
y_test = list(np.array(y_test).reshape(len(y_test)))

num_training = len(x_train)
num_test = len(x_test)
num_outputs = int(max(max(y_train), max(y_test))) + 1

print(int_to_intent(0))

print("First line of train/test data:")
print("\t", x_train[0])
print("\t", y_train[0], int_to_intent(y_train[0]))
print("\t", x_test[0])
print("\t", y_test[0], int_to_intent(y_test[0]))
print("Second line of train/test data:")
print("\t", x_train[1])
print("\t", y_train[1], int_to_intent(y_train[1]))
print("\t", x_test[1])
print("\t", y_test[1], int_to_intent(y_test[1]))

Psychiatry
First line of train/test data:
	 ['exhusband', 'died', '1980', 'acute', 'pancreatitis', 'secondary', 'alcohol', 'abuse']
	 0 Psychiatry
	 ['cystoscopy', 'revealed', 'good', 'efflux', 'urine', 'ureteral', 'openings']
	 4 Surgery
Second line of train/test data:
	 ['patient', 'taken', 'post', 'anesthesia', 'care', 'unit', 'stable', 'condition']
	 1 Gynecology
	 ['help', 'anyway', 'improve', 'patient', 'laboratory', 'abnormalities']
	 5 General


## Convultional Neural Network
-- Explain conv layers, focusing on 1d  
-- how it learns the best filters  
-- talk about exact model structure  

In [13]:
MAX_WORDS = 30 # The maximum number of words the sequence model will consider
STD_DEV = 0.01 # Deviation of noise for Gaussian Noise applied to the embeddings
DROPOUT_RATIO = .5 # The ratio to dropout
BATCH_SIZE = 100 # The number of examples per train/validation step
EPOCHS = 100 # The number of times to repeat through all of the training data
LEARNING_RATE = .01 # The learning rate for the optimizer
NUM_FILTERS = 128

model = Sequential()
model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, med_vectors.dim)))
model.add(Conv1D(NUM_FILTERS, 7, activation='relu', padding='same'))
model.add(MaxPool1D(2))
model.add(Conv1D(NUM_FILTERS, 7, activation='relu', padding='same'))
model.add(GlobalMaxPool1D())
model.add(Dropout(DROPOUT_RATIO))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_outputs, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise_2 (GaussianNo (None, 30, 200)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 30, 128)           179328    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 15, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 15, 128)           114816    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
__________

In [14]:
training_batches = MagnitudeUtils.batchify(x_train, y_train, BATCH_SIZE) # Split the training data into batches
num_batches_per_epoch_train = int(np.ceil(num_training/float(BATCH_SIZE)))
test_batches = MagnitudeUtils.batchify(x_test, y_test, BATCH_SIZE)  # Split the test data into batches
num_batches_per_epoch_test = int(np.ceil(num_test/float(BATCH_SIZE)))


# Generates batches of the transformed training data
train_batch_generator = (
  (
    med_vectors.query(x_train_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_train_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_train_batch, y_train_batch in training_batches
)

# Generates batches of the transformed test data
test_batch_generator = (
  (
    med_vectors.query(x_test_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_test_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_test_batch, y_test_batch in test_batches
)

# Start training
from keras.utils import np_utils
model.fit_generator(
    generator = train_batch_generator,
    steps_per_epoch = num_batches_per_epoch_train,
    validation_data = test_batch_generator,
    validation_steps = num_batches_per_epoch_test,
    epochs = EPOCHS,
)

Epoch 1/100
Epoch 3/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
 280/1300 [=====>........................] - ETA: 4:42 - loss: 1.7451 - categorical_accuracy: 0.3724

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 17/100
Epoch 18/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 23/100
Epoch 24/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 29/100
Epoch 30/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 39/100
Epoch 40/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 45/100
Epoch 46/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 60/100
Epoch 61/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 92/100
Epoch 93/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 99/100
Epoch 100/100

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
print("Results after training for %d epochs:" % (EPOCHS,))

train_metrics = model.evaluate_generator(
    generator = train_batch_generator,
    steps = num_batches_per_epoch_train,
)

print("loss: %.4f - categorical_accuracy: %.4f" % tuple(train_metrics))

val_metrics = model.evaluate_generator(
    generator = test_batch_generator,
    steps = num_batches_per_epoch_test,
)

print("val_loss: %.4f - val_categorical_accuracy: %.4f" % tuple(val_metrics))

Results after training for 100 epochs:
loss: 1.1771 - categorical_accuracy: 0.4449
val_loss: 2.5844 - val_categorical_accuracy: 0.2600


In [44]:
len(ehr_sentences)

146838

## LSTM Neural Network
-- talk about LSTM vs conv  
-- advantages  
-- talk about exact model  

In [197]:
MAX_WORDS = 30 # The maximum number of words the sequence model will consider
STD_DEV = 0.01 # Deviation of noise for Gaussian Noise applied to the embeddings
HIDDEN_UNITS = 100 # The number of hidden units from the LSTM
DROPOUT_RATIO = .8 # The ratio to dropout
BATCH_SIZE = 100 # The number of examples per train/validation step
EPOCHS = 100 # The number of times to repeat through all of the training data
LEARNING_RATE = .01 # The learning rate for the optimizer

model = Sequential()
model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, med_vectors.dim)))
model.add(Bidirectional(LSTM(HIDDEN_UNITS, activation='tanh'), merge_mode='concat'))
model.add(Dropout(DROPOUT_RATIO))
model.add(Dense(num_outputs, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=LEARNING_RATE),
    metrics=['categorical_accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise_25 (GaussianN (None, 30, 200)           0         
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 200)               240800    
_________________________________________________________________
dropout_11 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 33)                6633      
Total params: 247,433
Trainable params: 247,433
Non-trainable params: 0
_________________________________________________________________


In [132]:
training_batches = MagnitudeUtils.batchify(x_train, y_train, BATCH_SIZE) # Split the training data into batches
num_batches_per_epoch_train = int(np.ceil(num_training/float(BATCH_SIZE)))
test_batches = MagnitudeUtils.batchify(x_test, y_test, BATCH_SIZE)  # Split the test data into batches
num_batches_per_epoch_test = int(np.ceil(num_test/float(BATCH_SIZE)))

# Generates batches of the transformed training data
train_batch_generator = (
  (
    med_vectors.query(x_train_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_train_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_train_batch, y_train_batch in training_batches
)

# Generates batches of the transformed test data
test_batch_generator = (
  (
    med_vectors.query(x_test_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_test_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_test_batch, y_test_batch in test_batches
)

# Start training
from keras.utils import np_utils
model.fit_generator(
    generator = train_batch_generator,
    steps_per_epoch = num_batches_per_epoch_train,
    validation_data = test_batch_generator,
    validation_steps = num_batches_per_epoch_test,
    epochs = EPOCHS,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200


Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200


Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7f3e697db198>

In [10]:
print("Results after training for %d epochs:" % (EPOCHS,))

train_metrics = model.evaluate_generator(
    generator = train_batch_generator,
    steps = num_batches_per_epoch_train,
)

print("loss: %.4f - categorical_accuracy: %.4f" % tuple(train_metrics))

val_metrics = model.evaluate_generator(
    generator = test_batch_generator,
    steps = num_batches_per_epoch_test,
)

print("val_loss: %.4f - val_categorical_accuracy: %.4f" % tuple(val_metrics))

Results after training for 100 epochs:
loss: 0.1476 - categorical_accuracy: 0.9353
val_loss: 6.9081 - val_categorical_accuracy: 0.2211


In [134]:
MAX_WORDS = 30 # The maximum number of words the sequence model will consider
STD_DEV = 0.01 # Deviation of noise for Gaussian Noise applied to the embeddings
HIDDEN_UNITS = 50 # The number of hidden units from the LSTM
DROPOUT_RATIO = .8 # The ratio to dropout
BATCH_SIZE = 100 # The number of examples per train/validation step
EPOCHS = 200 # The number of times to repeat through all of the training data
LEARNING_RATE = .001 # The learning rate for the optimizer

model = Sequential()
model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, med_vectors.dim)))
model.add(Bidirectional(LSTM(HIDDEN_UNITS, activation='tanh'), merge_mode='concat'))
model.add(Dropout(DROPOUT_RATIO))
model.add(Dense(num_outputs, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=LEARNING_RATE),
    metrics=['categorical_accuracy'])

In [135]:
training_batches = MagnitudeUtils.batchify(x_train, y_train, BATCH_SIZE) # Split the training data into batches
num_batches_per_epoch_train = int(np.ceil(num_training/float(BATCH_SIZE)))
test_batches = MagnitudeUtils.batchify(x_test, y_test, BATCH_SIZE)  # Split the test data into batches
num_batches_per_epoch_test = int(np.ceil(num_test/float(BATCH_SIZE)))

# Generates batches of the transformed training data
train_batch_generator = (
  (
    med_vectors.query(x_train_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_train_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_train_batch, y_train_batch in training_batches
)

# Generates batches of the transformed test data
test_batch_generator = (
  (
    med_vectors.query(x_test_batch), # Magnitude will handle converting the 2D array of text into the 3D word vector representations!
    MagnitudeUtils.to_categorical(y_test_batch, num_outputs) # Magnitude will handle converting the class labels into one-hot encodings!
  ) for x_test_batch, y_test_batch in test_batches
)

# Start training
from keras.utils import np_utils
model.fit_generator(
    generator = train_batch_generator,
    steps_per_epoch = num_batches_per_epoch_train,
    validation_data = test_batch_generator,
    validation_steps = num_batches_per_epoch_test,
    epochs = EPOCHS,
)

Epoch 1/200
Epoch 1/200
Epoch 2/200
 1/50 [..............................] - ETA: 1s - loss: 3.0341 - categorical_accuracy: 0.2100Epoch 2/200
Epoch 3/200
Epoch 3/200
Epoch 4/200
Epoch 4/200
Epoch 5/200
 1/50 [..............................] - ETA: 3s - loss: 2.7613 - categorical_accuracy: 0.3100Epoch 5/200
Epoch 6/200
 1/50 [..............................] - ETA: 2s - loss: 2.6345 - categorical_accuracy: 0.2800Epoch 6/200
Epoch 7/200
 1/50 [..............................] - ETA: 1s - loss: 2.6175 - categorical_accuracy: 0.2500Epoch 7/200
Epoch 8/200
 1/50 [..............................] - ETA: 2s - loss: 2.6593 - categorical_accuracy: 0.3200Epoch 8/200
Epoch 9/200
 1/50 [..............................] - ETA: 1s - loss: 2.6224 - categorical_accuracy: 0.2900Epoch 9/200
Epoch 10/200
 1/50 [..............................] - ETA: 2s - loss: 2.5555 - categorical_accuracy: 0.2800Epoch 10/200
Epoch 11/200
 1/50 [..............................] - ETA: 1s - loss: 2.5168 - categorical_accuracy:

Epoch 22/200
Epoch 22/200
Epoch 23/200
 1/50 [..............................] - ETA: 1s - loss: 2.3776 - categorical_accuracy: 0.3200Epoch 23/200
Epoch 24/200
 1/50 [..............................] - ETA: 2s - loss: 2.3930 - categorical_accuracy: 0.3000Epoch 24/200
Epoch 25/200
Epoch 25/200
Epoch 26/200
 1/50 [..............................] - ETA: 1s - loss: 2.3434 - categorical_accuracy: 0.3200Epoch 26/200
Epoch 27/200
 1/50 [..............................] - ETA: 2s - loss: 2.3633 - categorical_accuracy: 0.3400Epoch 27/200
Epoch 28/200
Epoch 28/200
Epoch 29/200
 1/50 [..............................] - ETA: 1s - loss: 2.3353 - categorical_accuracy: 0.3500Epoch 29/200
Epoch 30/200
Epoch 30/200
Epoch 31/200
Epoch 31/200
Epoch 32/200
 1/50 [..............................] - ETA: 1s - loss: 2.2413 - categorical_accuracy: 0.3600Epoch 32/200
Epoch 33/200
 1/50 [..............................] - ETA: 1s - loss: 2.3049 - categorical_accuracy: 0.3100Epoch 33/200
Epoch 34/200
Epoch 34/200
Epoc

Epoch 63/200
Epoch 63/200
Epoch 64/200
Epoch 64/200
Epoch 65/200
 1/50 [..............................] - ETA: 1s - loss: 2.0549 - categorical_accuracy: 0.4100Epoch 65/200
Epoch 66/200
 1/50 [..............................] - ETA: 1s - loss: 2.2438 - categorical_accuracy: 0.3800Epoch 66/200
Epoch 67/200
 1/50 [..............................] - ETA: 1s - loss: 2.1087 - categorical_accuracy: 0.3500Epoch 67/200
Epoch 68/200
Epoch 68/200
Epoch 69/200
 1/50 [..............................] - ETA: 1s - loss: 1.9947 - categorical_accuracy: 0.3900Epoch 69/200
Epoch 70/200
 1/50 [..............................] - ETA: 1s - loss: 1.9651 - categorical_accuracy: 0.4000Epoch 70/200
Epoch 71/200
Epoch 71/200
Epoch 72/200
 1/50 [..............................] - ETA: 1s - loss: 2.0758 - categorical_accuracy: 0.2900Epoch 72/200
Epoch 73/200
 1/50 [..............................] - ETA: 2s - loss: 2.0229 - categorical_accuracy: 0.4000Epoch 73/200
Epoch 74/200
 1/50 [..............................] - ET

Epoch 103/200
 1/50 [..............................] - ETA: 1s - loss: 1.7984 - categorical_accuracy: 0.4300Epoch 103/200
Epoch 104/200
 1/50 [..............................] - ETA: 1s - loss: 1.8401 - categorical_accuracy: 0.4300Epoch 104/200
Epoch 105/200
Epoch 105/200
Epoch 106/200
 1/50 [..............................] - ETA: 1s - loss: 1.9610 - categorical_accuracy: 0.4200Epoch 106/200
Epoch 107/200
 1/50 [..............................] - ETA: 1s - loss: 1.8906 - categorical_accuracy: 0.3600Epoch 107/200
Epoch 108/200
 1/50 [..............................] - ETA: 1s - loss: 1.8737 - categorical_accuracy: 0.4400Epoch 108/200
Epoch 109/200
 1/50 [..............................] - ETA: 2s - loss: 1.8198 - categorical_accuracy: 0.4300Epoch 109/200
Epoch 110/200
 1/50 [..............................] - ETA: 2s - loss: 1.8533 - categorical_accuracy: 0.3900Epoch 110/200
Epoch 111/200
 1/50 [..............................] - ETA: 2s - loss: 1.7305 - categorical_accuracy: 0.4600Epoch 111/

Epoch 142/200
 1/50 [..............................] - ETA: 2s - loss: 1.5905 - categorical_accuracy: 0.4900Epoch 142/200
Epoch 143/200
 1/50 [..............................] - ETA: 1s - loss: 1.7128 - categorical_accuracy: 0.4900Epoch 143/200
Epoch 144/200
 1/50 [..............................] - ETA: 1s - loss: 1.6646 - categorical_accuracy: 0.5000Epoch 144/200
Epoch 145/200
 1/50 [..............................] - ETA: 2s - loss: 1.5810 - categorical_accuracy: 0.4800Epoch 145/200
Epoch 146/200
 1/50 [..............................] - ETA: 2s - loss: 1.7918 - categorical_accuracy: 0.4500Epoch 146/200
Epoch 147/200
 1/50 [..............................] - ETA: 1s - loss: 1.6627 - categorical_accuracy: 0.4600Epoch 147/200
Epoch 148/200
 1/50 [..............................] - ETA: 2s - loss: 1.6251 - categorical_accuracy: 0.5100Epoch 148/200
Epoch 149/200
 1/50 [..............................] - ETA: 2s - loss: 1.7525 - categorical_accuracy: 0.4600Epoch 149/200
Epoch 150/200
 1/50 [...

Epoch 181/200
 1/50 [..............................] - ETA: 1s - loss: 1.5150 - categorical_accuracy: 0.5300Epoch 181/200
Epoch 182/200
 1/50 [..............................] - ETA: 1s - loss: 1.5695 - categorical_accuracy: 0.5600Epoch 182/200
Epoch 183/200
Epoch 183/200
Epoch 184/200
 1/50 [..............................] - ETA: 1s - loss: 1.5243 - categorical_accuracy: 0.5200Epoch 184/200
Epoch 185/200
 1/50 [..............................] - ETA: 1s - loss: 1.4249 - categorical_accuracy: 0.5800Epoch 185/200
Epoch 186/200
 1/50 [..............................] - ETA: 1s - loss: 1.5252 - categorical_accuracy: 0.5100Epoch 186/200
Epoch 187/200
 1/50 [..............................] - ETA: 1s - loss: 1.3496 - categorical_accuracy: 0.6000Epoch 187/200
Epoch 188/200
 1/50 [..............................] - ETA: 1s - loss: 1.5613 - categorical_accuracy: 0.4900Epoch 188/200
Epoch 189/200
 1/50 [..............................] - ETA: 1s - loss: 1.6396 - categorical_accuracy: 0.4500Epoch 189/

<keras.callbacks.History at 0x7f3e6db8cbe0>

<keras.callbacks.History at 0x7f3e6db8cbe0>

In [138]:
print("Results after training for %d epochs:" % (EPOCHS,))

train_metrics = model.evaluate_generator(
    generator = train_batch_generator,
    steps = num_batches_per_epoch_train,
)

print("loss: %.4f - categorical_accuracy: %.4f" % tuple(train_metrics))

val_metrics = model.evaluate_generator(
    generator = test_batch_generator,
    steps = num_batches_per_epoch_test,
)

print("val_loss: %.4f - val_categorical_accuracy: %.4f" % tuple(val_metrics))

Results after training for 200 epochs:
loss: 1.1604 - categorical_accuracy: 0.6434
val_loss: 3.7746 - val_categorical_accuracy: 0.2553


In [20]:
print(int_to_intent(MagnitudeUtils.from_categorical(model.predict(med_vectors.query(["past medical history difficulty climbing stairs difficulty airline seats tying shoes used public seating lifting objects floor".split(" ")])))[0]))

 Bariatrics


In [None]:
with open('data/ehr_sentences.csv', 'w') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Specialty', 'Note'])
    for sent in ehr_sentences:
        writer.writerow(sent)
        

In [None]:
ehr_labels = []
ehr_vectors = []

for sentence in ehr_sentences:
    ehr_labels.append(sentence[0])

    sentence_split = sentence[1].split(' ')
    ehr_vectors.append(med_vectors.query(sentence_split))

In [None]:
with open('data/ehr_labels.csv', 'w') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['SpecialtyID'])
    for lbl in ehr_labels:
        writer.writerow(lbl)
        
with open('data/ehr_vectors.csv', 'w') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['NoteVector'])
    for vctr in ehr_vectors:
        writer.writerow(vctr)