In [1]:
import glob
import pandas as pd
import numpy as np
import re
import spacy
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Read in the notes

In [2]:
def sep():
    label_path = 'docclasses_RAtrial.csv'
    df=pd.read_csv(label_path)
    df1=df.loc[df['truedocumentclass'] == 1]
    df0=df.loc[df['truedocumentclass'] == 0]
    fn1 = df1['documentname'].tolist()
    fn0 = df0['documentname'].tolist()
    return fn1, fn0

In [3]:
def read_notes(fn1): # from individual input files
    f_path = "Corpus_Test/*"
    names = glob.glob(f_path) # [...,'Corpus_Test/652742.txt',...]
    names.sort()

    notes1 = []
    id1 = []
    notes0 = []
    id0 = []
    for fi in names:
        with open(fi, "r") as f:
            a = fi[12:] #remove prefix
            b = int(a[:-4]) #remove suffix, then b is the id (also doc_name) for each note
            note = f.read()
            words = note.split()
            flat = " ".join(words)
            if b in fn1:
                notes1.append(flat)
                id1.append(b)
            else:
                notes0.append(flat)
                id0.append(b)
    return notes1, notes0, id1, id0

In [4]:
nlp = spacy.load('en')

Now, all bleeding notes are read in array named notes1, all non-bleeding notes are read in array named notes1. Let's look at the first bleeding note where bleeding is present.

In [15]:
fn1, fn0 = sep()
notes1, notes0, id1, id0 = read_notes(fn1)

### Q1. What is the mean number of words and standard deviation (SD) of that mean for notes labelled as bleeding present? For notes labelled as bleeding absent?

In [7]:
def word_count_stat(notes,nlp):
    count_list = []
    for note in notes:
        doc = nlp(note) 
        note_tokens = []
        for token in doc:
            if token.is_punct:
                continue
            else:
                note_tokens.append(token.text)
        ct = len(note_tokens)
        count_list.append(ct)
    count_array = np.array(count_list)
    mu = np.mean(count_array)
    s = np.std(count_array)
    return mu, s

In [10]:
mu1, s1 = word_count_stat(notes1,nlp)
print(mu1,s1)

1228.04109589 679.708749843


In [11]:
mu0, s0 = word_count_stat(notes0,nlp)
print(mu0,s0)

1079.87743191 616.860775617


### Q2.  What is the mean and SD for the number of times the words “bleed,” “bled,” and “bleeding” appear in notes labelled as bleeding present? For notes labelled as bleeding absent?

In [12]:
def bleed_count_stat(notes,nlp):
    count_list = []
    trigger = ['bleed','bled','bleeding']
    for note in notes:
        doc = nlp(note)
        note_tokens = []
        for token in doc:
            if token.text.lower() in trigger:
                note_tokens.append(token.text)
        ct = len(note_tokens)
        count_list.append(ct)
    count_array = np.array(count_list)
    mu = np.mean(count_array)
    s = np.std(count_array)
    return mu, s

In [13]:
mu1, s1 = bleed_count_stat(notes1,nlp)
print(mu1,s1)

2.36301369863 2.86184186545


In [14]:
mu0, s0 = bleed_count_stat(notes0,nlp)
print(mu0,s0)

0.24513618677 0.646921534443


### Q3. What are the top ten bigrams in notes labelled as bleeding present? For notes labelled as bleeding absent?

In [17]:
def topn_bigrams_word(notes,nlp,n):
    all_sentences = []
    all_bigs = []
    for note in notes:
        doc = nlp(note)
        sentences = [sent.text for sent in doc.sents]
        all_sentences.extend(sentences) #combine all sentences into one single array
        cv_i = CountVectorizer(ngram_range=(2,2)).fit(sentences)
        bigs = cv_i.get_feature_names()
        all_bigs.append(bigs) # read each note as bigrams into an array

    ct_dict = {}
    cv = CountVectorizer(ngram_range=(2,2)).fit(all_sentences)
    bigs = cv.get_feature_names()
    x=cv.fit_transform(all_sentences)
    ct_array=sum(x).toarray()[0]
    combo = zip(bigs,ct_array)
    for bi, c in combo:
        ct_dict[bi] = c
    sorted_by_value = sorted(ct_dict.items(), key=lambda kv: kv[1], reverse=True)
    topn = [x[0] for x in sorted_by_value[:n]]
    return topn, all_bigs

In [16]:
n = 10
topn1, all_bigs1 = topn_bigrams_word(notes1,nlp,n)
print(topn1)

['the patient', 'mg dl', 'in the', 'last name', 'not assessed', 'mg tablet', 'patient was', 'of the', 'sig one', 'to the']


In [17]:
topn0, all_bigs0 = topn_bigrams_word(notes0,nlp,n)
print(topn0)

['mg dl', 'the patient', 'last name', 'in the', 'mg tablet', 'sig one', 'to the', 'tablet sig', 'tablet po', 'one tablet']


### Q4. Use scikit learn (or other package) to create a machine learning model to predict bleeding present or bleeding absent for each clinical note. Nothing fancy, just a simple model. What is the sensitivity, specificity, positive predictive value, and negative predictive value for the model?

### 1. Extracting Features from Notes
We have chosen only the most frequently occuring bigrams as the vocabulary list. More specifically, we have chose top $30$ bigrams from bleeding notes.

In [40]:
def notes_to_tokens(notes,nlp):
    tokens = []
    for note in notes:
        doc = nlp(note) 
        note_tokens = []
        for token in doc:
            if token.is_punct:
                continue
            else:
                note_tokens.append(token.text)
        tokens.append(note_tokens)
    return tokens

Now, we convert each note into a vector in $R^n$ where $n$ is the number of bigrams and unigrams together in vocabulary list.

In [28]:
def gen_voc_dict():
    trigger = ['bleed','bled','bleeding']
    voc_dict = dict((item, i) for i, item in enumerate(trigger))
    return voc_dict

In [29]:
def note_to_idx(note_tokens,voc_dict): 
    index_list = [voc_dict[token] for token in note_tokens if token in voc_dict]
    return index_list

In [30]:
def note_to_vec(note_tokens,voc_dict):
    n = len(voc_dict)
    result = np.zeros((n,1))
    voc_idx = note_to_idx(note_tokens, voc_dict)
    for idx in voc_idx:
        result[idx] = 1
    result = np.reshape(result,n).tolist()
    return result

In [42]:
def prep_X(all_tokens,voc_dict):
    X_list=[note_to_vec(x, voc_dict) for x in all_tokens]
    df = pd.DataFrame(data=X_list)
    X=df.values
    return X
print(X)
X.shape

[[ 0.  0.  1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 ..., 
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  1.]]


(660, 3)

In [32]:
def prep_y(id1,id0):
    idx = id0 + id1
    label_path = 'docclasses_RAtrial.csv'
    df=pd.read_csv(label_path)
    df=df.set_index('documentname')
    df1=df.loc[idx]
    y=df1.values
    m=y.shape[0]
    y1=np.reshape(y,m)
    return y1

### 2. Training SVM for Bleed Classification

In [33]:
def classify(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    #classifier = svm.SVC(kernel='linear', C=0.01)
    classifier = svm.SVC(C=50, kernel='rbf', gamma=6)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    return y_test,y_pred

### 3. Evaluating the Model Using Confusion Matrix

In [34]:
def eval(y_true,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp/(tp+fn+eps)
    specificity = tn/(tn+fp+eps)
    ppv = tp/(tp+fp+eps)
    npv = tn/(tn+fn+eps)
    return sensitivity, specificity, ppv, npv

In [41]:
n=30
eps = 1e-6
tokens1=notes_to_tokens(notes1,nlp)
tokens0=notes_to_tokens(notes0,nlp)
voc_dict=gen_voc_dict()

X=prep_X(tokens0+tokens1,voc_dict)
y=prep_y(id1,id0)
y_test,y_pred=classify(X,y)
sensitivity, specificity, ppv, npv=eval(y_test,y_pred)
print('sensitivity:', sensitivity)
print('specificity:', specificity)
print('positive predictive value:', npv)
print('negative predictive value:',npv)

sensitivity: 0.292682919691
specificity: 0.975806443743
positive predictive value: 0.806666661289
negative predictive value: 0.806666661289
