In [23]:
import glob
import pandas as pd
import numpy as np
import re
import spacy
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Read in the notes

In [5]:
def sep():
    label_path = 'docclasses_RAtrial.csv'
    df=pd.read_csv(label_path)
    df1=df.loc[df['truedocumentclass'] == 1]
    df0=df.loc[df['truedocumentclass'] == 0]
    fn1 = df1['documentname'].tolist()
    fn0 = df0['documentname'].tolist()
    return fn1, fn0

In [2]:
def read_notes(fn1): # from individual input files
    f_path = "Corpus_Test/*"
    names = glob.glob(f_path) # [...,'Corpus_Test/652742.txt',...]
    names.sort()

    notes1 = []
    id1 = []
    notes0 = []
    id0 = []
    for fi in names:
        with open(fi, "r") as f:
            a = fi[12:] #remove prefix
            b = int(a[:-4]) #remove suffix, then b is the id (also doc_name) for each note
            note = f.read()
            words = note.split()
            flat = " ".join(words)
            if b in fn1:
                notes1.append(flat)
                id1.append(b)
            else:
                notes0.append(flat)
                id0.append(b)
    return notes1, notes0, id1, id0

In [8]:
nlp = spacy.load('en')

Now, all bleeding notes are read in array named notes1, all non-bleeding notes are read in array named notes1. Let's look at the first bleeding note where bleeding is present.

In [6]:
fn1, fn0 = sep()
notes1, notes0, id1, id0 = read_notes(fn1)
notes1[0]

"Admission Date: [**2533-12-28**] Discharge Date: [**2534-1-6**] Service: CARDIOTHOR CHIEF COMPLAINT: The patient is an 85-year-old woman patient of Dr. [**First Name8 (NamePattern2) **] [**Last Name (NamePattern1) 721**] referred for an outpatient cardiac catheterization due to progressive dyspnea on exertion on positive ETT. HISTORY OF THE PRESENT ILLNESS: [**Known patient firstname 4426**] [**Known patient lastname 4427**] is an 85-year-old woman with no known cardiac history. She complains of several weeks of shortness of breath with a minimal amount of exertion. She states that she was recently out with her daughter doing some shopping and her daughter noticed that the patient was quite short of breath. She denied any history of chest discomfort, palpitations, or dizziness. Echocardiogram from [**Month (only) 151**] of this year revealed mild to moderate MI, mild TIA, moderate pulmonary artery hypertension, concentric LVH with an EF60%, sclerotic aortic valve without stenosis, Per

### Q1. What is the mean number of words and standard deviation (SD) of that mean for notes labelled as bleeding present? For notes labelled as bleeding absent?

In [7]:
def word_count_stat(notes,nlp):
    count_list = []
    for note in notes:
        doc = nlp(note) 
        note_tokens = []
        for token in doc:
            if token.is_punct:
                continue
            else:
                note_tokens.append(token.text)
        ct = len(note_tokens)
        count_list.append(ct)
    count_array = np.array(count_list)
    mu = np.mean(count_array)
    s = np.std(count_array)
    return mu, s

In [10]:
mu1, s1 = word_count_stat(notes1,nlp)
print(mu1,s1)

1228.04109589 679.708749843


In [11]:
mu0, s0 = word_count_stat(notes0,nlp)
print(mu0,s0)

1079.87743191 616.860775617


### Q2.  What is the mean and SD for the number of times the words “bleed,” “bled,” and “bleeding” appear in notes labelled as bleeding present? For notes labelled as bleeding absent?

In [12]:
def bleed_count_stat(notes,nlp):
    count_list = []
    trigger = ['bleed','bled','bleeding']
    for note in notes:
        doc = nlp(note)
        note_tokens = []
        for token in doc:
            if token.text.lower() in trigger:
                note_tokens.append(token.text)
        ct = len(note_tokens)
        count_list.append(ct)
    count_array = np.array(count_list)
    mu = np.mean(count_array)
    s = np.std(count_array)
    return mu, s

In [13]:
mu1, s1 = bleed_count_stat(notes1,nlp)
print(mu1,s1)

2.36301369863 2.86184186545


In [14]:
mu0, s0 = bleed_count_stat(notes0,nlp)
print(mu0,s0)

0.24513618677 0.646921534443


### Q3. What are the top ten bigrams in notes labelled as bleeding present? For notes labelled as bleeding absent?

In [15]:
def topn_bigrams_word(notes,nlp,n):
    all_sentences = []
    all_bigs = []
    for note in notes:
        doc = nlp(note)
        sentences = [sent.text for sent in doc.sents]
        all_sentences.extend(sentences) #combine all sentences into one single array
        cv_i = CountVectorizer(ngram_range=(2,2)).fit(sentences)
        bigs = cv_i.get_feature_names()
        all_bigs.append(bigs) # read each note as bigrams into an array

    ct_dict = {}
    cv = CountVectorizer(ngram_range=(2,2)).fit(all_sentences)
    bigs = cv.get_feature_names()
    x=cv.fit_transform(all_sentences)
    ct_array=sum(x).toarray()[0]
    combo = zip(bigs,ct_array)
    for bi, c in combo:
        ct_dict[bi] = c
    sorted_by_value = sorted(ct_dict.items(), key=lambda kv: kv[1], reverse=True)
    topn = [x[0] for x in sorted_by_value[:n]]
    return topn, all_bigs

In [16]:
n = 10
topn1, all_bigs1 = topn_bigrams_word(notes1,nlp,n)
print(topn1)

['the patient', 'mg dl', 'in the', 'last name', 'not assessed', 'mg tablet', 'patient was', 'of the', 'sig one', 'to the']


In [17]:
topn0, all_bigs0 = topn_bigrams_word(notes0,nlp,n)
print(topn0)

['mg dl', 'the patient', 'last name', 'in the', 'mg tablet', 'sig one', 'to the', 'tablet sig', 'tablet po', 'one tablet']


### Q4. Use scikit learn (or other package) to create a machine learning model to predict bleeding present or bleeding absent for each clinical note. Nothing fancy, just a simple model. What is the sensitivity, specificity, positive predictive value, and negative predictive value for the model?

In [18]:
def combine(notes1,notes0,nlp,n):
    topn1, all_bigs1 = topn_bigrams_word(notes1,nlp,n)
    topn0, all_bigs0 = topn_bigrams_word(notes0,nlp,n)
    topn=list(set(topn0+topn1))
    print('feature size:',len(topn))
    return all_bigs0 + all_bigs1,topn

In [19]:
def gen_voc_dict(topn):
    voc_dict = dict((item, i) for i, item in enumerate(topn))
    return voc_dict

In [20]:
def note_to_idx(note_big,voc_dict): 
    index_list = [ voc_dict[token] for token in note_big if token in voc_dict ]
    return index_list

In [24]:
def note_to_vec(note_big,voc_dict):
    n = len(voc_dict)
    result = np.zeros((n,1))
    voc_idx = note_to_idx(note_big, voc_dict)
    for idx in voc_idx:
        result[idx] = 1
    result = np.reshape(result,n).tolist()
    return result

In [21]:
def prep_X(all_bigs,voc_dict):
    X_list=[note_to_vec(x, voc_dict) for x in all_bigs]
    df = pd.DataFrame(data=X_list)
    X=df.values
    return X

In [22]:
def prep_y(id1,id0):
    idx = id0 + id1
    label_path = 'docclasses_RAtrial.csv'
    df=pd.read_csv(label_path)
    df=df.set_index('documentname')
    df1=df.loc[idx]
    y=df1.values
    return y

In [25]:
def classify(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    classifier = svm.SVC(kernel='linear', C=0.01)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)
    return y_test,y_pred

In [26]:
def eval(y_true,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp/(tp+fn+eps)
    specificity = tn/(tn+fp+eps)
    ppv = tp/(tp+fp+eps)
    npv = tn/(tn+fn+eps)
    return sensitivity, specificity, ppv, npv

In [27]:
n=30
eps = 1e-6
all_bigs,topn=combine(notes1,notes0,nlp,n)
voc_dict=gen_voc_dict(topn)
X=prep_X(all_bigs,voc_dict)
y=prep_y(id1,id0)
y_test,y_pred=classify(X,y)
sensitivity, specificity, ppv, npv=eval(y_test,y_pred)
print('sensitivity:', sensitivity)
print('specificity:', specificity)
print('positive predictive value:', npv)
print('negative predictive value:',npv)

sensitivity: 0.0
specificity: 0.999999991935
positive predictive value: 0.751515146961
negative predictive value: 0.751515146961


  y = column_or_1d(y, warn=True)
