In [4]:
import sys
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
# Show full cell contents. None is the supported "no limit" value; the old -1
# sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
# Print arrays without truncation. NaN is not a valid threshold in current
# NumPy; sys.maxsize is the documented way to disable summarization.
np.set_printoptions(threshold=sys.maxsize)
# Allow long frames (up to 999 rows) to be displayed in full.
pd.options.display.max_rows = 999

In [5]:
# Load the word vectors from a file in the binary word2vec format.
embeddings_path = 'PubMed-shuffle-win-30.bin'
model = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

In [6]:
# Read the annotated notes, then collect for every abbreviation the set of
# distinct long forms recorded for it.
df = pd.read_csv("Data/PatientNotes_acronym_longform.csv")
group_df = (
    df[['abbreviation', 'long_form']]
    .groupby('abbreviation')
    .agg(lambda senses: set(senses.tolist()))
)
group_df

Unnamed: 0_level_0,long_form
abbreviation,Unnamed: 1_level_1
AB,"{X-ray finding, UNSURED SENSE, NAME, antipyrine benzocaine, atrioventricular:AV, ankle-brachial, arterial blood, abortion, blood group in ABO system, type A, type B, MISTAKE:abduction, arteriovenous:AV}"
AC,"{adriamycin cyclophosphamide, assist control, acetate, alternating current, antecubital, (drug) AC, anticoagulation, angiotensin-converting enzyme:ACE, before meals, abdominal circumference, acromioclavicular}"
ALD,"{ad lib on demand, alanine aminotransferase:ALT, adrenoleukodystrophy, acetyl lysergic acid diethylamide, left anterior descending:LAD}"
AMA,"{against medical advice, antimitochondrial antibody, advanced maternal age}"
ASA,"{aminosalicylic acid, American Society of Anesthesiologists, acetylsalicylic acid}"
AV,"{atrioventricular, UNSURED SENSE, arteriovenous, aortic valve}"
AVR,"{UNSURED SENSE, aortic valve resistance, aortic valve replacement, aortic valve regurgitation, augmented voltage right arm, auditory brainstem response:ABR, rapid ventricular response:RVR}"
BAL,"{blood alcohol level, bronchoalveolar lavage}"
BK,"{BK (virus), below knee}"
BM,"{bone marrow, bowel movement, UNSURED SENSE, breast milk}"


In [7]:
# Comma-separated stop words: domain-generic medical terms followed by common
# English function words and contraction fragments.
_STOP_WORDS_TEXT = (
    "disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem, prob, probs, med, meds,"
    "pill, pills, medicine, medicines, medication, medications, treatment, treatments, caps, capsules, capsule,"
    "tablet, tablets, tabs, doctor, dr, dr., doc, physician, physicians, test, tests, testing, specialist, specialists,"
    "side-effect, side-effects, patient, patients, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam,"
    "challenge, device, condition, conditions, suffer, suffering, suffered, feel, feeling, prescription, prescribe,"
    "prescribed, over-the-counter, a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before,"
    "being, below, between, both, but, by, can, can't, cannot, could, couldn't, did, didn't, do, does, doesn't,"
    "doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he,"
    "he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into,"
    "is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or,"
    "other, ought, our, ours, ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't,"
    "so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they,"
    "they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd,"
    "we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's,"
    "whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself,"
    "yourselves, n't, 're, 've, 'd, 's, 'll, 'm"
)

# Split on the commas themselves and trim whitespace. Deleting the commas and
# splitting on spaces (the previous approach) fused entries that had no space
# after the comma ("problem,prob" -> "problemprob") and produced empty tokens
# where a space preceded the comma.
stop_words = [token.strip() for token in _STOP_WORDS_TEXT.split(',') if token.strip()]

In [8]:
#from nltk.stem.snowball import SnowballStemmer
def sentence_vector(sentence):
    """Embed a sentence as the average of its word vectors.

    The sentence is split on whitespace, words listed in ``stop_words`` are
    dropped, and the remaining words are looked up in ``model``; words the
    model does not know are ignored.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. When no word of the
        sentence has a vector (empty sentence, only stop words, or only
        out-of-vocabulary words) a zero vector of length ``model.vector_size``
        is returned instead of raising ``ZeroDivisionError``.
    """
    vectors = []
    for word in sentence.split():
        if word in stop_words:
            continue
        try:
            vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary word: it contributes nothing to the average.
            continue

    if not vectors:
        # Nothing to average; fall back to a neutral vector so downstream
        # estimators still receive a fixed-length, finite feature row.
        return np.zeros(model.vector_size)
    return sum(vectors) / len(vectors)

def vector_breakage(sentence, expected_dim=200):
    """List the words of a sentence that contribute to its embedding.

    Applies the same whitespace split and ``stop_words`` filter as
    ``sentence_vector`` and keeps the words for which ``model`` returns a
    vector of the expected length.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.
    expected_dim : int, optional
        Required vector length for a word to be kept. Defaults to 200, the
        value that was previously hard-coded.

    Returns
    -------
    list of str
        The retained words, in sentence order (duplicates preserved).
    """
    kept_words = []
    for word in sentence.split():
        if word in stop_words:
            continue
        try:
            vector = model[word]
        except KeyError:
            # Word is not in the model's vocabulary.
            continue
        if len(vector) == expected_dim:
            kept_words.append(word)
    return kept_words


In [9]:
# Restrict the data to one abbreviation ('MP'): the target is the annotated
# long form and each feature row is the averaged embedding of the sentence.
df_slice = df.loc[df['abbreviation'] == 'MP']
y = df_slice['long_form']
X = [sentence_vector(text) for text in df_slice['sentence']]


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
# Print arrays (e.g. the confusion matrix) without truncation. NaN is not a
# valid threshold in current NumPy; sys.maxsize disables summarization.
np.set_printoptions(threshold=sys.maxsize)

In [11]:
# Seed the shuffle so the held-out split, and every metric computed from it,
# is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
# Linear-kernel support vector classifier trained on the training split.
# `degree` is kept exactly as configured; per the scikit-learn docs it only
# affects the 'poly' kernel.
svm = SVC(C=1.0, kernel='linear', degree=1)
clf = svm.fit(X_train, y_train)

In [13]:
# Predictions on the held-out split.
pred = clf.predict(X_test)
# Confusion matrix whose rows/columns follow the iteration order of the set
# of long forms present in the slice.
label_order = list(set(df_slice.long_form))
cm = confusion_matrix(y_test, pred, labels=label_order)
# 7-fold cross-validation of the same estimator configuration on all samples.
cross_val_scores = cross_val_score(clf, X, y, cv=7)



In [14]:
# Summarise the 'MP' experiment: per-fold cross-validation scores, the
# candidate long forms with their sample counts, the confusion matrix and the
# weighted F1 on the held-out split.
long_forms = set(df_slice.long_form)
samples_per_form = [len(df_slice[df_slice.long_form == form]) for form in long_forms]
weighted_f1 = f1_score(y_test, pred, average='weighted')

print('accuracy: {}'.format(cross_val_scores))
print()
print(long_forms)
print('sample size: {}'.format(samples_per_form))
print()
print(cm)
print()
print(weighted_f1)

accuracy: [0.69230769 0.56       0.64383562 0.65753425 0.66666667 0.67164179
 0.66153846]

{'nurse practitioner:NP', 'military police', 'mesangial proliferative', 'metacarpophalangeal', '(device) MP', 'mercaptopurine', 'metarsophalangeal', 'menstrual period', 'UNSURED SENSE', 'milligram:mg', 'metatarsophalangeal/metacarpophalangeal', 'metabolic panel', 'metatarsophalangeal', '(drug) MP'}
sample size: [11, 4, 1, 179, 5, 107, 6, 1, 11, 1, 105, 12, 55, 2]

[[ 1  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 39  0  2  0  0  0  0  4  0  1  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0 24  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 19  0  0  0  0  0  0  7  0  2  0]
 [ 0  0  0  1  0  2  0  0  0  0  0  0  0  0]
 [

  'precision', 'predicted', average, warn_for)


In [15]:
def get_patient_note_score(list_of_abbreviation, random_state=None):
    """Train and score one linear SVM per abbreviation.

    For every abbreviation the matching rows of ``df`` are embedded with
    ``sentence_vector``, split into train/test parts, and a linear-kernel
    ``SVC`` is fitted to predict the long form. The weighted F1 on the test
    part is printed as ``(abbreviation, score)`` and collected.

    Parameters
    ----------
    list_of_abbreviation : iterable of str
        Abbreviations to evaluate, matched against ``df.abbreviation``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an integer for reproducible scores.

    Returns
    -------
    list of float
        Weighted F1 scores, in input order, for the abbreviations that could
        be evaluated. Abbreviations whose pipeline raises ``ValueError`` are
        reported on stdout and left out of the result.
    """
    scores = []
    for abbreviation in list_of_abbreviation:
        notes = df[df.abbreviation == abbreviation]
        try:
            features = [sentence_vector(text) for text in notes.sentence]
            X_train, X_test, y_train, y_test = train_test_split(
                features, notes.long_form, random_state=random_state)
            classifier = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)
            # Compute the score once and reuse it for printing and collecting.
            score = f1_score(y_test, classifier.predict(X_test), average='weighted')
        except ValueError as error:
            # Still best-effort, but say which abbreviation was dropped and why
            # instead of skipping it silently.
            print('skipped {}: {}'.format(abbreviation, error))
            continue
        print((abbreviation, score))
        scores.append(score)
    return scores

In [16]:
# Score every abbreviation that appears in the grouped table.
acr_scores = get_patient_note_score(group_df.index.tolist())

  'precision', 'predicted', average, warn_for)


('AB', 0.8851154863355862)
('AC', 0.8066837947982665)
('ALD', 0.951864644516525)
('AMA', 0.9634259740259741)
('ASA', 0.9203265306122449)
('AV', 0.9165482233502537)
('AVR', 0.9322650273224043)
('BAL', 0.983068376068376)
('BK', 0.9920272410985269)
('BM', 0.9601322314049587)
('BMP', 0.8960223152022315)
('C&S', 0.9592291041505108)
('C3', 0.988031007751938)
('C4', 0.940499802916831)
('CA', 0.9495754475703325)
('CDI', 0.9602539682539682)
('CEA', 0.9640528634361233)
('CR', 0.9513142857142857)
('CTA', 0.9761749926101093)
('CVA', 0.976)
('CVP', 0.9477429643527204)
('CVS', 0.977557080474111)
('DC', 0.857524960254372)
('DIP', 1.0)
('DM', 0.9088872160934459)
('DT', 0.8623809523809524)
('EC', 0.9235301027900147)
('ER', 0.985263768115942)
('ES', 0.893344398340249)
('ET', 0.9167647058823529)
('FISH', 0.9918437081018961)
('FSH', 0.9439491865966686)
('GT', 0.9010909090909092)
('IA', 0.8748309263544243)
('IB', 0.9668136054421769)
('IM', 0.9918269910584321)
('IR', 0.9640174346201744)
('IT', 0.82790741914

  'recall', 'true', average, warn_for)


('MOM', 0.9839044936284373)
('MP', 0.6051712912774382)
('MR', 0.936567206592782)
('MS', 0.8161187465640463)
('MSSA', 0.9274444106000609)
('NAD', 0.9504752475247525)
('NP', 0.9561282882882883)
('OP', 0.9249230692761726)
('OR', 0.9590187643020593)
('OTC', 0.9535822959889351)
('PA', 0.8595478468899521)
('PAC', 0.8466464864864865)
('PCP', 0.9186217718208565)
('PD', 0.8094272300469484)
('PDA', 0.9057979094076656)
('PE', 0.9682024467906821)
('PM', 0.9401327014218009)
('PR', 0.920974113135187)
('PT', 0.9372898891501511)
('RA', 0.9298819434372734)
('RT', 0.9360345721694036)
('SA', 0.9048916408668731)
('SBP', 0.9595533561730745)
('SMA', 0.8684364640883978)
('SS', 0.9641690140845071)
('T1', 0.8962282376307027)
('T2', 0.9138276679841897)
('T3', 0.9218484605087015)
('T4', 0.9598509316770186)
('US', 0.9492394366197184)
('VAD', 0.9137777777777778)
('VBG', 1.0)


In [17]:
# Unweighted mean of the per-abbreviation F1 scores.
score_total = sum(acr_scores)
score_total / len(acr_scores)

0.9264844694022029