In [129]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import sys

# Show full cell contents in DataFrame output. ``None`` is the documented
# "no limit" value; the older ``-1`` sentinel is deprecated/rejected by
# current pandas releases.
pd.set_option('display.max_colwidth', None)
# Print arrays without summarisation. NumPy rejects a NaN threshold and
# recommends ``sys.maxsize`` for an untruncated representation.
np.set_printoptions(threshold=sys.maxsize)
# Load the word vectors stored in binary word2vec format at this path.
WORD2VEC_PATH = '/Word_sense_all_data/bio_nlp_vec/PubMed-shuffle-win-30.bin'
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [130]:
# Read the annotated notes; the columns used below are `abbreviation`,
# `long_form` and `sentence`.
df = pd.read_csv("/Word_sense_all_data/PatientNotes_acronym_longform.csv")

# For every abbreviation, collect the set of distinct long forms seen for it.
group_df = (
    df[['abbreviation', 'long_form']]
    .groupby('abbreviation')
    .agg(lambda forms: set(forms.tolist()))
)
group_df

Unnamed: 0_level_0,long_form
abbreviation,Unnamed: 1_level_1
AB,"{MISTAKE:abduction, atrioventricular:AV, ankle-brachial, type A, type B, X-ray finding, NAME, abortion, arteriovenous:AV, antipyrine benzocaine, blood group in ABO system, arterial blood, UNSURED SENSE}"
AC,"{acetate, abdominal circumference, angiotensin-converting enzyme:ACE, (drug) AC, assist control, acromioclavicular, adriamycin cyclophosphamide, antecubital, anticoagulation, before meals, alternating current}"
ALD,"{ad lib on demand, left anterior descending:LAD, adrenoleukodystrophy, alanine aminotransferase:ALT, acetyl lysergic acid diethylamide}"
AMA,"{antimitochondrial antibody, advanced maternal age, against medical advice}"
ASA,"{aminosalicylic acid, American Society of Anesthesiologists, acetylsalicylic acid}"
AV,"{UNSURED SENSE, aortic valve, atrioventricular, arteriovenous}"
AVR,"{aortic valve resistance, rapid ventricular response:RVR, auditory brainstem response:ABR, aortic valve regurgitation, aortic valve replacement, augmented voltage right arm, UNSURED SENSE}"
BAL,"{bronchoalveolar lavage, blood alcohol level}"
BK,"{below knee, BK (virus)}"
BM,"{breast milk, bowel movement, UNSURED SENSE, bone marrow}"


In [131]:
# Comma-separated stop-word list. The backslash continuations keep this a
# single string; the doubled commas they produce at line joins are harmless
# because empty entries are dropped when the string is parsed below.
_STOP_WORDS_TEXT = "disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem,prob, probs, med, meds,\
, pill, pills, medicine, medicines, medication, medications, treatment, treatments, caps, capsules, capsule,\
, tablet, tablets, tabs, doctor, dr, dr., doc, physician, physicians, test, tests, testing, specialist, specialists,\
, side-effect, side-effects, patient, patients, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam,\
, challenge, device, condition, conditions, suffer, suffering ,suffered, feel, feeling, prescription, prescribe,\
, prescribed, over-the-counter, a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before,\
, being, below, between, both, but, by, can, can't, cannot, could, couldn't, did, didn't, do, does, doesn't,\
, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he,\
, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into,\
, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or,\
, other, ought, our, ours , ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't,\
, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they,\
, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd,\
, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's,\
, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself,\
, yourselves, n't, 're, 've, 'd, 's, 'll, 'm"

# Split on the commas (the real separator) and strip surrounding whitespace.
# Deleting commas and splitting on spaces instead would fuse entries written
# without a space after the comma ("problem,prob" -> "problemprob") and would
# leave empty strings wherever two spaces end up adjacent.
stop_words = [
    entry.strip() for entry in _STOP_WORDS_TEXT.split(',') if entry.strip()
]

In [132]:
#from nltk.stem.snowball import SnowballStemmer
def sentence_vector(sentence):
    """Return the mean word vector of the non-stop-word tokens in ``sentence``.

    The sentence is split on whitespace, tokens present in the module-level
    ``stop_words`` are dropped, and each remaining token is looked up in the
    module-level ``model``. Tokens that raise ``KeyError`` on lookup are
    skipped.

    Parameters
    ----------
    sentence : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. If no token has a
        vector (empty sentence, only stop words, or only unknown tokens), a
        zero vector of length ``model.vector_size`` is returned instead of
        raising ``ZeroDivisionError``.
    """
    word_vectors = []
    for token in sentence.split():
        if token in stop_words:
            continue
        try:
            word_vectors.append(model[token])
        except KeyError:
            # Token is not in the model's vocabulary; leave it out of the mean.
            continue

    if not word_vectors:
        # Nothing to average: return a neutral vector so callers building a
        # feature matrix still get a row of the right length.
        return np.zeros(model.vector_size)
    return sum(word_vectors) / len(word_vectors)

def vector_breakage(sentence, vector_size=200):
    """List the tokens of ``sentence`` that ``model`` has a vector for.

    The sentence is split on whitespace and tokens present in the module-level
    ``stop_words`` are dropped. A remaining token is kept when looking it up
    in the module-level ``model`` succeeds and the vector has ``vector_size``
    components.

    Parameters
    ----------
    sentence : str
        Text to inspect.
    vector_size : int, optional
        Required vector length; defaults to 200, the value previously
        hard-coded here.

    Returns
    -------
    list of str
        The kept tokens, in their original order.
    """
    known_words = []
    for token in sentence.split():
        if token in stop_words:
            continue
        try:
            vector = model[token]
        except KeyError:
            # Unknown token. Only KeyError is caught, so unrelated failures
            # are no longer hidden by a bare ``except``.
            continue
        if len(vector) == vector_size:
            known_words.append(token)
    return known_words


In [133]:
# Restrict to the rows for the abbreviation 'MP'; the long form is the label
# and the averaged sentence vector is the feature row.
is_mp = df.abbreviation == 'MP'
df_slice = df[is_mp]
y = df_slice.long_form
X = [sentence_vector(text) for text in df_slice.sentence]


In [134]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
import sys

# Print arrays (e.g. the confusion matrix) in full. NumPy rejects a NaN
# threshold and recommends ``sys.maxsize`` for untruncated output.
np.set_printoptions(threshold=sys.maxsize)

In [135]:
# Fix the shuffle seed so the train/test partition, and every metric derived
# from it below, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [136]:
# Linear-kernel SVM fitted on the training split.
clf = SVC(C=1.0, kernel='linear', degree=1)
clf.fit(X_train, y_train)

In [137]:
# Predictions on the held-out split.
pred = clf.predict(X_test)

# Confusion matrix whose rows/columns follow the iteration order of the set
# of long forms present in the slice.
sense_labels = list(set(df_slice.long_form))
cm = confusion_matrix(y_test, pred, labels=sense_labels)

# 7-fold cross-validation of the same estimator over the full slice.
cross_val_scores = cross_val_score(clf, X, y, cv=7)



In [138]:
# Distinct long forms in the slice and how many rows carry each of them.
senses = set(df_slice.long_form)
sense_counts = [len(df_slice[df_slice.long_form == sense]) for sense in senses]

print('accuracy: {}'.format(cross_val_scores))
print()
print(senses)
print('sample size: {}'.format(sense_counts))
print()
print(cm)
print()
# Weighted F1 on the held-out split.
print(f1_score(y_test, pred, average='weighted'))

accuracy: [0.69230769 0.56       0.64383562 0.65753425 0.66666667 0.67164179
 0.66153846]

{'metabolic panel', 'milligram:mg', 'metatarsophalangeal', 'metarsophalangeal', 'metatarsophalangeal/metacarpophalangeal', 'nurse practitioner:NP', 'menstrual period', '(drug) MP', 'military police', 'metacarpophalangeal', 'mesangial proliferative', 'UNSURED SENSE', '(device) MP', 'mercaptopurine'}
sample size: [12, 1, 55, 6, 105, 11, 1, 2, 4, 179, 1, 11, 5, 107]

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  7  0  0  0  0  0  0  5  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  4  0  1  0  0  0  0 21  0  0  0  1]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0 37  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  1  0  0  0  1  0  0  0  3]
 [

  'precision', 'predicted', average, warn_for)


In [139]:
def get_patient_note_score(list_of_abbreviation, random_state=None):
    """Train and score one linear SVM per abbreviation.

    For each abbreviation, the matching rows of the module-level ``df`` are
    embedded with ``sentence_vector``, split with ``train_test_split``, and
    used to fit a linear-kernel ``SVC``. The weighted F1 on the held-out split
    is printed and collected.

    Parameters
    ----------
    list_of_abbreviation : iterable of str
        Abbreviations to evaluate, matched against ``df.abbreviation``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible scores.

    Returns
    -------
    list of float
        One weighted F1 score per abbreviation that could be evaluated, in
        input order. Abbreviations whose pipeline raises ``ValueError`` are
        skipped, with a printed note, and contribute no score.
    """
    scores = []
    for abbreviation in list_of_abbreviation:
        try:
            df_slice = df[df.abbreviation == abbreviation]
            y = df_slice.long_form
            X = [sentence_vector(text) for text in df_slice.sentence]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, random_state=random_state
            )
            clf = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)
            pred = clf.predict(X_test)
            # Compute the score once and reuse it for the log and the result.
            score = f1_score(y_test, pred, average='weighted')
        except ValueError as err:
            # Still best-effort, but report the skip so that a missing score
            # is visible rather than silently dropped from the average.
            print((abbreviation, 'skipped: {}'.format(err)))
            continue
        print((abbreviation, score))
        scores.append(score)
    return scores

In [140]:
# Score every abbreviation that appears in the grouped frame's index.
all_abbreviations = group_df.index.tolist()
acr_scores = get_patient_note_score(all_abbreviations)

  'precision', 'predicted', average, warn_for)


('AB', 0.8802748948106592)
('AC', 0.7693311596205215)
('ALD', 0.9800199004975125)
('AMA', 0.9557718388512166)
('ASA', 0.956327868852459)
('AV', 0.9218771280532938)
('AVR', 0.9561005025125627)
('BAL', 0.9733826679649464)
('BK', 0.9919757593247241)
('BM', 0.9602666666666666)
('BMP', 0.9050578512396695)
('C&S', 0.9806618013423606)
('C3', 0.9681405183270365)
('C4', 0.9443227695711589)
('CA', 0.9377341478828733)
('CDI', 0.9800275862068966)
('CEA', 0.9670787878787878)
('CR', 0.9796245210727968)
('CTA', 0.9754113824546694)
('CVA', 0.9597736001617141)
('CVP', 0.9800975609756097)
('CVS', 0.9845958702064896)
('DC', 0.878564541434888)
('DIP', 0.9917818732643271)
('DM', 0.9038888317035512)
('DT', 0.9107929373996789)
('EC', 0.9285130380536494)
('ER', 0.9917100854700854)
('ES', 0.9270181311018132)
('ET', 0.9132948604793265)
('FISH', 0.9918577777777777)
('FSH', 0.9561637426900584)
('GT', 0.9175975232198142)
('IA', 0.8423925280827969)
('IB', 0.9050666666666666)
('IM', 0.9753456197929815)
('IR', 0.9442

In [141]:
# Unweighted mean of the per-abbreviation F1 scores.
n_scored = len(acr_scores)
sum(acr_scores) / n_scored

0.9281812901129267