In [3]:
import sys
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
# Show full cell contents in rendered frames. None means "no column-width
# limit"; the old -1 spelling is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
# Print arrays without truncation. NumPy rejects NaN as a threshold
# (ValueError), so use sys.maxsize instead.
np.set_printoptions(threshold=sys.maxsize)
# Load the word2vec-format binary embeddings used by the featurisation cells below.
model = KeyedVectors.load_word2vec_format('/Word_sense_all_data/bio_nlp_vec/PubMed-shuffle-win-30.bin', binary=True)

In [4]:
# Load the annotated notes: the columns used in this notebook are
# `abbreviation`, `long_form` and `sentence`.
df = pd.read_csv("/Word_sense_all_data/PatientNotes_acronym_longform.csv")

# For every abbreviation, collect the set of distinct long forms it was
# annotated with.
group_df = (
    df.loc[:, ['abbreviation', 'long_form']]
    .groupby('abbreviation')
    .agg(lambda forms: set(forms.tolist()))
)
group_df

Unnamed: 0_level_0,long_form
abbreviation,Unnamed: 1_level_1
AB,"{ankle-brachial, NAME, X-ray finding, MISTAKE:abduction, arteriovenous:AV, antipyrine benzocaine, arterial blood, UNSURED SENSE, type A, type B, abortion, blood group in ABO system, atrioventricular:AV}"
AC,"{(drug) AC, antecubital, angiotensin-converting enzyme:ACE, adriamycin cyclophosphamide, acromioclavicular, before meals, assist control, acetate, alternating current, anticoagulation, abdominal circumference}"
ALD,"{acetyl lysergic acid diethylamide, left anterior descending:LAD, ad lib on demand, alanine aminotransferase:ALT, adrenoleukodystrophy}"
AMA,"{advanced maternal age, antimitochondrial antibody, against medical advice}"
ASA,"{acetylsalicylic acid, aminosalicylic acid, American Society of Anesthesiologists}"
AV,"{arteriovenous, aortic valve, UNSURED SENSE, atrioventricular}"
AVR,"{aortic valve resistance, aortic valve replacement, aortic valve regurgitation, UNSURED SENSE, augmented voltage right arm, rapid ventricular response:RVR, auditory brainstem response:ABR}"
BAL,"{blood alcohol level, bronchoalveolar lavage}"
BK,"{BK (virus), below knee}"
BM,"{breast milk, UNSURED SENSE, bowel movement, bone marrow}"


In [5]:
# Comma-separated stop words: generic medical vocabulary first, then common
# English function words and contraction fragments.
_STOP_WORDS_TEXT = """
disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem, prob, probs, med, meds,
pill, pills, medicine, medicines, medication, medications, treatment, treatments, caps, capsules, capsule,
tablet, tablets, tabs, doctor, dr, dr., doc, physician, physicians, test, tests, testing, specialist, specialists,
side-effect, side-effects, patient, patients, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam,
challenge, device, condition, conditions, suffer, suffering, suffered, feel, feeling, prescription, prescribe,
prescribed, over-the-counter, a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before,
being, below, between, both, but, by, can, can't, cannot, could, couldn't, did, didn't, do, does, doesn't,
doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he,
he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into,
is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or,
other, ought, our, ours, ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't,
so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they,
they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd,
we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's,
whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself,
yourselves, n't, 're, 've, 'd, 's, 'll, 'm
"""

# Split on the commas themselves and strip surrounding whitespace. Removing
# the commas first and then splitting on spaces fuses entries that are
# separated by a comma without a following space (e.g. "problem,prob") and
# produces empty entries where two spaces meet.
stop_words = [word.strip() for word in _STOP_WORDS_TEXT.split(',') if word.strip()]

In [6]:
#from nltk.stem.snowball import SnowballStemmer
def sentence_vector(sentence):
    """Return the mean embedding of the non-stop-word tokens in ``sentence``.

    The sentence is split on whitespace. Tokens found in the module-level
    ``stop_words`` are dropped, and tokens for which the module-level
    ``model`` raises ``KeyError`` (out of vocabulary) are skipped.

    Args:
        sentence: Text to embed.

    Returns:
        The element-wise mean of the vectors of the retained tokens. If no
        token is retained (empty text, only stop words, or only
        out-of-vocabulary tokens) a zero vector of length
        ``model.vector_size`` is returned instead of raising
        ``ZeroDivisionError``, so every sentence yields a row of equal length.
    """
    word_vectors = []
    for word in sentence.split():
        if word in stop_words:
            continue
        try:
            word_vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue

    if not word_vectors:
        # Nothing to average; avoid dividing by zero.
        return np.zeros(model.vector_size)
    return sum(word_vectors) / len(word_vectors)

def vector_breakage(sentence, vector_size=200):
    """List the tokens of ``sentence`` that contribute an embedding.

    Applies the same whitespace split and ``stop_words`` filter as
    ``sentence_vector`` and returns the tokens themselves (not their
    vectors).

    Args:
        sentence: Text to inspect.
        vector_size: Required length of a token's vector for the token to be
            kept. Defaults to 200, the value previously hard-coded here.

    Returns:
        The retained tokens, in sentence order, whose vector in the
        module-level ``model`` has exactly ``vector_size`` components. Tokens
        for which ``model`` raises ``KeyError`` are omitted.
    """
    word_vectors_list = []
    for word in sentence.split():
        if word in stop_words:
            continue
        try:
            vector = model[word]
        except KeyError:
            # Out-of-vocabulary token; only this lookup failure is ignored so
            # unrelated errors are no longer silently swallowed.
            continue
        if len(vector) == vector_size:
            word_vectors_list.append(word)
    return word_vectors_list


In [7]:
# Keep only the rows annotated for the abbreviation 'CA'.
is_ca = df.abbreviation == 'CA'
df_slice = df[is_ca]

# Targets are the annotated long forms; features are one averaged sentence
# embedding per row.
y = df_slice.long_form
X = [sentence_vector(text) for text in df_slice.sentence]


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
# Print arrays without truncation. NumPy rejects NaN as a threshold
# (ValueError), so use sys.maxsize instead.
np.set_printoptions(threshold=sys.maxsize)

In [9]:
# Fix the shuffle seed so the held-out split is the same on every run;
# without it the reported scores change each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Linear-kernel SVM. `degree` only affects the 'poly' kernel, so the former
# `degree=1` argument had no effect and is no longer passed.
clf = SVC(C=1.0, kernel='linear')
clf.fit(X_train, y_train)

In [11]:
# Predictions for the held-out split.
pred = clf.predict(X_test)

# Confusion matrix whose row/column order follows the iteration order of the
# set of long forms present in the slice.
long_form_labels = list(set(df_slice.long_form))
cm = confusion_matrix(y_test, pred, labels=long_form_labels)

# Cross-validated scores (cv=7) computed over the full X, y.
cross_val_scores = cross_val_score(clf, X, y, cv=7)



In [12]:
# Distinct long forms in the slice; the per-class counts below are listed in
# this set's iteration order.
long_forms = set(df_slice.long_form)
sample_sizes = [len(df_slice.loc[df_slice.long_form == form]) for form in long_forms]

print('accuracy: {}'.format(cross_val_scores))
print()
print(long_forms)
print('sample size: {}'.format(sample_sizes))
print()
print(cm)
print()
# Weighted F1 on the held-out split.
print(f1_score(y_test, pred, average='weighted'))

accuracy: [0.94520548 0.89041096 0.94366197 0.95774648 0.94366197 0.94366197
 0.92857143]

{'California', 'UNSURED SENSE', 'carbohydrate antigen', 'cancer'}
sample size: [2, 2, 105, 391]

[[  0   0   0   0]
 [  0   0   0   0]
 [  0   0  18   5]
 [  0   0   1 101]]

0.9501758241758242
