In [35]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth', -1)

In [36]:
model = KeyedVectors.load_word2vec_format('/Word_sense_all_data/bio_nlp_vec/PubMed-shuffle-win-30.bin', binary=True)


In [37]:
w1 ="stroke"
print(model.most_similar(positive = w1, topn = 6))
model.most_similar_cosmul(positive=['hepatoma', 'brain'], negative=['liver'])

[('Stroke', 0.821251630783081), ('stoke', 0.8012742400169373), ('stroke-related', 0.7877746820449829), ('cardioembolic', 0.7663429975509644), ('TIA', 0.7593713402748108), ('cerebrovascular', 0.759026825428009)]


[('SK-N-SH', 0.8899902105331421),
 ('IMR-32', 0.887042224407196),
 ('glioma', 0.8706138134002686),
 ('astrocyte', 0.8669655919075012),
 ('Hs683', 0.8647631406784058),
 ('Neuro-2A', 0.8643868565559387),
 ('astrocytes', 0.862960696220398),
 ('U87-MG', 0.8623798489570618),
 ('U373', 0.8599711656570435),
 ('U-87', 0.8597809672355652)]

In [38]:
stop_words = "disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem,prob, probs, med, meds,\
, pill, pills, medicine, medicines, medication, medications, treatment, treatments, caps, capsules, capsule,\
, tablet, tablets, tabs, doctor, dr, dr., doc, physician, physicians, test, tests, testing, specialist, specialists,\
, side-effect, side-effects, patient, patients, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam,\
, challenge, device, condition, conditions, suffer, suffering ,suffered, feel, feeling, prescription, prescribe,\
, prescribed, over-the-counter, a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before,\
, being, below, between, both, but, by, can, can't, cannot, could, couldn't, did, didn't, do, does, doesn't,\
, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he,\
, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into,\
, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or,\
, other, ought, our, ours , ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't,\
, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they,\
, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd,\
, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's,\
, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself,\
, yourselves, n't, 're, 've, 'd, 's, 'll, 'm".replace(',','').split(' ')

In [39]:
from nltk.tokenize import TreebankWordTokenizer
#from nltk.stem.snowball import SnowballStemmer
def sentence_vector(sentence):
    word_list = TreebankWordTokenizer().tokenize(sentence)
    #stemmer = SnowballStemmer("english", ignore_stopwords=True)
    #word_list = [stemmer.stem(x) for x in word_list]
    word_list = [word for word in word_list if word not in stop_words]
    word_vectors = []
    for x in word_list:
        try:
            word_vectors.append(model[x])
        except KeyError:
            None    
    return sum(word_vectors)/len(word_vectors)
def vector_breakage(sentence):
    word_list = TreebankWordTokenizer().tokenize(sentence)
    #stemmer = SnowballStemmer("english", ignore_stopwords=True)
    #word_list = [stemmer.stem(x) for x in word_list]
    word_list = [word for word in word_list if word not in stop_words]
    word_vectors_list = []
    for x in word_list:
        try:
            if len(model[x])==200:
                word_vectors_list.append(x)
        except:
            None
        else:
            None
    return word_vectors_list

In [40]:
df = pd.read_csv("/Word_sense_all_data/DM_dataset.csv")
df = df[['sentence','label']]
df['vec'] = [sentence_vector(x) for x in df.sentence]
df.label.unique()


array(['dry matter', 'myotonic_dystrophy', 'diabetes', 'dextromethorphan',
       'dexamethasone'], dtype=object)

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

In [42]:
X = list(df.vec)
X = np.array(X)
y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [43]:
clf = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)

In [44]:
pred = clf.predict(X_test)
cm = confusion_matrix(y_test, pred,labels=list(set(df.label)))
cross_val_scores = cross_val_score(clf, X, y, cv=7)

In [45]:
print('accuracy: {}'.format(cross_val_scores))
print()
print(set(df.label))
print([len(df[df.label == x]) for x in set(df.label)])
print()
print(cm)
print()
print(f1_score(y_test,pred,average = 'weighted'))

accuracy: [0.9321267  0.94090909 0.91324201 0.92201835 0.93577982 0.95391705
 0.95833333]

{'dextromethorphan', 'dry matter', 'myotonic_dystrophy', 'diabetes', 'dexamethasone'}
[152, 520, 374, 407, 76]

[[ 30   0   0   1   0]
 [  0 131   0   0   1]
 [  1   0  88   4   1]
 [  1   1   3 100   1]
 [  2   1   0   0  17]]

0.9556710330784836


In [46]:
sentence = 'Patient was tested for dm, cystic fibrosis and other heritable diseases '
print(vector_breakage(sentence))
print(clf.predict(sentence_vector(sentence).reshape(1, -1)))

['Patient', 'tested', 'cystic', 'fibrosis', 'heritable']
['myotonic_dystrophy']


## Try

In [153]:
try_out = ''
print(vector_breakage(try_out))
print(clf.predict(sentence_vector(try_out).reshape(1, -1)))

"try_out = ''\nprint(vector_breakage(try_out))\nprint(clf.predict(sentence_vector(try_out).reshape(1, -1)))"