In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import ngrams
import re

In [3]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [5]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_trans = TfidfTransformer(smooth_idf=True, use_idf=True)

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(max_df=.85, stop_words=stop_words, ngram_range=(1, 3))

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
SIA = SentimentIntensityAnalyzer()

## Load & inspect datasets

Dataset

In [9]:
df_clean_pmc = pd.read_csv("data/clean_pmc.csv", skiprows=0, header=None)
df_clean_noncomm_use = pd.read_csv("data/clean_noncomm_use.csv", skiprows=1, header=None)
df_clean_comm_use = pd.read_csv("data/clean_comm_use.csv", skiprows=1, header=None)
df_biorxiv_clean = pd.read_csv("data/biorxiv_clean.csv", skiprows=1, header=None)

In [10]:
df_concat = pd.concat([df_clean_pmc, df_clean_comm_use, df_clean_noncomm_use, df_biorxiv_clean])

In [11]:
data = df_concat.to_numpy()

In [12]:
data.shape

(13203, 9)

In [13]:
data[0]

array(['paper_id', 'title', 'authors', 'affiliations', 'abstract', 'text',
       'bibliography', 'raw_authors', 'raw_bibliography'], dtype=object)

In [14]:
df_drug_ratings = pd.read_csv("data/drugsComTest_raw.csv", skiprows=0, header=None)

In [15]:
drug_ratings = df_drug_ratings.to_numpy()

In [16]:
drug_ratings.shape

(53767, 7)

In [17]:
drug_ratings[1]

array(['163740', 'Mirtazapine', 'Depression',
       '"I&#039;ve tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia &amp; anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I&#039;ve actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me."',
       '10', '28-Feb-12', '22'], dtype=object)

In [18]:
drugs = np.hstack([np.unique(drug_ratings[1:,1]), np.array(['chloroquine'])])
conditions = np.unique(drug_ratings[1:,2].astype(str))[44:]

In [19]:
print(drugs.shape, conditions.shape)

(2638,) (665,)


In [73]:
def get_snippet(index, doc):
    return doc[max(index.start()-50, 0):max(index.start()-1, 0)] +\
            doc[index.end():min(index.end()+100, len(doc))]

In [74]:
def process(drug, doc):
    s = 0
    d = ""
    for index in re.finditer(drug.lower(), str(doc)):
        snippet = get_snippet(index, doc)
        sentiment = SIA.polarity_scores(snippet)
        s = sentiment['compound']
        d += ' '.join([lem.lemmatize(w) for w in tokenizer.tokenize(snippet)[1:-1] if not w in stop_words])
        
    return s, d

In [75]:
sentimemts = []
documents = []

print("Total:", len(drugs))
for i, drug in enumerate(drugs):
    print("Progress:", i, end="\r")
    s = []
    d = ""
    for doc in data[1:]:
        p = process(drug, doc[1])
        if len(p[1]) > 0: 
            s.append(p[0])
            d += p[1]
        p = process(drug, doc[4])
        if len(p[1]) > 0: 
            s.append(p[0])
            d += p[1]
        p = process(drug, doc[5])
        if len(p[1]) > 0: 
            s.append(p[0])
            d += p[1]
    
    sentimemts.append(s)
    documents.append(d)  

Total: 2638
Progress: 2637

In [27]:
documents[256]

'LGP2 construct deficient RNA binding individual domain alone capable ofg MDA5 directed signaling Pippig et al 2009 Also supporting possible unique link MDn stringency increased number SELEX cycle Increased stringency achieved byg number DNA RNA molecule relative possible target binding site well extensicking ENaC In previous experiment constructed mutant abolishing βV348 γH233R org αY458A γM432G ENaC self inhibition 14 24 The mutation eliminating self inhibition resultivated intense effort develop new surveillance method 6 Public health official nowg traditional disease surveillance e g laboratory based method nontraditional analysisextra intestinal infection caused virus commensal bacteria appear uniformly protective ing host defense Studies using viral infection probe immune shift following antibiotic induced eation HI CF neutralization test served basis arbovirus diagnosis many year g electron microscopy see section allowed visualization virus infected tixample syndrome rather dise

In [76]:
wordcounts = cvec.fit_transform(documents)

In [77]:
tf_idf = tfidf_trans.fit_transform(wordcounts)

In [78]:
voc = np.asarray(cvec.get_feature_names())

In [79]:
res = []

for i, d in enumerate(tf_idf):
    if len(documents[i]) > 0:
        res.append((np.asarray(sentimemts[i]).mean(), np.asarray(sentimemts[i]).var(), drugs[i], voc[d.toarray().argsort()[0, -20:]]))

In [83]:
print(sorted(res, reverse=False)[:10])

[(-0.8689, 0.0, 'Herceptin', array(['trastuzumab breast cancer', 'trastuzumab breast',
       'monoclonal antibody trastuzumab', 'apoptosis breast cancer',
       'antibody trastuzumab', 'patient 56 could', '56 could induce',
       'induce apoptosis breast', '56 could', 'patient 56',
       'carrying monoclonal', 'carrying monoclonal antibody',
       'nanofilaments carrying monoclonal', 'nanofilaments carrying',
       'nanofilaments', 'cancer patient 56',
       'antibody trastuzumab breast', 'cancer', 'breast', 'breast cancer'],
      dtype='<U134')), (-0.7845, 0.0, 'Lapatinib', array(['autophagy inhibition', 'effect cancer', 'netotic',
       'inhibition 26', 'cell abolished', 'abolished upon',
       'effect cancer cell', 'upon autophagy inhibition',
       'upon autophagy', 'abolished upon autophagy',
       'cancer cell abolished', 'cytotoxic effect cancer',
       'autophagy inhibition 26', '26 necrosis', '26 necrosis netotic',
       'cell abolished upon', 'necrosis netotic',

In [62]:
print(drugs[256], np.asarray(cvec.get_feature_names())[tf_idf[256].toarray().argsort()[0, -20:]])

Augmentin ['viral' 'expression' 'immune' 'response' 'egr' 'ofg' 'role' 'ing' 'cell'
 'byg']


In [59]:
tf_idf[256].toarray().argsort()[0, -10:]

array([5860903, 2127408, 2727013, 4629309, 1909015, 3766677, 4752603,
       2917731, 1065756,  961961])

In [17]:
wordcounts = []
for i, d in enumerate(documents):
    wordcounts.append(Counter(d))

In [18]:
outs = []
for drug in drugs:
    num = 0
    for i, w in enumerate(wordcounts):
        num += sentiments[i]w[drug.lower()]
    outs.append((num, drug))

In [60]:
sorted(outs, reverse=True)

[(3348, 'Glucose'),
 (2035, 'Ribavirin'),
 (1755, 'Oseltamivir'),
 (1186, 'Urea'),
 (1123, 'chloroquine'),
 (1122, 'Lysine'),
 (908, 'Silver'),
 (631, 'Adenosine'),
 (588, 'Luminal'),
 (587, 'Tetracycline'),
 (577, 'Zanamivir'),
 (558, 'Copper'),
 (540, 'Amantadine'),
 (450, 'Tryptophan'),
 (441, 'Doxycycline'),
 (433, 'Biotin'),
 (420, 'Gentamicin'),
 (388, 'Azithromycin'),
 (370, 'Phenol'),
 (350, 'Dexamethasone'),
 (327, 'Fluoride'),
 (321, 'Doxorubicin'),
 (300, 'Aspirin'),
 (288, 'Prednisolone'),
 (282, 'Vancomycin'),
 (265, 'Ketamine'),
 (257, 'Amoxicillin'),
 (253, 'Acyclovir'),
 (250, 'Methylprednisolone'),
 (208, 'Propofol'),
 (206, 'Cyclosporine'),
 (204, 'Cetuximab'),
 (202, 'Ciprofloxacin'),
 (196, 'Imiquimod'),
 (185, 'Tamoxifen'),
 (185, 'Acidophilus'),
 (184, 'Imatinib'),
 (182, 'Metronidazole'),
 (173, 'Prednisone'),
 (171, 'Rosiglitazone'),
 (171, 'Midazolam'),
 (171, 'Hydrocortisone'),
 (170, 'Montelukast'),
 (168, 'Erythromycin'),
 (158, 'Caffeine'),
 (157, 'Echinace

In [87]:
SIA.polarity_scores("totally heals cancer")['compound']

-0.6887

In [82]:
np.array([.1,.4,.2,.5,.2,.1]).var()

0.022500000000000003