In [1]:
import os, sys, re, random, time
import requests
import collections
from Bio import Entrez, Medline
from com.ibm.watson import DESKTOP, DOCS_DIR
import com.ibm.watson.utils.pubmed_utils as pu
import urllib
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import pymongo

In [2]:
base_dir = os.path.join(DESKTOP, "Levels_Of_Evidence")

In [3]:
outcome_terms = ["[Rr]esponse[s]?", "[Ss]urvival", "[Oo]verall [Ss]urvival", "[Pp]artial [Rr]esponse[s]*", 
                 "[Ss]urvival [Rr]ate", "[Oo]utcome", "PFS", "OS", "ORR", "TTP", "RECIST", "[pP]rogression", "EFS"]
outcome_terms_re = re.compile("|".join(outcome_terms))
cancer_terms = ["cancer", "tumor", "tumour", "lymphoma", "leukemia", "myeloma", "carcinoma", 
                "adenoma", "sarcoma", "melanoma"]
cancer_terms_re = re.compile("|".join(cancer_terms), re.IGNORECASE)
#cancer_terms = "cancer OR tumor OR tumour OR lymphoma OR leukemia OR myeloma OR carcinoma OR adenoma OR sarcoma"
#patient_terms_re = re.compile("[mM]en|[wW]omen|[pP]atient[s]?")
#clinical, trial, demographics, phase IGNORE
#remove "Review", use Clinical Trial, Case Reports
Coding_Substitutions = "([c]\.)(\d+)?[-+*]?\d+[AGTC]>[AGTC](?!\w)"
Coding_Deletions = "[c]\.\S+(\d+|\?)del[AGCT]*|(c\.)((\d+([_+-]\d+)?)|(\(.*\))_\(.*\))del[ACTG]*"
Coding_Duplications = "[gc]\.\S+(\d+|\?)dup[AGCT]*|([cg]\.)(\d+)?(_\d+)?(\(\S*\)_\(\S*\))?dup[AGCT]*"
Coding_Indels = "(c\.)(((\d+)|((\d+_\d+)))+(delins)[AGTC]+|(((\d+)|(\d+_\d+))del[AGCT]+ins[AGTC]+)|(\[\d+[AGCT]+>[AGCT]+; \d+del[AGCT]+\]))"
Coding_Inversions = "c\.\d+_\d+inv"
Coding_Conversions = "g.\d+_\d+con(.*):g\.\d+_\d+"
Coding_Translocations = "t\((\d+|[XY]);(\d+|[XY])\)\([pq]\d+(\.\d+)?;[pq]\d+(\.\d+)?\)(\(c\.\d+\+\d+_\d+\+\d+\))"
Coding_Insertions = "c\.((\d+_\d+)|\d+\+\d+_\d+\+\d+)ins(([AGTC]+))"
Missense_Alteration = "([p]\.)?[A-Z][a-z]{0,2}\d{2,4}[A-Z][a-z]{0,2}"
Nonsense_Alteration = "([p]\.)?[A-Z][a-z]{0,2}\d{2,4}(\*|Ter)(?!\w)"
Upstream_Translation_Initiation = "p\.\S+?\-\d+"
Translation_Termination_Codon = "p\.\*\S+\*(\d+|\?)"
Deletions_Without_Nonsense = "p\.\S+(\d+|\?)del"
Duplications_Protein = "p\.\S+(\d+|\?)dup"
Insertions_Protein = "p\.\S+(\d+|\?)ins\w+"
Translocations = "t\(\S+p\.\S+\s\S+"
Complex_Rearrangements = "p\.\S+(\d+|\?)delins\w+"

all_variant_patterns = [Coding_Substitutions, Coding_Deletions, Coding_Duplications, Coding_Indels, Coding_Inversions,
                       Coding_Conversions, Coding_Translocations, Coding_Insertions, Missense_Alteration,
                       Nonsense_Alteration, Upstream_Translation_Initiation, Translation_Termination_Codon,
                       Deletions_Without_Nonsense, Duplications_Protein, Insertions_Protein, Translocations,
                       Complex_Rearrangements]
all_variant_regex = [re.compile(x) for x in all_variant_patterns]

In [4]:
client = pymongo.MongoClient()
db = client["LOE"]

In [5]:
def quote(lst):
    return [ "\"" + x.strip() + "\"" for x in lst ] if isinstance(lst, list) else "\"" + lst + "\""

def add_fieldname(lst, name):
    return [ x + "[%s]" % name for x in lst ] if isinstance(lst, list) else lst + "[%s]" % name

def make_query(field_name, gene_symbol, therapy_id=None, cancer_umls_cui=None, variant=None, variant_type=None):
    OR = " OR "
    AND = " AND "
    gene = db["GENE"].find_one({"SYMBOL" : gene_symbol})
    gene_syns = quote([gene["SYMBOL"]] + gene["SYNS"])
    gene_syns = add_fieldname(gene_syns, field_name)
    
    therapy = db["THERAPY"].find_one({"THERAPY_ID" : therapy_id})
    therapy_syns = quote([therapy["NAME"]] + therapy["SYNS"])
    therapy_syns = add_fieldname(therapy_syns, field_name)
    
    if cancer_umls_cui is not None:
        cancer = db["CANCER"].find_one({"UMLS_CUI" : cancer_umls_cui})
        cancer_syns = quote([cancer["NAME"]] + cancer["SYNS"]) 
        cancer_syns = add_fieldname(cancer_syns, field_name)
    else:
        cancer_syns = None
    
    if variant:
        variant = [add_fieldname(quote(variant), field_name)]
        
    if variant_type:
        variant_type = [add_fieldname(quote(variant_type), field_name)]
    
    query_components = [ gene_syns, therapy_syns, cancer_syns, variant, variant_type ]
    query_components = ["(" + OR.join(x) + ")" for x in query_components if x is not None]
    return AND.join(query_components)

In [6]:
def search(gene_symbol, therapy_id, therapy_name, variant, variant_type):
    pubmed_query = make_query("Title/Abstract", gene_symbol, therapy_id, therapy_name, variant, variant_type)
    pmc_query = make_query("Body - All Words", gene_symbol, therapy_id, therapy_name, variant, variant_type)
    print "Pubmed Query: "
    print pubmed_query
    print "PMC Query: "
    print pmc_query
    pubmed_ids = pu.search_pubmed(pubmed_query, retmax=100000)
    pmc_ids = pu.search_pmc(pmc_query, retmax=100000)
    converted = pu.convert_pmc_to_pmid(pmc_ids)
    union = set(converted.values()).union(set(pubmed_ids))
    return (union, converted)

In [7]:
def get_pmc_text(pmc_id):
    print "PMC ID: " + pmc_id
    html = requests.get("https://www.ncbi.nlm.nih.gov/pmc/articles/%s/" % pmc_id).text
    soup = bs(html,'html.parser')
    sections = soup.find_all("div", class_="sec")
    components = {"abstract" : "", "body" : ""}
    abs_re = re.compile("abstract", re.IGNORECASE)
    ref_re = re.compile("references", re.IGNORECASE)

    for section in sections:
        sec_title = section.find("h2")
        if sec_title:
            title = sec_title.get_text()
            parent = sec_title.parent
            if abs_re.search(title):
                components["abstract"] = parent.get_text()
            elif not ref_re.search(title):
                components["body"] += parent.get_text()

    return components

def get_IBM_pmc_text(pmc_id):
    content_url = "https://watsonpow01.rch.stglabs.ibm.com/services/content/api/v1/content/pubmed/%s/%s?user_id=genomics&token=w4genom&version=2017-03-01"
    response = requests.get(content_url % ("pmid", pmc_id)).json()
    return response

In [8]:
print get_IBM_pmc_text("27837257")

[{u'body': u'Introduction\nThe development of numerous targeted small molecule inhibitors represents an important and evolving new approach to cancer therapy. However, as tumours often have defects in multiple oncogenic signalling pathways, single agent anti-tumour activity is modest, and thus combinations of targeted agents are being investigated. Specifically, the MAPK pathway, a major proliferative pathway, and the PI3K pathway, a major survival pathway, are frequently activated in cancer and are being concomitantly targeted.\n\n\nMany MEK inhibitors, such as PD 0325901 and selumetinib (AZD6244), have been developed to target the MAPK pathway and have shown potent growth inhibitory activity in experimental systems [1\u20134]. A novel orally available small molecule allosteric MEK inhibitor WX-554 (UCB1366554), which potently inhibits MEK1 and MEK2 with a half maximal inhibitory concentration (IC50) of 4.7 and 11\xa0nM, respectively, has been developed by Wilex and UCB Celltech. WX-5

In [13]:
gene_symbol = "KRAS"
therapy_id = "DB08911"
(pmids, pmc_to_pmid_dict) = search(gene_symbol, therapy_id, None, None, None)
print len(pmids)
pubmed_articles = dict( [ (x["PMID"], x) for x in pu.get_medline(list(pmids)) if x["PMID"] != "" ] )
'''
pubmed_query = make_query("Title/Abstract", gene_symbol, therapy_id, None, None, None)
print pubmed_query
pubmed_ids = pu.search_pubmed(pubmed_query, retmax=100000)
print len(pubmed_ids)
filtered = dict()
for article in pubmed_articles:
    abstract = article.get("AB")
    if abstract and outcome_terms_re.search(abstract):
        #filtered.append(article["PMID"])
        filtered[article["PMID"]] = article
        
print len(filtered)
'''

Pubmed Query: 
("KRAS"[Title/Abstract] OR "KRAS1"[Title/Abstract]) AND ("trametinib"[Title/Abstract] OR "gsk1120212"[Title/Abstract] OR "n-(3-{3-cyclopropyl-5-[(2-fluoro-4-iodophenyl)amino]-6,8-dimethyl-2,4,7-trioxo-3,4,6,7-tetrahydropyrido[4,3-d]pyrimidin-1(2h)-yl}phenyl)acetamide"[Title/Abstract] OR "jtp-74057"[Title/Abstract] OR "mekinist"[Title/Abstract] OR "mek inhibitor gsk1120212"[Title/Abstract])
PMC Query: 
("KRAS"[Body - All Words] OR "KRAS1"[Body - All Words]) AND ("trametinib"[Body - All Words] OR "gsk1120212"[Body - All Words] OR "n-(3-{3-cyclopropyl-5-[(2-fluoro-4-iodophenyl)amino]-6,8-dimethyl-2,4,7-trioxo-3,4,6,7-tetrahydropyrido[4,3-d]pyrimidin-1(2h)-yl}phenyl)acetamide"[Body - All Words] OR "jtp-74057"[Body - All Words] OR "mekinist"[Body - All Words] OR "mek inhibitor gsk1120212"[Body - All Words])
511


'\npubmed_query = make_query("Title/Abstract", gene_symbol, therapy_id, None, None, None)\nprint pubmed_query\npubmed_ids = pu.search_pubmed(pubmed_query, retmax=100000)\nprint len(pubmed_ids)\nfiltered = dict()\nfor article in pubmed_articles:\n    abstract = article.get("AB")\n    if abstract and outcome_terms_re.search(abstract):\n        #filtered.append(article["PMID"])\n        filtered[article["PMID"]] = article\n        \nprint len(filtered)\n'

In [22]:
def get_entities(text):
    model_id = "48e69a3e-b31c-43fe-bdf9-6dffae15ec67"
    api_key = "e0759019e81f7776b95cbebf8b2f2ffe0244a3f9"
    url = "https://gateway-a.watsonplatform.net/calls/text/TextGetRankedNamedEntities?apikey=%s&verbose=1&model=%s"
    data = {"outputMode" : "json", "text" : text}
    return requests.post(url % (api_key, model_id), data=data).json()["entities"]

In [14]:
print len(pmc_to_pmid_dict)
non_reviews = []
for (pmid, article) in pubmed_articles.items():
    pub_types = article.get("PT")
    if "Review" not in pub_types:
        non_reviews.append(pmid)
        
print len(non_reviews)

496
316


In [27]:
with open(os.path.join(base_dir, "LOE_Output_%s_%s-NEW.tsv" % (gene_symbol, therapy_id)), "wb") as f:
    f.write("PMID\tPMC ID\tOutcome terms\tCancer types\tVariants (Abstract)\tVariants (Body)\tTitle\tPublication Type\n")
    count_10 = 0
    count = 0
    for (pmc_id, pmid) in [ (x, y) for (x, y) in pmc_to_pmid_dict.items() if y in pubmed_articles ]:
        print "%s: %s" % (count, pmid)
        count_10 += 1
        count += 1
        if count_10 == 10:
            time.sleep(random.randint(10, 20))
            count_10 = 0
        pubmed_article = pubmed_articles[pmid]
        '''
        raw = requests.get("https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/%s/json" % pmid).text
        result = {}
        try:
            result = json.loads(raw)
        except:
            print "Error"
            result = {"denotations" : []}
        disease_spans = [(x["span"]["begin"], x["span"]["end"]) for x in result.get("denotations") if x["obj"].startswith("Disease")]
        for (begin, end) in disease_spans:
            span = abstract[begin:end]
            if cancer_terms_re.search(span):
                diseases.add(span)
        '''
        pmc_response = get_IBM_pmc_text(pmc_id)
        #body = get_pmc_text(pmc_id)["body"]
        #abstract = result.get("text") or pubmed_article.get("AB") or ""
        abstract = pubmed_article.get("AB", "")
        diseases = set()
        if abstract != "":
            entities = get_entities(abstract)
            diseases = set([entity["text"] for entity in entities if entity["type"] == 'Cancer_Entity'])
        variants_in_abstract = set()
        variants_in_body = set()
        outcomes = set(outcome_terms_re.findall(abstract))
        for regex in all_variant_regex:
            match = regex.search(abstract)
            if match: variants_in_abstract.add(match.group())
                
            if pmc_response:
                body = pmc_response[0].get("body", "")
                match = regex.search(body)
                if match: variants_in_body.add(match.group())

        row = [pmid, pmc_id, ", ".join(outcomes), ", ".join(diseases), ", ".join(variants_in_abstract), 
               ", ".join(variants_in_body), pubmed_article.get("TI"), ", ".join(pubmed_article.get("PT")), "\n"]
        f.write("\t".join(row))

0: 24895460
1: 28149837
2: 28094001
3: 26791782
4: 27747084
5: 28261333
6: 27747083
7: 26110550
8: 27837257
9: 27690220
10: 24987059
11: 27556948
12: 21779504
13: 27035903
14: 26396549
15: 24436047
16: 25934522
17: 24423918
18: 22628411
19: 26802155
20: 23360111
21: 24204737
22: 25882375
23: 25915534
24: 26664139
25: 26582713
26: 28115009
27: 24737952
28: 25935754
29: 22983396
30: 24574860
31: 26627007
32: 25985019
33: 27267993
34: 28149764
35: 22536370
36: 24576830
37: 26360058
38: 26483300
39: 26824010
40: 27245685
41: 24612015
42: 26257864
43: 26959608
44: 26380542
45: 26347206
46: 26324703
47: 27147897
48: 26918901
49: 26621741
50: 25431423
51: 25127139
52: 22773810
53: 26316818
54: 25107706
55: 24481312
56: 24192036
57: 26730180
58: 26691657
59: 26105199
60: 23436801
61: 27003990
62: 26777152
63: 26091043
64: 27431571
65: 26556430
66: 27071537
67: 24229709
68: 23416860
69: 26860843
70: 26225238
71: 25329694
72: 27043212
73: 26842989
74: 27663730
75: 27139190
76: 28350009
77: 23588

ValueError: No JSON object could be decoded

In [13]:
pmid = "23938291"
result = requests.get("https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/%s/json" % pmid).json()

In [14]:
print result
text = result["text"]
#entities_span = [x["span"] for x in result["denotations"]]
disease_spans = [(x["span"]["begin"], x["span"]["end"]) for x in result["denotations"] if x["obj"].startswith("Disease")]
for (begin, end) in disease_spans:
    span = text[begin:end]
    print span
#mutation_entities = [x for x in entities if x.startswith("Mutation")]
#print disease_spans
#print mutation_entities

{u'sourceid': u'23938291', u'sourcedb': u'PubMed', u'denotations': [{u'span': {u'begin': 144, u'end': 151}, u'obj': u'Species:9606'}, {u'span': {u'begin': 2195, u'end': 2202}, u'obj': u'Species:9606'}, {u'span': {u'begin': 194, u'end': 198}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 203, u'end': 207}, u'obj': u'Gene:3845'}, {u'span': {u'begin': 47, u'end': 51}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 64, u'end': 68}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 69, u'end': 73}, u'obj': u'Gene:3845'}, {u'span': {u'begin': 384, u'end': 388}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 398, u'end': 402}, u'obj': u'Gene:3845'}, {u'span': {u'begin': 415, u'end': 419}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 424, u'end': 428}, u'obj': u'Gene:3845'}, {u'span': {u'begin': 560, u'end': 564}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 577, u'end': 581}, u'obj': u'Gene:3845'}, {u'span': {u'begin': 1316, u'end': 1320}, u'obj': u'Gene:1956'}, {u'span': {u'begin': 1366, u'end': 1370}, u'ob

In [4]:
GENES=["AKT", "ALK", "KRAS", "KIT", "MTOR"]
#THERAPIES = "ARQ751 OR AZD5363 OR GSK2141795"
THERAPIES=["AZD5363", "AP26113", "Binimetinib", "Imatinib", "Temsirolimus"]
VARIANTS=["E17K", "EML4-ALK", "G12C", "W557G", "T1977K"]
DISEASES=["breast cancer", "lung cancer", "lung cancer", "Gastrointestinal Stromal Tumor", "Renal Clear Cell Carcinoma"]
#query = "AKT1 AND E17K AND (%s)" % (cancer_terms)

In [7]:
html = requests.get("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4872785/").text
soup = bs(html,'html.parser')

In [None]:
sections = soup.find_all("div", class_="sec")
components = {"abstract" : "", "body" : ""}
abs_re = re.compile("abstract", re.IGNORECASE)
ref_re = re.compile("references", re.IGNORECASE)

for section in sections:
    sec_title = section.find("h2")
    if sec_title:
        title = sec_title.get_text()
        parent = sec_title.parent
        if abs_re.search(title):
            components["abstract"] = parent.get_text()
        elif not ref_re.search(title):
            components["body"] += parent.get_text()
            
print components["abstract"]
print "\n\n\n\n"
print components["body"]

In [12]:
#query = "AKT1 AND E17K AND AZD5363"
#(union, filtered) = search("ALK", "5017", "Non-Small Cell Lung Cancer", "EML4-ALK", "fusion gene") # Good example
(union, filtered) = search("BRAF", "1081", "Ovarian Cancer", "CUL1-BRAF", "fusion gene") # Nothing found
#(union, filtered) = search("BRAF", "1081", "Glioma", "KIAA1549-BRAF", "fusion gene")
#(union, filtered) = search("BRAF", "726", "melanoma", "V600E", "mutation") # XLS PMID refers to non-PUBMED/PMC content
#(union, filtered) = search("ROS1", "5167", "Non-Small Cell Lung Cancer", "TPM3-ROS1", "fusion gene") # XLS PMID is incorrect. That publication uses Crizotinib
#(union, filtered) = search("TSC1", "DB01590", "hepatocellular carcinoma", "", "")
print "Union size: %d" % len(union)
print "Filtered size: %d" % len(filtered)
print "\n".join(filtered)

Pubmed Query: 
("BRAF"[Title/Abstract] OR "BRAF1"[Title/Abstract]) AND ("selumetinib"[Title/Abstract] OR "mek inhibitor azd6244"[Title/Abstract] OR "azd6244"[Title/Abstract] OR "arry-142886"[Title/Abstract] OR "mek inhibitor azd6244"[Title/Abstract]) AND ("ovarian cancer"[Title/Abstract] OR "malignant ovarian neoplasm"[Title/Abstract] OR "cancer of ovary"[Title/Abstract] OR "cancer of the ovary"[Title/Abstract] OR "malignant neoplasm of ovary"[Title/Abstract] OR "malignant neoplasm of the ovary"[Title/Abstract] OR "malignant ovarian tumor"[Title/Abstract] OR "malignant tumor of ovary"[Title/Abstract] OR "malignant tumor of the ovary"[Title/Abstract] OR "ovarian cancer, nos"[Title/Abstract]) AND ("CUL1-BRAF"[Title/Abstract]) AND ("fusion gene"[Title/Abstract])
PMC Query: 
("BRAF"[Body - All Words] OR "BRAF1"[Body - All Words]) AND ("selumetinib"[Body - All Words] OR "mek inhibitor azd6244"[Body - All Words] OR "azd6244"[Body - All Words] OR "arry-142886"[Body - All Words] OR "mek inhibi

In [None]:
for i in range(5):
    query = "%s AND %s" % (GENES[i], VARIANTS[i])
    ids = pu.search_pubmed(query, retmax=100000)
    articles = pu.get_medline(ids)
    print "#%s" % (query)
    for article in articles:
        if "AB" in article:
            abstract = article["AB"]
            if outcome_terms_re.search(abstract) and patient_terms_re.search(abstract):
                #print "%s: Match found - %s" % (article["PMID"], ", ".join(outcome_terms_re.findall(abstract)))
                print "%s\t%s\t%s" % (article["PMID"], ", ".join(article["PT"]), article["TI"])
    print

In [3]:
journal = "geno-grp"
doi = "10.1016/j.cllc.2015.11.004" # we have access to Elsevier article "10.1016/j.cllc.2015.11.004"
response = requests.get("https://watsonpow01.rch.stglabs.ibm.com/services/content/api/v1/content/%s/fetch?userID=genomics&token=w4genom&field=doi&value=%s&version=2017-03-18" % (journal, doi)).json()



In [4]:
print response[0]["documentText"]

serial
JL
Clinical Lung Cancer
CLINICALLUNGCANCER
eng
JP__|__FR__|__US__|__ES__|__AU

 Hematology, Oncology and Palliative Medicine
 Pulmonary/Respiratory

280645
999999
fla
article
CKIBMJ00000001072/15257304/v17i5/S1525730415002685/main.xml
Elsevier Inc.
S1525-7304(15)00268-5
S1525730415002685
1-s2.0-S1525730415002685
10.1016/j.cllc.2015.11.004
S300
S300.1
FULL-TEXT
2016-10-06T18:07:34
2016-10-06T18:07:34
2016-10-06T18:07:34
NONSQUAMOUSNONSMALLCELLLUNGCANCERPATIENTSCARRYADOUBLEMUTATIONEGFREMLALKKRASFREQUENCYCLINICALPATHOLOGICALCHARACTERISTICSRESPONSETHERAPY
ULIVI
P

 
 TANAKA
 T
 2010
 651
 655
 
 
 MARCHETTI
 A
 2005
 857
 865
 
 
 MOK
 T
 2009
 947
 957
 
 
 ROSELL
 R
 2012
 239
 246
 
 
 YANG
 J
 2015
 141
 151
 
 
 SODA
 M
 2007
 561
 566
 
 
 SHAW
 A
 2013
 2385
 2394
 
 
 SHAW
 A
 2014
 1189
 1197
 
 
 GADGEEL
 S
 2014
 1119
 1128
 
 
 ROTHSCHILD
 S
 2014
 379
 381
 
 
 TAM
 I
 2006
 1647
 1653
 
 
 LEE
 W
 2010
 473
 477
 
 
 ZHANG
 Y
 2015
 61
 null
 
 
 METRO
 G
 2014
 86
 92