In [2]:
import os, sys, re, urllib, collections, requests
import numpy as np
import pandas as pd
from Bio import Entrez, Medline
from com.ibm.watson import DESKTOP, DOCS_DIR
import com.ibm.watson.utils.pubmed_utils as pu
from bs4 import BeautifulSoup as bs
import pymongo

In [3]:
# Folder that holds the LoE spreadsheet and receives the generated dictionary TSVs.
base_dir = os.path.join(
    DESKTOP,
    "Levels_Of_Evidence",
)

In [4]:
# Connect with the driver's default settings and select the "LOE" database,
# which the GENE / THERAPY / CANCER collections below are written to.
client = pymongo.MongoClient()
db = client.get_database("LOE")

In [4]:
# Load the Levels-of-Evidence workbook; the "cancer type" column is passed through str().
loe_xls = pd.read_excel(
    os.path.join(base_dir, "Level_of_evidence_20170227_v29.7.3.xlsx"),
    converters={"cancer type": str},
)

In [None]:
# LoE Genes
# Distinct values of the "gene" column, minus two known malformed entries.
# set.remove raises KeyError if an entry is absent, which flags a changed spreadsheet.
genes = set(loe_xls["gene"])
for _malformed in ("ABL1;ABL1", "EGFR;EGFR"):
    genes.remove(_malformed)
print(genes)

In [None]:
# HGNC complete gene set.
# Downloaded from ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt
# "alias_symbol" is forced through str() so it can be split on "|" later.
hgnc_genes = pd.read_csv(
    os.path.join(DOCS_DIR, "Curation", "hgnc_complete_set.tsv"),
    sep="\t",
    converters={"alias_symbol": str},
)
print(hgnc_genes.head(2))

In [None]:
# Write gene_dictionary.tsv
# One row per LoE gene: symbol<TAB>alias1<TAB>alias2...
# Aliases come from the first HGNC row whose "symbol" equals the LoE gene.
with open(os.path.join(base_dir, "gene_dictionary.tsv"), "wb") as f:
    for symbol in genes:
        print(symbol)
        matches = hgnc_genes[hgnc_genes["symbol"] == symbol]
        aliases = []
        if len(matches) > 0:
            # An empty alias_symbol splits to [""]; drop empty pieces so no blank column is written.
            aliases = [alias for alias in matches.iloc[0]["alias_symbol"].split("|") if alias]
        # Always keep the symbol itself. The previous one-liner
        # `[symbol] + ... if exists.any() else []` parsed as
        # `([symbol] + ...) if ... else []`, so genes without an HGNC match
        # produced an empty row and were lost.
        # The newline is appended after the join so rows no longer end in a stray tab.
        f.write("\t".join([symbol] + aliases) + "\n")

In [None]:
# Load into Mongo
# Each row of gene_dictionary.tsv becomes one document in the GENE collection:
# first column -> SYMBOL, remaining non-empty columns -> SYNS.
with open(os.path.join(base_dir, "gene_dictionary.tsv")) as f:
    for raw in f:
        fields = raw.strip().split("\t")
        symbol = fields[0]
        if not symbol:
            # "".split("\t") is [""], so a blank line would otherwise be stored
            # as a gene with an empty SYMBOL.
            continue
        # Slicing past the end already yields [], so no length check is needed.
        synonyms = [syn for syn in fields[1:] if syn]
        result = db["GENE"].insert_one({"SYMBOL": symbol, "SYNS": synonyms})
        print("Inserted: " + str(result.inserted_id))

In [5]:
# Drug table export: tab-separated, no header row, so column names are supplied here.
# THERAPY, GENERIC_NAME and ALIAS_NAME are forced through str() because they are
# lower-cased / split as strings when the therapy dictionary is built.
# NOTE(review): absolute, user-specific path -- presumably this lives under DOCS_DIR
# like the HGNC file; confirm before switching to os.path.join(DOCS_DIR, ...).
wga_drugdb_dev_table = pd.read_csv(
    "/Users/singhv/Documents/Curation/Drug/WGA_ALL_DRUG_GENERAL4.tsv",
    delimiter="\t",
    header=None,
    na_values=["NULL"],
    converters={"ALIAS_NAME": str, "GENERIC_NAME": str, "THERAPY": str},
    names=[
        "THERAPY_ID", "THERAPY", "NCI_CODE", "NCI_MOA",
        "NCI_CUI", "GENERIC_NAME", "FDA_STATUS", "ALIAS_NAME",
        "CODE_NAME", "ACTIVE", "RETIRE_DATE", "MANUALLY_REVIEWED",
        "FDA_LABEL_INDICATIONS_AND_USAGE", "CREATED", "THERAPY_SHORT_NAME", "UMLS_CUI",
    ],
)

In [6]:
# LoE therapies
# Keep only the leading "DB<digits>" or "<digits>" part of each Therapy_id,
# which drops any trailing invalid characters. A value with no such prefix
# makes match() return None and the cell fail.
valid_values = re.compile("DB[0-9]+|[0-9]+")
therapies = {
    valid_values.match(unicode(raw_id)).group()
    for raw_id in loe_xls["Therapy_id"]
}
print(therapies)

set([u'5173', u'DB01169', u'760', u'DB08828', u'3049', u'5073', u'5075', u'5274', u'DB04868', u'5271', u'5272', u'4140', u'345', u'DB01259', u'DB01254', u'DB06643', u'5064', u'5146', u'DB06626', u'5143', u'DB00072', u'4031', u'4039', u'4139', u'DB05294', u'299', u'820', u'DB00619', u'5189', u'DB00755', u'1081', u'5017', u'5227', u'5154', u'DB00530', u'DB08881', u'3303', u'5253', u'DB00877', u'5256', u'5257', u'5255', u'3026', u'5258', u'5259', u'5002', u'5006', u'5005', u'4115', u'DB08896', u'5124', u'4428', u'DB01268', u'4058', u'649', u'DB02546', u'5038', u'5114', u'5037', u'5034', u'5140', u'5279', u'3005', u'DB06287', u'4102', u'DB01590', u'3057', u'DB08865', u'5238', u'4064', u'4065', u'5235', u'5236', u'1030', u'561', u'5026', u'168', u'5102', u'DB08875', u'5100', u'DB08877', u'163', u'862', u'4071', u'5226', u'726', u'833', u'4081', u'885', u'156', u'5118', u'DB08916', u'DB08912', u'DB08911', u'DB05773', u'3104', u'1059', u'5197', u'4004', u'3101', u'DB06589', u'DB01269', u'DB06

In [None]:
# Write therapy_dictionary.tsv
# One row per LoE therapy id: id<TAB>therapy name<TAB>generic name<TAB>aliases...
# All names are lower-cased and de-duplicated; an id with no match in the drug
# table is written on its own.
with open(os.path.join(base_dir, "therapy_dictionary.tsv"), "wb") as f:
    for therapy_id in therapies:
        print(therapy_id)
        matches = wga_drugdb_dev_table[wga_drugdb_dev_table['THERAPY_ID'] == therapy_id]
        names = []
        if len(matches) > 0:
            row = matches.iloc[0]
            names.append(row["THERAPY"].lower())
            # GENERIC_NAME and the "@"-separated ALIAS_NAME entries are optional:
            # skip the literal "NULL" placeholder, empty pieces, and anything
            # already collected (including duplicates among the aliases).
            optional = [row["GENERIC_NAME"]] + row["ALIAS_NAME"].split("@")
            for candidate in optional:
                if not candidate or candidate == "NULL":
                    continue
                lowered = candidate.lower()
                if lowered not in names:
                    names.append(lowered)
        # The newline is appended after the join so rows no longer end in a stray tab.
        f.write("\t".join([therapy_id] + names) + "\n")

In [None]:
# Load into Mongo
# Each row of therapy_dictionary.tsv becomes one document in the THERAPY collection:
# column 1 -> THERAPY_ID, column 2 -> NAME ("" if absent), remaining columns -> SYNS.
with open(os.path.join(base_dir, "therapy_dictionary.tsv")) as f:
    for raw in f:
        fields = raw.strip().split("\t")
        therapy_id = fields[0]
        if not therapy_id:
            # split() always returns at least one element, so the old
            # `len(line) > 0` guard never skipped anything; test the value instead
            # so blank lines are not stored as empty therapies.
            continue
        name = fields[1] if len(fields) > 1 else ""
        synonyms = fields[2:]  # already [] when there are fewer than three columns
        result = db["THERAPY"].insert_one({"THERAPY_ID": therapy_id, "NAME": name, "SYNS": synonyms})
        print("Inserted: " + str(result.inserted_id))

In [37]:
# LoE cancer types
# Distinct string values of the "cancer type" column; non-string cells are ignored.
cancer_types = set(
    value for value in loe_xls["cancer type"] if isinstance(value, basestring)
)
print(cancer_types)

set(['Hepatobiliary Cancer', 'Unknown Primary-Metastatic Malignant Neoplasm of Unknown Primary Origin', 'Prostate Cancer', 'Melanoma', 'Colorectal Cancer', 'Dermatologic-Melanoma', 'Ovarian Cancer', 'Acute Myeloid Leukemia', 'cholangiocarcinoma', 'Thoracic-Lung Carcinoma-Lung Squamous Cell Carcinoma', 'Leukemia', 'Soft Tissue Sarcoma', 'CNS Embryonal Tumor', 'Neurofibroma', 'Cancer of Unknown Primary', 'Colorectal Adenocarcinoma', 'All Liquid Tumors', 'Non-Small Cell Lung Carcinoma', 'Histiocytic Disorder', 'Endometrial Cancer', 'GIST', 'CNS Cancer', 'Thoracic-Lung Carcinoma-Lung Adenocarcinoma', 'melanoma', 'Basal Cell carcinoma', 'Chondrosarcoma', 'bladder cancer', 'Nerve Sheath Tumor', 'Embryonal Tumor', 'any', 'soft tissue sarcoma', 'Diffuse Large B-Cell Lymphoma', 'Invasive Breast Carcinoma', 'Ovary/Fallopian Tube', 'Lung Adenocarcinoma', 'Lung Squamous Cell Carcinoma', 'Breast Cancer', 'Thymic Tumor', 'Salivary Gland Cancer', 'Non-Hodgkin Lymphoma', 'colorectal cancer', 'esophage

In [29]:
# Concept-detection service setup, plus a smoke test with a nonsense string.
# cds_url, query and headers are reused by the cancer-type dictionary cell.
cds_url = "https://watsonpow01.rch.stglabs.ibm.com/services/concept_detection/api/v1/concepts?"
filter_ = "semanticType:neop"
text = "GDGFDHFDF<DREIRE"
group = "life sciences"
# Query-string parameters. The former "%s" template string assigned to `query`
# was overwritten by this dict before ever being used, so it has been removed.
query = {"version" : "2017-03-23", "filters" : filter_, "libraries" : "umls.latest", "longest_span" : "true",
         "verbose" : "false", "filter_groups" : group}
headers = {"Content-Type" : "text/plain", "Accept" : "application/json"}
result = requests.post(cds_url + urllib.urlencode(query), headers=headers, data=text).json()
print(result)
first_hit = result["unstructured"][0]
# A response without "data" (or with an empty concept list) means nothing was detected.
concepts = first_hit["data"].get("concepts") or [] if "data" in first_hit else []
if concepts:
    print(concepts[0]["cui"])

{u'unstructured': [{}]}


In [None]:
# Write cancer_types_dictionary.tsv
# One row per LoE cancer type: cancer type<TAB>UMLS CUI<TAB>synonyms...
# The CUI is the first concept returned by the concept-detection service for the
# cancer type text; synonyms come from the NCIT document with that CUI. A cancer
# type with no detected concept or no NCIT match is written on its own.
# NOTE(review): this reads db["NCIT"] (the "LOE" database) while the single-entry
# insert cell reads client["WATSON"]["NCIT"] -- confirm which database holds NCIT.
with open(os.path.join(base_dir, "cancer_types_dictionary.tsv"), "wb") as f:
    for cancer_type in cancer_types:
        print(cancer_type)
        result = requests.post(cds_url + urllib.urlencode(query), headers=headers, data=cancer_type).json()
        names = []
        first_hit = result["unstructured"][0]
        # Guard against an empty concept list so one odd response cannot abort
        # the loop and leave a half-written file.
        concepts = first_hit["data"].get("concepts") or [] if "data" in first_hit else []
        if concepts:
            umls_cui = concepts[0]["cui"]
            result1 = db["NCIT"].find_one({"UMLS_CUI" : umls_cui})
            if result1:
                names.append(result1["UMLS_CUI"])
                names += result1["SYNS"]
        # The newline is appended after the join so rows no longer end in a stray tab.
        f.write("\t".join([cancer_type] + names) + "\n")

In [7]:
# Insert a particular entry to CANCER
# Looks up NCIT code C3099 in the WATSON database and, if found, stores it in the
# CANCER collection under the name "hepatocellular carcinoma" with lower-cased,
# de-duplicated synonyms.
result = client["WATSON"]["NCIT"].find_one({"CODE" : "C3099"})
if result:
    unique_synonyms = set(synonym.lower() for synonym in result["SYNS"])
    db["CANCER"].insert_one({
        "NAME" : "hepatocellular carcinoma",
        "SYNS" : list(unique_synonyms),
        "UMLS_CUI" : result["UMLS_CUI"],
    })

In [5]:
# Load into Mongo
# Each row of cancer_types_dictionary.tsv becomes one document in the CANCER collection:
# column 1 -> NAME (lower-cased), column 2 -> UMLS_CUI ("" if absent),
# remaining columns -> SYNS (lower-cased, excluding the name itself).
with open(os.path.join(base_dir, "cancer_types_dictionary.tsv")) as f:
    for raw in f:
        fields = raw.strip().split("\t")
        cancer_type = fields[0].lower()
        if not cancer_type:
            # split() always returns at least one element, so the old
            # `len(line) > 0` guard never skipped anything; test the value instead
            # so blank lines are not stored as unnamed cancers.
            continue
        umls_cui = fields[1] if len(fields) > 1 else ""
        # fields[2:] is already empty for short rows, so no length check is needed.
        synonyms = [syn.lower() for syn in fields[2:] if syn.lower() != cancer_type]
        result = db["CANCER"].insert_one({"NAME": cancer_type, "UMLS_CUI": umls_cui, "SYNS": synonyms})
        print("Inserted: " + str(result.inserted_id))

Inserted: 58d9667b63ca025d936919ab
Inserted: 58d9667b63ca025d936919ac
Inserted: 58d9667b63ca025d936919ad
Inserted: 58d9667b63ca025d936919ae
Inserted: 58d9667b63ca025d936919af
Inserted: 58d9667b63ca025d936919b0
Inserted: 58d9667b63ca025d936919b1
Inserted: 58d9667b63ca025d936919b2
Inserted: 58d9667b63ca025d936919b3
Inserted: 58d9667b63ca025d936919b4
Inserted: 58d9667b63ca025d936919b5
Inserted: 58d9667b63ca025d936919b6
Inserted: 58d9667b63ca025d936919b7
Inserted: 58d9667b63ca025d936919b8
Inserted: 58d9667b63ca025d936919b9
Inserted: 58d9667b63ca025d936919ba
Inserted: 58d9667b63ca025d936919bb
Inserted: 58d9667b63ca025d936919bc
Inserted: 58d9667b63ca025d936919bd
Inserted: 58d9667b63ca025d936919be
Inserted: 58d9667b63ca025d936919bf
Inserted: 58d9667b63ca025d936919c0
Inserted: 58d9667b63ca025d936919c1
Inserted: 58d9667b63ca025d936919c2
Inserted: 58d9667b63ca025d936919c3
Inserted: 58d9667b63ca025d936919c4
Inserted: 58d9667b63ca025d936919c5
Inserted: 58d9667b63ca025d936919c6
Inserted: 58d9667b63