In [15]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
from itertools import chain
from more_itertools import chunked
from collections import Counter
from pprint import pprint
import requests
from pyquery import PyQuery as pq

In [2]:
uri_to_curie = lambda s: s.split("/")[-1].replace("_", ":")

In [3]:
# edges = pd.read_csv('edges_biolink.csv')
nodes = pd.read_csv("nodes_biolink.csv", index_col=0)

In [6]:
nodes.head()

Unnamed: 0_level_0,label,TYPE,bl_type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0038195,State Medicine,Activities & Behaviors,activity_and_behavior
C1290952,Taking medication,Activities & Behaviors,activity_and_behavior
C0085092,Parenting behavior,Activities & Behaviors,activity_and_behavior
C1096771,Murderer,Activities & Behaviors,activity_and_behavior
C1299651,Serious reportable event,Activities & Behaviors,activity_and_behavior


In [7]:
nodes.bl_type.value_counts()

chemical_substance       60964
individual_organism      41618
disease                  38245
protein                  28504
genomic_entity           22456
anatomical_entity        15181
biological_process        9163
named_thing               1342
activity_and_behavior     1011
Name: bl_type, dtype: int64

In [8]:
## parse UMLS flat file to get all UMLS xrefs
# see: https://www.ncbi.nlm.nih.gov/books/NBK9685/

In [186]:
names = "CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,X".split(",")
iter_csv = pd.read_csv("MRCONSO.RRF", delimiter="|", names=names, index_col=None, chunksize=10000)
# only get english
umls = pd.concat([chunk[chunk['LAT'] == "ENG"] for chunk in iter_csv])
# only get CUIs in our list of nodes
umls = umls[umls.CUI.isin(nodes.index)]

In [187]:
# only keep resources with >x counts
# keep_sources = {k for k,v in umls.SAB.value_counts().items() if v>=4000}
# umls = umls[umls.SAB.isin(keep_sources)]

In [188]:
umls['xref'] = umls.SAB + ":" + umls.CODE.map(str)
# fix this MSH MESH nonsense
umls.xref = umls.xref.str.replace("MSH:", "MESH:")
# NCI_FDA is UNII
umls.xref = umls.xref.str.replace("NCI_FDA:", "UNII:")

In [189]:
umls.head(2)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,X,xref
6,C0000039,ENG,P,L0000039,PF,S0007564,N,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",0,N,256.0,,MESH:D015060
7,C0000039,ENG,P,L0000039,PF,S0007564,N,A17972823,,N0000007747,,NDFRT,PT,N0000007747,"1,2-Dipalmitoylphosphatidylcholine",0,N,256.0,,NDFRT:N0000007747


In [190]:
XREF = dict(umls.groupby("CUI")['xref'].apply(set))
XREF = defaultdict(set, XREF)
print(XREF['C0591520'])

{'CHV:0000041179', 'RXNORM:151775', 'MESH:D000068298', 'RCD:x02cs'}


In [117]:
# all xrefs we get from umls
# Counter(list(chain(*[list(map(lambda x:x.split(":",1)[0], y)) for y in XREF.values()])))

### Chemicals and drugs

In [118]:
# what xrefs are on chemicals?
chem_umls = nodes[nodes.bl_type == "chemical_substance"].index
xref_chem = {k:v for k,v in XREF.items() if k in chem_umls}
print(len(chem_umls))
c = Counter(list(chain(*[list(map(lambda x:x.split(":",1)[0], y)) for y in xref_chem.values()])))
pprint(c.most_common(25))
# nearly all have a mesh ID. not much of anything else
# neither mesh nor umls have inchikeys, or inchi, or smiles or anything usefull for linking out
# blech

60964
[('MESH', 51016),
 ('SNOMEDCT_US', 14463),
 ('NCI', 10448),
 ('RXNORM', 9297),
 ('NDFRT', 8986),
 ('CHV', 8200),
 ('MMSL', 7749),
 ('UNII', 6210),
 ('NDDF', 6165),
 ('RCD', 5967),
 ('MTHSPL', 5967),
 ('SNMI', 5314),
 ('MTH', 5115),
 ('LNC', 5053),
 ('VANDF', 3921),
 ('CSP', 2807),
 ('MEDCIN', 2798),
 ('DRUGBANK', 2764),
 ('SNM', 2628),
 ('PDQ', 2284),
 ('AOD', 1860),
 ('LCH_NW', 1768),
 ('NCI_NCI-GLOSS', 1481),
 ('LCH', 1154),
 ('PSY', 546)]


In [32]:
import pandas as pd
import requests
pd.set_option("display.width", 120)
import sys, os
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
from wikidataintegrator import wdi_helpers, wdi_core, wdi_login

URL = "http://id.nlm.nih.gov/mesh/sparql"
PREFIX = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
"""

def sparql_query(query):
    params = {'query': PREFIX + query, 'format': 'JSON', 'limit': 1000, 'offset': 0}
    r = requests.get(URL, params=params)
    res = [{k: v['value'] for k, v in x.items()} for x in r.json()['results']['bindings']]
    t = tqdm()
    while True:
        t.update(1)
        params['offset'] += 1000
        r = requests.get(URL, params=params).json()['results']['bindings']
        if not r:
            break
        res.extend([{k: v['value'] for k, v in x.items()} for x in r])
    df = pd.DataFrame(res)
    return df

In [None]:
query = """
SELECT distinct ?mesh ?meshLabel ?r ?rr
FROM <http://id.nlm.nih.gov/mesh> WHERE {
  ?mesh meshv:active 1 .
  ?mesh meshv:preferredMappedTo ?p .
  ?p meshv:treeNumber ?treeNum .
  FILTER(STRSTARTS(STR(?treeNum), "http://id.nlm.nih.gov/mesh/D")) .
  ?mesh rdfs:label ?meshLabel .
  ?mesh meshv:preferredConcept [meshv:registryNumber ?r] .
  #OPTIONAL {?mesh meshv:preferredConcept [meshv:relatedRegistryNumber ?rr]}
}
"""
df = sparql_query(query)

In [None]:
df.r = df.r.replace("0", pd.np.NaN)
df.dropna(subset=["r"], inplace=True)
df = df[~df.r.str.startswith("EC ")]
df.mesh = df.mesh.str.replace("http://id.nlm.nih.gov/mesh/", "")
df.set_index("mesh", inplace=True)

In [55]:
df.to_csv("mesh_xrefs.csv")
df.head()

Unnamed: 0_level_0,meshLabel,r
mesh,Unnamed: 1_level_1,Unnamed: 2_level_1
C000603935,aripiprazole lauroxil,B786J7A343
C070576,"PML protein, human",143220-95-5
C008700,2-octyl-gamma-bromoacetoacetate,40529-35-9
C009034,prenol,55MY0HM445
C010279,1-pentyn-3-ol,4187-86-4


In [120]:
mesh_xrefs = pd.read_csv("mesh_xrefs.csv", index_col=0)
mesh_xrefs.r = mesh_xrefs.r.apply(lambda x: "CAS:" + x if "-" in x else "UNII:" + x)
mesh_xrefs = mesh_xrefs.groupby("mesh").r.apply(set).to_dict()
mesh_xrefs = {"MESH:"+k:v for k,v in mesh_xrefs.items()}
len(mesh_xrefs)

38952

In [121]:
for k,v in xref_chem.items():
    for vv in list(v):
        if vv in mesh_xrefs:
            v.update(mesh_xrefs[vv])

In [142]:
# download: 'http://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip'
unii_df = pd.read_csv("UNII Records 15Nov2017.txt", dtype=str, sep='\t', low_memory=False)
unii_df.dropna(subset=['INCHIKEY'], inplace=True)

In [143]:
unii_df.head()

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,PUBCHEM,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,INGREDIENT_TYPE
0,0001H6R5H1,CEROUS SALICYLATE,526-17-0,,,,76966289,,,,,,,3C7H5O3.Ce,RBJPAJHTYHKKTB-UHFFFAOYSA-K,[Ce+3].OC1=C(C=CC=C1)C([O-])=O.OC2=C(C=CC=C2)C...,INGREDIENT SUBSTANCE
1,000360VJE1,DI(DEHYDROABIETYL)AMINE ACETATE,53404-27-6,,,,76969106,,,,,,,C40H59N.C2H4O2,SETIUTJHVNMKFM-TUICDNFPSA-N,CC(O)=O.CC(C)C1=CC=C2C(CC[C@H]3[C@](C)(CNC[C@]...,INGREDIENT SUBSTANCE
4,00072J7XWS,GERMANIUM,7440-56-4,231-164-3,C95170,4784.0,6326954,,,,,,,Ge,QUZPNFFHZPRKJD-UHFFFAOYSA-N,[Ge],INGREDIENT SUBSTANCE
7,0009YL8Y42,"5-(3-BROMO-4,5-DIMETHOXYBENZYL)PYRIMIDINE-2,4-...",,,,,76963562,,,,,,,C13H15BrN4O3,WXKLFFZKCBICOL-UHFFFAOYSA-N,COC1=C(OC)C(OBr)=CC(CC2=C(N)N=C(N)N=C2)=C1,INGREDIENT SUBSTANCE
9,000F949089,SPIROFYLLINE,98204-48-9,,C74214,,3086451,,,,,,6195.0,C24H28N6O5,DSRGPEAMMDAUGF-UHFFFAOYSA-N,CN1C2=C(N(CC(=O)N3CC4(CCN(CCC5=CC=CC=C5)CC4)OC...,INGREDIENT SUBSTANCE


In [147]:
n=0
for k,v in tqdm(xref_chem.items()):
    for vv in list(v):
        if vv.startswith("UNII:"):
            xref = vv.replace("UNII:", "")
            s = unii_df.query("UNII == @xref").INCHIKEY
            if not s.empty:
                n+=1
                v.add("INCHIKEY:" + list(s)[0])

100%|██████████| 58721/58721 [00:41<00:00, 1422.90it/s]


In [153]:
xref_inchi = {k:v for k,v in xref_chem.items() if any(vv.startswith("INCHIKEY:") for vv in v)}
xref_inchi = {k:[vv for vv in v if vv.startswith("INCHIKEY:")][0].replace("INCHIKEY:", "") for k,v in xref_inchi.items()}
print(len(xref_inchi))
list(xref_inchi.items())[:4]

10030


[('C0048306', 'OSVMTWJCGUFAOD-KZQROQTASA-N'),
 ('C0044554', 'KWKAKUADMBZCLK-UHFFFAOYSA-N'),
 ('C0066326', 'AFBPFSWMIHJQDM-UHFFFAOYSA-N'),
 ('C0055402', 'CFKMVGJGLGKFKI-UHFFFAOYSA-N')]

In [177]:
url = "https://www.ebi.ac.uk/chembl/api/data/molecule?molecule_structures__standard_inchi_key__in={}&format=json&limit=100"
for chunk in tqdm(chunked(xref_inchi.items(), 100), total=len(xref_inchi)/100):
    chunk = dict(chunk)
    chunk = {v:k for k,v in chunk.items()}
    inchis = ",".join(chunk)
    mols = requests.get(url.format(inchis)).json()['molecules']
    for m in mols:
        chembl = m['molecule_chembl_id']
        inchi = m['molecule_structures']['standard_inchi_key']
        XREF[chunk[inchi]].add("CHEMBL:" + chembl)

101it [06:58,  4.15s/it]                           


In [179]:
len({k:v for k,v in XREF.items() if any(vv.startswith("CHEMBL:") for vv in v)})

7887

In [None]:
## UBERON

In [2]:
!wget -N http://purl.obolibrary.org/obo/uberon.owl

--2018-05-10 11:46:53--  http://purl.obolibrary.org/obo/uberon.owl
Resolving purl.obolibrary.org (purl.obolibrary.org)... 52.3.123.63
Connecting to purl.obolibrary.org (purl.obolibrary.org)|52.3.123.63|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://ontologies.berkeleybop.org/uberon.owl [following]
--2018-05-10 11:46:53--  http://ontologies.berkeleybop.org/uberon.owl
Resolving ontologies.berkeleybop.org (ontologies.berkeleybop.org)... 13.33.227.43, 13.33.227.230, 13.33.227.162, ...
Connecting to ontologies.berkeleybop.org (ontologies.berkeleybop.org)|13.33.227.43|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68929267 (66M) [application/rdf+xml]
Saving to: ‘uberon.owl’


2018-05-10 11:46:59 (11.2 MB/s) - ‘uberon.owl’ saved [68929267/68929267]



In [3]:
s = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

SELECT * WHERE {
  ?item oboInOwl:hasDbXref ?xref
}
"""
with open("query.sparql", 'w') as f:
    f.write(s)

In [4]:
!robot query --input uberon.owl --query query.sparql uberon.csv

In [36]:
df = pd.read_csv("uberon.csv")
df = df[df.xref.str.startswith("UMLS:")]
df.xref = df.xref.str.replace("UMLS:", "")
df.item = df.item.apply(uri_to_curie)
df.head()

Unnamed: 0,item,xref
6,UBERON:0006472,C1272528
66,UBERON:0001439,C0222661
87,UBERON:0001072,C0042458
144,UBERON:0001705,C0027342
179,UBERON:0002370,C0040113


In [62]:
s = df.groupby("xref")['item'].apply(set)
for umls, x in dict(s).items():
    XREF[umls].update(x)

In [64]:
XREF['C1272528']

{'SNOMEDCT_US:384765009', 'UBERON:0006472'}

In [65]:
## DOID

In [66]:
!wget -N http://purl.obolibrary.org/obo/doid.owl

--2018-05-10 12:23:52--  http://purl.obolibrary.org/obo/doid.owl
Resolving purl.obolibrary.org (purl.obolibrary.org)... 52.3.123.63
Connecting to purl.obolibrary.org (purl.obolibrary.org)|52.3.123.63|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/doid.owl [following]
--2018-05-10 12:23:52--  https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/doid.owl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.24.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.24.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26195817 (25M) [text/plain]
Saving to: ‘doid.owl’


Last-modified header missing -- time-stamps turned off.
2018-05-10 12:23:54 (20.1 MB/s) - ‘doid.owl’ saved [26195817/26195817]



In [67]:
!robot query --input doid.owl --query query.sparql doid.csv

In [84]:
df = pd.read_csv("doid.csv")
df.dropna(inplace=True)
df = df[df.xref.str.startswith("UMLS_CUI:")]
df.xref = df.xref.str.replace("UMLS_CUI:", "")
df.item = df.item.apply(uri_to_curie)
df.head()

Unnamed: 0,item,xref
0,DOID:1943,C0263518
1,DOID:12960,C1510455
7,DOID:9455,C0029591
16,DOID:9123,C0936250
43,DOID:5591,C1367774


In [85]:
s = df.groupby("xref")['item'].apply(set)
for umls, x in dict(s).items():
    XREF[umls].update(x)

In [87]:
XREF['C0263518']

{'DOID:1943',
 'ICD10CM:L65.0',
 'NCI:C112200',
 'SNOMEDCT_US:201147004',
 'SNOMEDCT_US:39479004'}

In [200]:
XREF['C0591520']

{'CHEMBL:CHEMBL1201396', 'MESH:D000068298', 'RXNORM:151775', 'RXNORM:41126'}

## proteins

In [245]:
# I did: cat MRSAT.RRF.a* > MRSAT.RRF
names = list("abcdefghijklmn")
iter_csv = pd.read_csv("MRSAT.RRF", delimiter="|", names=names, index_col=None, chunksize=1000000)
chunks = []
umls_uniprot = dict()
for chunk in tqdm(iter_csv, total=67668372/10000):
    chunk.fillna(method='ffill', inplace=True)
    chunk = chunk[chunk.i == "SWISS_PROT"]
    d = dict(zip(chunk.a, chunk.k))
    umls_uniprot.update(d)

  if self.run_code(code, result):
  if self.run_code(code, result):
 10%|█         | 68/676.68372 [01:55<17:17,  1.70s/it]


In [247]:
len(umls_uniprot)

4305