In [1]:
import pandas as pd
import wikidataintegrator as wdi

### Du et al. (2016): Ligand cluster-based protein network and ePlatton, a multi-target ligand finder (PMID: 27143991)

In [2]:
# Load the disease table published with Du et al. 2016 (PMID:27143991).
# The file is tab-separated and has no header row, so column names are supplied here.
# 'omim' is read explicitly as text: it is later used as a key into a dict built
# from SPARQL results (whose values are strings). Leaving the dtype to inference
# would turn the column into integers whenever no non-numeric marker is present,
# and every lookup would then miss silently.
du_data = pd.read_csv('./paper_data/Du et al 2016 PMID:27143991/DISEASETABLE-disease_name-disease_id-all_gene_symbolCSV-gene_symbolCSV-chembl_idCSV-cluster_idCSV.tab', 
                      sep='\t', header=None, names=['disease', 'omim','genes', 'primary_gene', 'chembl', 'cluster'],
                      dtype={'omim': str})
du_data.head()

Unnamed: 0,disease,omim,genes,primary_gene,chembl,cluster
0,"17,20-lyase deficiency, isolated",202110,CYP17A1|CYP17|P450C17,CYP17A1,CHEMBL1766178|CHEMBL1172847|CHEMBL390569|CHEMB...,CLUSTER5314|CLUSTER5656|CLUSTER4969|CLUSTER628...
1,"17-alpha-hydroxylase/17,20-lyase deficiency",202110,CYP17A1|CYP17|P450C17,CYP17A1,CHEMBL1766178|CHEMBL1172847|CHEMBL390569|CHEMB...,CLUSTER5314|CLUSTER5656|CLUSTER4969|CLUSTER628...
2,3MC syndrome 2,265050,COLEC11|CLK1|3MC2,CLK1,CHEMBL1435542|CHEMBL1892019|CHEMBL1802401|CHEM...,CLUSTER10427|CLUSTER7415|CLUSTER2715|CLUSTER10...
3,46XY sex reversal 3,612965,NR5A1|FTZF1|FTZ1|SF1|AD4BP|POF7|SRXY3|SPGF8,NR5A1,CHEMBL1765962|CHEMBL1715382|CHEMBL1765954|CHEM...,CLUSTER1423|CLUSTER4891|CLUSTER11468|CLUSTER31...
4,46XY sex reversal 8,614279,AKR1C2|DDH2|DD2|HAKRD|SRXY8,AKR1C2,CHEMBL686|CHEMBL1277741|CHEMBL1275703|CHEMBL33...,CLUSTER10569|CLUSTER3564|CLUSTER8033|CLUSTER78...


In [3]:
# Number of non-null values in each column of the disease table.
du_data.count()

disease         736
omim            736
genes           736
primary_gene    736
chembl          736
cluster         736
dtype: int64

In [4]:
# Fetch every Wikidata item that carries an OMIM ID (property P492).
# DISTINCT already removes duplicate rows, so the former GROUP BY clause -- which
# also listed ?umls, a variable never bound in the pattern -- has been dropped.
query = '''
select distinct ?d ?omim where { 
  ?d wdt:P492 ?omim .
}
'''

r = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)

# OMIM ID -> Wikidata entity URI. If several items share an OMIM ID, the one
# returned last overwrites the earlier ones.
omim_map = {}

for binding in r['results']['bindings']:
    omim_map[binding['omim']['value']] = binding['d']['value']

# Report each table row whose OMIM value has no Wikidata match (one line per row,
# so a value that occurs in several rows is printed several times) and collect
# the distinct unmatched values.
# NOTE(review): the '\N' entries look like the export's NULL marker -- confirm.
missing_set = set()
for omim in du_data['omim']:
    if omim not in omim_map:
        print('OMIM missing', omim)
        missing_set.add(omim)

OMIM missing 608688
OMIM missing 613933
OMIM missing \N
OMIM missing \N
OMIM missing 307200
OMIM missing 610251
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing 613546
OMIM missing 300645
OMIM missing 602483
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing 613291
OMIM missing \N
OMIM missing 610474
OMIM missing 211800
OMIM missing 614162
OMIM missing \N
OMIM missing 212070
OMIM missing \N
OMIM missing 609338
OMIM missing 610127
OMIM missing 615362
OMIM missing 300863
OMIM missing 609441
OMIM missing \N
OMIM missing 609535
OMIM missing 614306
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing 613912
OMIM missing 217090
OMIM missing 614039
OMIM missing \N
OMIM missing 614662
OMIM missing 122700
OMIM missing \N
OMIM missing 612247
OMIM missing 614564
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing 610549
OMIM missing 112250
OMIM missing 606835
OMIM missing \N
OMIM missing \N
OMIM missing \N
OMIM missing 612132
OMIM missing 614

In [5]:
# Number of distinct OMIM values from the table that have no Wikidata match.
len(missing_set)

138

In [6]:
# Retrieve every Wikidata item that has a ChEMBL ID (property P592), together
# with the label returned by the label service for language "en".
query = '''
select distinct ?c ?chembl ?cLabel where { 
  ?c wdt:P592 ?chembl . 
  SERVICE wikibase:label {bd:serviceParam wikibase:language "en" .}
  
}
'''

r = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)

# ChEMBL ID -> (Wikidata entity URI, label). A later binding with the same
# ChEMBL ID replaces an earlier one.
chembl_map = {
    hit['chembl']['value']: (hit['c']['value'], hit['cLabel']['value'])
    for hit in r['results']['bindings']
}

# One record per (table row, ChEMBL ID) pair, kept only when the row's OMIM value
# is in omim_map and the ChEMBL ID is in chembl_map. The 'chembl' column holds
# several IDs joined with '|'.
du_list = [
    {
        'Disease name': row['disease'],
        'Drug name': chembl_map[chembl_id][1],
        'disease_qid': omim_map[row['omim']],
        'drug_qid': chembl_map[chembl_id][0],
    }
    for _idx, row in du_data.iterrows()
    if row['omim'] in omim_map
    for chembl_id in row['chembl'].split('|')
    if chembl_id in chembl_map
]
        


In [7]:
# Total number of disease-drug records collected in du_list.
len(du_list)

6808

In [8]:
import pprint

# Show only the first few records as a sanity check; printing all of du_list
# would flood the notebook output with thousands of lines.
pprint.pprint(du_list[:5], width=300)

[{'Disease name': '17,20-lyase deficiency, isolated', 'Drug name': 'bw-19', 'disease_qid': 'http://www.wikidata.org/entity/Q4127184', 'drug_qid': 'http://www.wikidata.org/entity/Q27294361'},
 {'Disease name': '17,20-lyase deficiency, isolated', 'Drug name': '(2S,4R)-ketoconazole', 'disease_qid': 'http://www.wikidata.org/entity/Q4127184', 'drug_qid': 'http://www.wikidata.org/entity/Q27120779'},
 {'Disease name': '17,20-lyase deficiency, isolated', 'Drug name': 'econazole', 'disease_qid': 'http://www.wikidata.org/entity/Q4127184', 'drug_qid': 'http://www.wikidata.org/entity/Q417141'},
 {'Disease name': '17,20-lyase deficiency, isolated', 'Drug name': '(R)-orteronel', 'disease_qid': 'http://www.wikidata.org/entity/Q4127184', 'drug_qid': 'http://www.wikidata.org/entity/Q27254066'},
 {'Disease name': '17,20-lyase deficiency, isolated', 'Drug name': '6-(7-hydroxy-5,6-dihydropyrrolo[1,2-c]imidazol-7-yl)-N-methyl-2-naphthalenecarboxamide', 'disease_qid': 'http://www.wikidata.org/entity/Q412718

In [9]:
import json

# Persist the collected disease-drug records as a single JSON array.
# (The file carries a .txt extension but its content is JSON.)
with open('du_data.txt', 'w') as handle:
    json.dump(du_list, fp=handle)