In [1]:
import pandas as pd
import wikidataintegrator as wdi

## Prediction of drug-target interactions and drug repositioning via network-based inference (PMID:22589709)

In [4]:
# data from http://lmmd.ecust.edu.cn/database/dti/

path = '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/DrugBank_Global_Database/Global_Target_List_With_Score_1.xls.xlsx'

pred_data = pd.read_excel(path, header=None)


In [5]:
pred_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923
0,"Chlorophyll a-b binding protein, chloroplast ...",DB03001,DB02686,DB00819,DB03796,DB01159,DB04079,DB03381,DB02944,DB02687,...,,,,,,,,,,
1,,0.277778,0.25,0.125,0.0271673,0.0271673,0.0192308,0.0192308,0.0192308,0.0192308,...,,,,,,,,,,
2,RNASE4 protein ...,DB02379,DB04184,DB03014,DB01867,DB02061,DB03389,DB02944,DB02212,DB03754,...,,,,,,,,,,
3,,0.0417496,0.0402842,0.0279834,0.0188596,0.0187729,0.0179487,0.0175415,0.0129892,0.0129049,...,,,,,,,,,,
4,"Thiazole biosynthetic enzyme, chloroplast ...",DB04147,DB04522,DB02451,DB03381,DB02944,DB02687,DB03796,DB01728,DB02482,...,,,,,,,,,,


In [6]:
pred_data.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923],
           dtype='int64', length=1924)

In [15]:
pred_map = {}

for count, x in pred_data.iterrows():
    
    if pd.notnull(x[0]) and any([pd.notnull(z) for z in x[1:]]):
        drugs = []
        for y in x[1:]:
            if pd.notnull(y):
                drugs.append(y)
        
        pred_map.update({x[0]: drugs})
        
            
            
        

In [16]:
print(len(pred_map))

3487


In [36]:
import requests
import pprint

s = requests.Session()


def sparql_map(prop_nr):
    query = '''
    SELECT DISTINCT * WHERE {{
        ?c wdt:{} ?id .
    }}
    '''.format(prop_nr)
    
    results = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)
    
    id_map = {}
    for x in results['results']['bindings']:
        id_map.update({x['id']['value']: x['c']['value']})
        
    return id_map


def sparql_search(items, data_class, category):
    query = '''
    SELECT DISTINCT * WHERE {{
        VALUES ?c {{ wd:{0} }}
        ?c wdt:{2} wd:{1} .
        ?c wdt:P703 wd:Q15978631 .
    }}
    
    '''.format(' wd:'.join(items), data_class, category)
    
#     print(query)
    
    results = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)
    
    result_qids = []
    for x in results['results']['bindings']:
        result_qids.append(x['c']['value'].split('/').pop())
        
    return result_qids
    

def search_wikidata(search_term, data_class, category):
    # data class e.g chemical compound (Q11173), category e.g. P31 (instance of), P279 (subclass of)
    
    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'search': search_term,
        'language': 'en',
        'type': 'item',
        'format': 'json',
        'limit': 50
    }
    
    r = s.get(url=url, params=params).json()
    if len(r['search']) == 0:
        print('no results for:', search_term)
        return []
    else:
        items = list()
        for x in r['search']:
            items.append(x['title'])
        
        typefiltered_items = sparql_search(items, data_class, category)
        return typefiltered_items

drugbank_wd_map = sparql_map(prop_nr='P715')
drug_target_list = []

something_found_count = 0
for c, x in enumerate(pred_map):
    protein_items = search_wikidata(x, 'Q8054', 'P279')
#     print('protein items:', protein_items)
    sres_length = len(protein_items)
    
    for i in protein_items:
        for ii in pred_map[x]:
            if ii[2:] in drugbank_wd_map:
                drug_target_list.append({'drug_qid': drugbank_wd_map[ii[2:]], 
                                         'target_qid': 'http://www.wikidata.org/entity/' + i})
        
    if sres_length > 0:
        something_found_count += 1
    
    if c % 100 == 0:
        print('current count:', something_found_count, 'total searches:', c)
    
print(something_found_count)
    

no results for: Cytochrome b2, mitochondrial                                                                                                        
current count: 0 total searches: 0
no results for: Hepatic peroxysomal alanine:glyoxylate aminotransferase                                                                             
no results for: Proline oxidase, mitochondrial                                                                                                      
no results for: P-selectin cytoplasmic tail-associated protein                                                                                      
no results for: Cytochrome P450 3A4                                                                                                                 
no results for: Alpha-neurotoxin TX12                                                                                                               
no results for: High affinity immunoglobulin epsilon receptor subunit a

In [37]:
print(drug_target_list[1])

{'target_qid': 'http://www.wikidata.org/entity/Q21139622', 'drug_qid': 'http://www.wikidata.org/entity/Q341500'}


In [None]:
# integrate the other data sets for enzymes, ion channels, nuclear receptors, GPCRs. These are
# mappings between NCBI entrez gene IDs and KEGG chemical compound IDs