In [1]:
import pandas as pd
import wikidataintegrator as wdi

## Prediction of drug-target interactions and drug repositioning via network-based inference (PMID:22589709)

In [2]:
# data from http://lmmd.ecust.edu.cn/database/dti/

path = '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/DrugBank_Global_Database/Global_Target_List_With_Score_1.xls.xlsx'

pred_data = pd.read_excel(path, header=None)


In [3]:
pred_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923
0,"Chlorophyll a-b binding protein, chloroplast ...",DB03001,DB02686,DB00819,DB03796,DB01159,DB04079,DB03381,DB02944,DB02687,...,,,,,,,,,,
1,,0.277778,0.25,0.125,0.0271673,0.0271673,0.0192308,0.0192308,0.0192308,0.0192308,...,,,,,,,,,,
2,RNASE4 protein ...,DB02379,DB04184,DB03014,DB01867,DB02061,DB03389,DB02944,DB02212,DB03754,...,,,,,,,,,,
3,,0.0417496,0.0402842,0.0279834,0.0188596,0.0187729,0.0179487,0.0175415,0.0129892,0.0129049,...,,,,,,,,,,
4,"Thiazole biosynthetic enzyme, chloroplast ...",DB04147,DB04522,DB02451,DB03381,DB02944,DB02687,DB03796,DB01728,DB02482,...,,,,,,,,,,


In [4]:
pred_data.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923],
           dtype='int64', length=1924)

In [5]:
pred_map = {}

for count, x in pred_data.iterrows():
    
    if pd.notnull(x[0]) and any([pd.notnull(z) for z in x[1:]]):
        drugs = []
        for y in x[1:]:
            if pd.notnull(y):
                drugs.append(y)
        
        pred_map.update({x[0].strip(): drugs})
        
            
            
        

In [6]:
print(len(pred_map))

3487


In [7]:
import requests
import pprint

s = requests.Session()


def sparql_map(prop_nr):
    query = '''
    SELECT DISTINCT * WHERE {{
        ?c wdt:{} ?id .
    }}
    '''.format(prop_nr)
    
    results = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)
    
    id_map = {}
    for x in results['results']['bindings']:
        id_map.update({x['id']['value']: x['c']['value']})
        
    return id_map


def sparql_search(items, data_class, category):
    query = '''
    SELECT DISTINCT * WHERE {{
        VALUES ?c {{ wd:{0} }}
        ?c wdt:{2} wd:{1} .
        ?c wdt:P703 wd:Q15978631 .
    }}
    
    '''.format(' wd:'.join(items), data_class, category)
    
#     print(query)
    
    results = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)
    
    result_qids = []
    for x in results['results']['bindings']:
        result_qids.append(x['c']['value'].split('/').pop())
        
    return result_qids
    

def search_wikidata(search_term, data_class, category):
    # data class e.g chemical compound (Q11173), category e.g. P31 (instance of), P279 (subclass of)
    
    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'search': search_term,
        'language': 'en',
        'type': 'item',
        'format': 'json',
        'limit': 50
    }
    
    r = s.get(url=url, params=params).json()
    if len(r['search']) == 0:
        print('no results for:', search_term)
        return []
    else:
        items = list()
        for x in r['search']:
            items.append(x['title'])
        
        typefiltered_items = sparql_search(items, data_class, category)
        return typefiltered_items

drugbank_wd_map = sparql_map(prop_nr='P715')
drug_target_list = []

something_found_count = 0
for c, x in enumerate(pred_map):
    protein_items = search_wikidata(x, 'Q8054', 'P279')
#     print('protein items:', protein_items)
    sres_length = len(protein_items)
    
    for i in protein_items:
        for ii in pred_map[x]:
            if ii[2:] in drugbank_wd_map:
                drug_target_list.append({'drug_qid': drugbank_wd_map[ii[2:]], 
                                         'target_qid': 'http://www.wikidata.org/entity/' + i})
        
    if sres_length > 0:
        something_found_count += 1
    
    if c % 100 == 0:
        print('current count:', something_found_count, 'total searches:', c)
    
print(something_found_count)
    

current count: 0 total searches: 0
no results for: 50S ribosomal protein L1P                                                                                                           
no results for: Outer membrane protein p64k or PM-6                                                                                                 
no results for: Exoglucanase 1                                                                                                                      
no results for: Betaine--homocysteine S-methyltransferase 1                                                                                         
no results for: Low-affinity cationic amino acid transporter 2                                                                                      
no results for: Cytochrome P450 165B3                                                                                                               
no results for: AGR_L_3209p                                            

In [8]:
print(drug_target_list[1])

{'target_qid': 'http://www.wikidata.org/entity/Q1811387', 'drug_qid': 'http://www.wikidata.org/entity/Q29417'}


Only 962 of approximately 3,600 proteins could reliably be mapped to a human protein in Wikidata. The reason for that is that a share of the proteins is actually from bacteria and plants. The other issue is that Wikidata does frequently not have the Uniprot primary label as a label or alias, so a search does not return a result. 

In [9]:
## Better alternative: Get names from Uniprot SPARQL


In [13]:
def query_uniprot_labels():
    label_uniprot_map = {}
    query = '''
    PREFIX up:<http://purl.uniprot.org/core/> 
    PREFIX taxon:<http://purl.uniprot.org/taxonomy/> 
    PREFIX skos:<http://www.w3.org/2004/02/skos/core#> 
    
    SELECT ?protein ?name WHERE {
        ?protein a up:Protein .
        ?protein up:organism taxon:9606 .
        ?protein up:recommendedName ?recommended .
        ?recommended up:fullName ?name .
    }
    '''
    
    url = 'http://sparql.uniprot.org/sparql/'
    params = {
        'format': 'json',
        'query': query
    }
    headers = {
        'Accept': 'application/json'
    }
    
    r = requests.get(url, params=params, headers=headers).json()
    
    print(len(r['results']['bindings']))
    
    for x in r['results']['bindings']:
        label_uniprot_map.update({x['name']['value'].lower(): x['protein']['value']})
        
    return label_uniprot_map
    
prot_label_uniprot_map = query_uniprot_labels()


34535
{'membrane-spanning 4-domains subfamily a member 5': 'http://purl.uniprot.org/uniprot/Q9H3V2', 'beta-citrylglutamate synthase b': 'http://purl.uniprot.org/uniprot/Q9ULI2', 'wd repeat-containing protein 38': 'http://purl.uniprot.org/uniprot/Q5JTN6', 'metastasis-associated protein mta3': 'http://purl.uniprot.org/uniprot/Q9BTC8', 'plasminogen receptor (kt)': 'http://purl.uniprot.org/uniprot/Q9HBL7', 'keratin-associated protein 9-1': 'http://purl.uniprot.org/uniprot/A8MXZ3', 'terminal uridylyltransferase 4': 'http://purl.uniprot.org/uniprot/Q5TAX3', 'protein prenyltransferase alpha subunit repeat-containing protein 1': 'http://purl.uniprot.org/uniprot/Q7Z6K3', '6-phosphofructo-2-kinase/fructose-2,6-bisphosphatase 2': 'http://purl.uniprot.org/uniprot/O60825', 'semaphorin-3d': 'http://purl.uniprot.org/uniprot/O95025', 'tctex1 domain-containing protein 2': 'http://purl.uniprot.org/uniprot/Q8WW35', 'phospholipase a1 member a': 'http://purl.uniprot.org/uniprot/Q53H76', 'g antigen 13': 'ht

KeyError: 1

In [27]:
uniprot_map = sparql_map('P352')
alternative_drug_target_list = list()

tmp_count = 0
for x, drugbank in pred_map.items():
    prot_name = x.strip().lower()
    if prot_name in prot_label_uniprot_map:
        tmp_count += 1
        uniprot = prot_label_uniprot_map[prot_name].split('/').pop()
        if uniprot in uniprot_map:
            uniprot_qid = uniprot_map[uniprot]
            
            for db in drugbank:
                if db[2:] in drugbank_wd_map:
                    db_qid = drugbank_wd_map[db[2:]]
                    alternative_drug_target_list.append({'target_qid': uniprot_qid, 'drug_qid': db_qid})
                    
print(tmp_count)
print(len(alternative_drug_target_list))

1415
255949


In [31]:
# integrate the other data sets for enzymes, ion channels, nuclear receptors, GPCRs. These are
# mappings between NCBI entrez gene IDs and KEGG chemical compound IDs

# print(len(set(alternative_drug_target_list)))

def generate_gene_protein_map():
    query = '''
    SELECT DISTINCT * WHERE {
      ?c wdt:P351 ?entrez.
      ?c wdt:P703 wd:Q15978631.
      ?c wdt:P688 ?protein.
      ?protein wdt:P352 ?uniprot .
    }
    '''
    
    results = wdi.wdi_core.WDItemEngine.execute_sparql_query(query)
    
    id_map = {}
    for x in results['results']['bindings']:
        if x['c']['value'] in id_map:
            id_map[x['c']['value']].add(x['protein']['value'])
        else:
            id_map.update({x['entrez']['value']: {x['protein']['value']}})
        
    return id_map

gene_protein_map = generate_gene_protein_map()

files = [
    '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/GPCRs_Database/GPCR_Target_List_With_Score.xls',
    '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/Enzyme_Database/Enzyme_Target_List_With_Score.xls',
    '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/Ion_Channels_Database/Ion_Target_List_With_Score.xls',
    '/home/sebastian/jupyter-notebooks/paper_data/Cheng et al 2012 PMID:22589709/Nuclear_Receptors_Database/Nuclear_Target_List_With_Score.xls'   
]

print(len(gene_protein_map))

20175


In [36]:
kegg_map = sparql_map('P665')

for ff in files:
    data = pd.read_excel(ff, header=None)
    
    for count, x in data.iterrows():
    
        if pd.notnull(x[0]) and any([pd.notnull(z) for z in x[1:]]):
            drugs = [y for y in x[1:] if pd.notnull(y)]
            protein_qids = []
            drug_qids = [kegg_map[xx] for xx in drugs if xx in kegg_map]
            entrez_id = x[0][4:]
            if entrez_id in gene_protein_map:
                
                protein_qids = gene_protein_map[entrez_id]
                for z in protein_qids:
                    tmp = [{'target_qid': z, 'drug_qid': yy} for yy in drug_qids]
                    alternative_drug_target_list.extend(tmp)

print(len(alternative_drug_target_list))

256703
