## Analysis and comparison of chemical data in Wikipedia and Wikidata

This notebook is intended for retrieving and comparing chemical compound data in Wikipedia and Wikidata

In [2]:
import requests
import pandas as pd
import json
import os

# query dbpedia for chemistry info

query = '''
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX : <http://dbpedia.org/resource/>
PREFIX dbpedia2: <http://dbpedia.org/property/>
PREFIX dbpedia: <http://dbpedia.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX wikidata: <http://www.wikidata.org/entity/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT * WHERE {{
 {{?c rdf:type dbo:ChemicalSubstance .}} UNION
 {{?c rdf:type dbo:ChemicalCompound .}}
 
 OPTIONAL {{?c rdfs:label ?label FILTER(LANG(?label) = 'en').}}
 OPTIONAL {{?c dbp:stdinchikey ?stdinchikey.}}
 OPTIONAL {{?c dbp:stdinchi ?stdinchi .}}
 OPTIONAL {{?c dbp:smiles ?smiles .}}
 OPTIONAL {{?c dbo:fdaUniiCode ?fdaUniiCode .}}
 OPTIONAL {{?c dbp:chembl ?chembl .}} 
 OPTIONAL {{?c owl:sameAs ?wikidata . FILTER(STRSTARTS(STR(?wikidata), 
                                    'http://wikidata.dbpedia.org/resource/'))}}
 OPTIONAL {{?c dbp:routesOfAdministration ?routesOfAdministration .}} 
 OPTIONAL {{?c dbp:pubchem ?pubchem .}} 
 OPTIONAL {{?c dbp:casNumber ?casNumber .}}
 OPTIONAL {{?c dbp:kegg ?kegg .}}
 OPTIONAL {{?c dbp:iupharLigand ?iupharLigand .}}
 OPTIONAL {{?c dbp:chemspiderid ?chemspiderid .}} 
 OPTIONAL {{?c dbp:synonyms ?synonyms .}}  
 OPTIONAL {{?c dbp:atcPrefix ?atcPrefix .}}
 OPTIONAL {{?c dbp:atcSuffix ?atcSuffix .}}
 

 
}}
LIMIT 100000 OFFSET {}
'''

# headers not required for this endpoint
headers = {
    'content-type': 'application/sparql-results+json',
    'charset': 'utf-8',
    'Accept': 'application/sparql-results+json'
}

url = 'http://dbpedia.org/sparql'

tmp_data = {}
for c in [0, 10000]:
    params = {
        'query': query.format(c),
        'format': 'json',
        'maxrows': '100000',
        'timeout': '60000'
    }
    r = requests.get(url, params=params)
    if tmp_data == {}:    
        tmp_data.update(r.json())
    else:
        tmp_data['results']['bindings'].extend(r.json()['results']['bindings'])
    print(len(tmp_data['results']['bindings']))
    

if not os.path.exists('./dbpedia_data'):
    os.makedirs('./dbpedia_data')
    
with open('./dbpedia_data/dbpedia_query_results.json', 'w') as of:
    json.dump(tmp_data, of)


10000
18001


In [3]:
columns = tmp_data['head']['vars']
columns

['c',
 'label',
 'stdinchikey',
 'stdinchi',
 'smiles',
 'fdaUniiCode',
 'chembl',
 'wikidata',
 'routesOfAdministration',
 'pubchem',
 'casNumber',
 'kegg',
 'iupharLigand',
 'chemspiderid',
 'synonyms',
 'atcPrefix',
 'atcSuffix']

In [4]:
dbpedia_chem_data = pd.DataFrame(columns=columns)
# dbpedia_chem_data.set_index

def get_all_single_compound_data(cmpnd):
    for x in tmp_data['results']['bindings']:
        if x['c']['value'] == cmpnd:
            yield {k: v['value'] for k, v in x.items() if k != 'c'}

count = 0
for y in tmp_data['results']['bindings']:
    cmpnd = y['c']['value']
    #print(cmpnd)
    if cmpnd in dbpedia_chem_data['c'].values:
        continue
                       
    dbpedia_chem_data.loc[count, 'c'] = cmpnd

    for zz in get_all_single_compound_data(cmpnd):
        for col, value in zz.items():
            if pd.notnull(dbpedia_chem_data.loc[count, col]):
                td = set(dbpedia_chem_data.loc[count, col].split('|'))
                td.add(value)
                dbpedia_chem_data.loc[count, col] = '|'.join(td)
            else:
                dbpedia_chem_data.loc[count, col] = value
                
    count += 1
    if count % 1000 == 0:
        print(count)
        #print(dbpedia_chem_data[count-10:count])
#         break
            
dbpedia_chem_data

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


Unnamed: 0,c,label,stdinchikey,stdinchi,smiles,fdaUniiCode,chembl,wikidata,routesOfAdministration,pubchem,casNumber,kegg,iupharLigand,chemspiderid,synonyms,atcPrefix,atcSuffix
0,http://dbpedia.org/resource/Frovatriptan,Frovatriptan,XPSQPHWEGNHMSK-SECBINFHSA-N,1,O=Cc3ccc1cc3,H82Q2D5WA7,1279,http://wikidata.dbpedia.org/resource/Q410195,http://dbpedia.org/resource/Mouth,77992,158930,D07997,7191,70378,6|-6,N02,CC07
1,http://dbpedia.org/resource/Methylprednisolone,Methylprednisolone,VHRSUDSXCMQTMA-PJHHCJLFSA-N,1,O=C\1\C=C/[C@]4C,X4W7ZR7023,650,http://wikidata.dbpedia.org/resource/Q417222,"IV, IM, IV Infusion, Oral, Rectal, Topical",6741,83,D00407,7088,6485,-111721,D07,AA01
2,http://dbpedia.org/resource/Neomycin,Neomycin,PGBHMTALBVVCIT-DPNHOFNISA-N,1,O[C@H]4O[C@@H]CN,I16QD7X297,449118,http://wikidata.dbpedia.org/resource/Q423098,http://dbpedia.org/resource/Mouth|http://dbped...,8378,1404-04-02,D08260,709,8075,Framycetin,A01,AB08
3,http://dbpedia.org/resource/Cetirizine,Cetirizine,ZKLPARSLTMPFCP-UHFFFAOYSA-N,1,Clc1cccCN3CCNCCOCCO,YO7261ME24,1000,http://wikidata.dbpedia.org/resource/Q423075,Oral,2678,83881,D07662,1222,2577,"Alerid, Alatrol, Alzene, Cetirizina, Cetirin, ...",R06,AE07
4,http://dbpedia.org/resource/Imiquimod,Imiquimod,DOUYETYNHWVLEO-UHFFFAOYSA-N,1,n3c1ccccc1c2cc3N,P1QW714R7M,1282,http://wikidata.dbpedia.org/resource/Q423417,http://dbpedia.org/resource/Topical_medication,57469,99011,D02500,5003,51809,1,D06,BB10
5,http://dbpedia.org/resource/Bicalutamide,Bicalutamide,LKJPYSCBVHEWIU-UHFFFAOYSA-N,1,O=CCCSc2ccccc2,A0Z3NAU9DP,63560,http://wikidata.dbpedia.org/resource/Q1988832,Oral,2375,90357,D00961,2863,2284,ICI-176334,L02,BB03
6,http://dbpedia.org/resource/Goserelin,Goserelin,BLCLNMBMMGCOAS-URPVMXJPSA-N,1,CCC[C@H]CN[C@@H]CN1CCC[C@H]1CNNC=O,0F65R8P09N,1201247,http://wikidata.dbpedia.org/resource/Q1992653,http://dbpedia.org/resource/Implant_(medicine),5311128,65807,D04405,3879,4470656,D-Ser6Azgly10LHRH,L02,AE03
7,http://dbpedia.org/resource/Ursodeoxycholic_acid,Ursodeoxycholic acid,RUDATBOHQWOJDD-UZVSRGJWSA-N,1,O=CCC[C@H]C,724L30Y2QR,1551,http://wikidata.dbpedia.org/resource/Q241374,oral,31401,128,D00734,7104,29131,"ursodeoxycholic acid, Actigall, Ursosan, Urso,...",A05,AA02
8,http://dbpedia.org/resource/Foscarnet,Foscarnet,ZJAOAACCNHFJAH-UHFFFAOYSA-N,1,O=CPO,8C5OQ81LWT,666,http://wikidata.dbpedia.org/resource/Q420387,http://dbpedia.org/resource/Intravenous_therapy,3415,63585,6456.0,5497,3297,"(phosphonomethanoic acid, dihydroxyphosphineca...",J05,AD01
9,http://dbpedia.org/resource/Norepinephrine_(me...,Norepinephrine (medication),SFLSHLFXELFNJZ-QMMMGPOBSA-N,1,Oc1ccc[C@@H]CN,X4W3ENH1CV,1437,,Intravenous,439260,51,D00076,505,388394,l-1--2-aminoethanol|--Norepinephrine|Noradrena...,1.0,CA03


In [5]:
dbpedia_chem_data.to_csv('dbpedia_chem_data.csv')

In [6]:
dbpedia_chem_data.count()

c                         17158
label                     17139
stdinchikey                4315
stdinchi                   4320
smiles                     4913
fdaUniiCode                3445
chembl                     3066
wikidata                  16850
routesOfAdministration     2553
pubchem                    4886
casNumber                  5197
kegg                       2201
iupharLigand               1416
chemspiderid               5200
synonyms                    880
atcPrefix                  4740
atcSuffix                  2727
dtype: int64

In [15]:
missing_wd = dbpedia_chem_data.loc[pd.isnull(dbpedia_chem_data['wikidata'].values), 'c']

print(len(missing_wd_df))
for x in missing_wd:
    print(x)

308
http://dbpedia.org/resource/Norepinephrine_(medication)
http://dbpedia.org/resource/Hydroxyprogesterone_heptanoate
http://dbpedia.org/resource/Bismuth_subcitrate/metronidazole/tetracycline
http://dbpedia.org/resource/Rimeporide
http://dbpedia.org/resource/NBOMe-mescaline
http://dbpedia.org/resource/Ixazomib
http://dbpedia.org/resource/Phenylpiracetam_hydrazide
http://dbpedia.org/resource/Saracatinib
http://dbpedia.org/resource/3',4'-Dimethoxy-α-pyrrolidinopentiophenone
http://dbpedia.org/resource/Dimethylaminopivalophenone
http://dbpedia.org/resource/Nifoxipam
http://dbpedia.org/resource/4-Fluoromethylphenidate
http://dbpedia.org/resource/Deschloroketamine
http://dbpedia.org/resource/Para-Chloromethamphetamine
http://dbpedia.org/resource/5F-NNE1
http://dbpedia.org/resource/THJ-018
http://dbpedia.org/resource/AMB-CHMINACA
http://dbpedia.org/resource/BIM-018
http://dbpedia.org/resource/FDU-PB-22
http://dbpedia.org/resource/FUB-144
http://dbpedia.org/resource/PX-1
http://dbpedia.org/r