In [None]:
# A notebook for extracting indications from EMA data

In [1]:
import pandas as pd
import wikidataintegrator as wdi

In [2]:
# Relative location of the CSV file holding the EMA drug table.
ema_data_path = './ema_data/EMA_drugs.csv'

# Read the whole table into a DataFrame, one row per CSV record.
ema_data = pd.read_csv(filepath_or_buffer=ema_data_path)

# Show the first five rows as the cell output to eyeball the columns.
ema_data.head(n=5)

Unnamed: 0,Medicine Name,Product Number,Active Substance,Common name,Atc code,Marketing Authorisation Holder,Status,Revision number,Authorisation date,Indication,Condition Approval,Exceptional Circumstance,Is Orphan,Is Generic,Biosimilar
0,Abasaglar (previously Abasria),EMEA/H/C/002835,insulin glargine,insulin glargine,A10AE04,Eli Lilly Regional Operations GmbH,Authorised,4,09/09/2014,"Treatment of diabetes mellitus in adults, adol...",no,no,no,no,yes
1,Abilify,EMEA/H/C/000471,aripiprazole,aripiprazole,N05AX12,Otsuka Pharmaceutical Europe Ltd,Authorised,39,04/06/2004,Abilify is indicated for the treatment of schi...,no,no,no,no,no
2,Abilify Maintena,EMEA/H/C/002755,aripiprazole,aripiprazole,N05AX12,Otsuka Pharmaceutical Europe Ltd,Authorised,10,15/11/2013,Maintenance treatment of schizophrenia in adul...,no,no,no,no,no
3,Ablavar (previously Vasovist),EMEA/H/C/000601,gadofosveset trisodium,gadofosveset trisodium,V08CA,TMC Pharma Services Ltd.\n,Withdrawn,10,03/10/2005,This medicinal product is for diagnostic use o...,no,no,no,no,no
4,Abraxane,EMEA/H/C/000778,paclitaxel,paclitaxel,L01CD01,Celgene Europe Ltd\n,Authorised,19,11/01/2008,Abraxane monotherapy is indicated for the trea...,no,no,no,no,no


TODO: At some point, entries that are not EMA-authorised drugs need to be filtered out of the data. However, the information that a compound has been, e.g., withdrawn is still of interest for Wikidata and for drug-repurposing applications, so it should be kept available.

## Get labels and aliases for drugs/compounds and diseases

In [11]:
# Paged SPARQL query template (live example: http://tinyurl.com/z6397rh).
#
# Selects every Wikidata item that is an instance of chemical compound (Q11173),
# pharmaceutical drug (Q12140) or chemical substance (Q79529), or that carries an
# English P2275 value (bound to ?who_name). For each item it returns the English
# label (if any) and all English aliases joined with '|' in ?aliases.
#
# The string is a str.format() template: '{0}' receives the row OFFSET of the page
# to fetch, which is why every literal SPARQL brace is doubled ('{{' / '}}').
# The page size is fixed by the LIMIT below; callers must advance the offset in
# steps of the same size.
#
# ?aliases is the result of the GROUP_CONCAT aggregate, not a grouping key, so it
# is intentionally not listed in GROUP BY.
#
# NOTE(review): there is no ORDER BY, so the SPARQL spec does not guarantee that
# consecutive OFFSET/LIMIT pages are disjoint or complete. Adding one may be too
# expensive for the public endpoint - confirm before changing.
drug_query = '''
SELECT ?compound ?label ?who_name (GROUP_CONCAT(DISTINCT(?alias); separator="|") AS ?aliases) WHERE {{
  {{?compound wdt:P31 wd:Q11173 .}} UNION  # chemical compound
  {{?compound wdt:P31 wd:Q12140 .}} UNION  # pharmaceutical drug
  {{?compound wdt:P31 wd:Q79529 .}} UNION  # chemical substance
  {{?compound wdt:P2275 ?who_name FILTER (LANG(?who_name) = "en") .}}

  OPTIONAL {{
    ?compound rdfs:label ?label FILTER (LANG(?label) = "en") .
  }}
  OPTIONAL {{
    ?compound skos:altLabel ?alias FILTER (LANG(?alias) = "en") .
  }}
}}
GROUP BY ?compound ?label ?who_name
OFFSET {0}
LIMIT 100000
'''


# Lookup table: lower-cased drug/compound name -> Wikidata entity URI.
# WHO names, English labels and English aliases all share this one namespace;
# when the same name occurs for several items, the item seen last wins.
drug_qid_map = {}

# Fetch the result set page by page. The step (100000) must stay equal to the
# LIMIT hard-coded in drug_query, otherwise rows are skipped or fetched twice.
cc = 0
while True:
    r = wdi.wdi_core.WDItemEngine.execute_sparql_query(query=drug_query.format(100000 * cc))
    cc += 1

    bindings = r['results']['bindings']
    if len(bindings) == 0:
        # An empty page means the previous page was the last one.
        break

    for x in bindings:
        qid = x['compound']['value']

        if 'who_name' in x:
            drug_qid_map[x['who_name']['value'].lower()] = qid

        if 'label' in x:
            drug_qid_map[x['label']['value'].lower()] = qid

        if 'aliases' in x:
            # The concatenated alias string can be empty for items without
            # aliases; ''.split('|') is [''], so skip empty names instead of
            # registering a bogus '' key.
            for y in x['aliases']['value'].split('|'):
                if y:
                    drug_qid_map[y.lower()] = qid

print('Drug to QID map has {} entries!'.format(len(drug_qid_map)))

# Check how many EMA entries can be resolved to Wikidata items by name.
# An entry counts as matched only if every component of its 'Common name'
# (components of combination products are separated by ' / ') is a key of
# drug_qid_map. Unmatched names are printed together with their row index.
no_match_count = 0
combo_drug_count = 0
for count, ema_name in zip(ema_data.index, ema_data['Common name']):
    if not isinstance(ema_name, str):
        # A blank CSV cell is read as NaN (a float), which has no .lower();
        # treat it as an unmatched, non-combination entry instead of crashing.
        print(ema_name, count)
        no_match_count += 1
        continue

    if not all(y in drug_qid_map for y in ema_name.lower().split(' / ')):
        print(ema_name, count)
        no_match_count += 1

    if ' / ' in ema_name:
        combo_drug_count += 1

print(no_match_count, 'did not match')
print(combo_drug_count, 'combination drugs detected')

274072
insulin human (rDNA) 12
human insulin(rDNA) 13
pandemic influenza vaccine (H5N1) (split virion, inactivated, adjuvanted) 19
octocog alfa 22
prepandemic influenza vaccine (H5N1) (surface antigen, inactivated, adjuvanted) 27
eftrenonacog alfa 39
hepatitis-A (inactivated) and hepatitis-B (rDNA) (HAB) vaccine (adsorbed) 41
5-aminolevulinic acid hydrochloride 42
pandemic influenza vaccine (H1N1) (split virion, inactivated, adjuvanted) 53
antithrombin alfa 65
nonacog alfa 81
tasonermin 84
meningococcal group-B vaccine (rDNA, component, adsorbed) 87
epoetin theta 92
sugammadex 103
aclidinium / formoterol fumarate dihydrate 105
budesonide / formoterol fumarate dihydrate 112
influenza vaccine (H1N1)v (whole virion, inactivated, prepared in cell culture) 131
human protein C 133
human papillomavirus vaccine [types 16, 18] (recombinant, adjuvanted, adsorbed) 136
characterised viable autologous cartilage cells expanded ex vivo expressing specific marker proteins 141
reslizumab 147
C1 inhibit

In [10]:
# Sanity check: report how many distinct names the lookup table holds.
n_names = len(drug_qid_map)
print(n_names)

# Spot check: display the entity URI stored for one known name.
drug_qid_map['besilesomab']

274072


'http://www.wikidata.org/entity/Q1871058'

In [None]:
# SPARQL query for disease-like Wikidata items.
#
# Selects every item that has a MeSH (P486), Disease Ontology (P699) or ICD-10
# (P494) identifier, minus items that are instances of chemical compound
# (Q11173). For each item it returns the English label (if any) and all English
# aliases joined with '|' in ?aliases.
#
# The ICD-10 branch binds the identifier to a variable like the other two
# branches; comparing P494 against the item wd:Q11173 could never match.
# ?aliases is the result of the GROUP_CONCAT aggregate, not a grouping key, so it
# is intentionally not listed in GROUP BY.
#
# Unlike drug_query, this string is used as-is (no str.format placeholders), so
# the SPARQL braces are not doubled.
disease_query = '''
SELECT ?disease ?label (GROUP_CONCAT(DISTINCT(?alias); separator="|") AS ?aliases) WHERE {
  {?disease wdt:P486 ?mesh .} UNION  # Mesh
  {?disease wdt:P699 ?doid .} UNION  # DO
  {?disease wdt:P494 ?icd10 .} MINUS  # ICD-10
  {?disease wdt:P31 wd:Q11173 .}

  OPTIONAL {
    ?disease rdfs:label ?label FILTER (LANG(?label) = "en") .
  }
  OPTIONAL {
    ?disease skos:altLabel ?alias FILTER (LANG(?alias) = "en") .
  }
}
GROUP BY ?disease ?label
'''

# Phrases that mark an indication statement.
# NOTE(review): presumably matched against the EMA 'Indication' text; not used in
# the cells visible here - confirm against the rest of the notebook.
indication_keywords = ['is indicated']
