## map to wikidata (chemicals)

In [2]:
from wikidataintegrator import wdi_helpers
import pandas as pd
# Load the UNII seed table; the first CSV column is used as the row index.
df = pd.read_csv(
    'fullSeedData-2016-06-16_UNII.csv',
    index_col=0,
    low_memory=False,
)

In [3]:
# Keep only rows whose substanceClass is 'chemical' (no mixtures).
# Take an explicit copy: later cells add columns to `df`, and assigning into
# a frame that pandas still tracks as a slice of the original raises
# SettingWithCopyWarning.
df = df.query("substanceClass == 'chemical'").copy()

In [4]:
# Fetch every InChIKey (property P235) known to Wikidata.
inchi_wdid = wdi_helpers.id_mapper("P235")
# Inverted lookup so that a Wikidata item id can be turned back into its InChIKey.
wdid_inchi = dict(zip(inchi_wdid.values(), inchi_wdid.keys()))
print(
    "{} inchikeys in wikidata".format(len(inchi_wdid))
)

151548 inchikeys in wikidata


In [5]:
# Lookup from CAS number (property P231) to Wikidata QID.
cas_wdid = wdi_helpers.id_mapper("P231")
print(
    "{} cas numbers in wikidata".format(len(cas_wdid))
)

70755 cas numbers in wikidata


In [6]:
# Resolve each primary CAS number to its Wikidata QID; a CAS number that is
# absent from the lookup yields None.
df['qid_cas'] = df.CAS_primary.map(cas_wdid.get)
# Non-null counts: rows with a primary CAS number vs. rows that resolved to a QID.
df.loc[:, ['CAS_primary', 'qid_cas']].count()

CAS_primary    54190
qid_cas        52632
dtype: int64

In [7]:
# Lookup from UNII identifier (property P652) to Wikidata QID.
unii_wdid = wdi_helpers.id_mapper("P652")
print(
    "{} UNII in wikidata".format(len(unii_wdid))
)

58967 UNII in wikidata


In [8]:
# Resolve each UNII to its Wikidata QID; a UNII that is absent from the
# lookup yields None.
df['qid_unii'] = df.UNII.map(unii_wdid.get)
# Non-null counts for the CAS column and both QID columns, side by side.
df.loc[:, ['CAS_primary', 'qid_cas', 'qid_unii']].count()

CAS_primary    54190
qid_cas        52632
qid_unii       52627
dtype: int64

In [9]:
# How often do the CAS-derived and UNII-derived QIDs point at the same item?
df.qid_cas.eq(df.qid_unii).value_counts()

True     52031
False     2400
dtype: int64

In [10]:
# Look up the Wikidata InChIKey only for rows where the CAS and UNII lookups
# agree on the item; the assignment aligns on the index, so every other row
# is left without a value.
df['inchikey_from_wd'] = df.loc[df.qid_cas.eq(df.qid_unii), 'qid_cas'].map(wdid_inchi.get)
# Non-null counts per identifier column.
df.loc[:, ['CAS', 'UNII', 'qid_cas', 'qid_unii', 'inchikey_from_wd', 'INCHIKEY_from_unii']].count()

CAS                   54431
UNII                  54431
qid_cas               52632
qid_unii              52627
inchikey_from_wd      51956
INCHIKEY_from_unii    44673
dtype: int64

In [11]:
## Cross-check the Wikidata InChIKeys against the ones from the UNII mapping
# (field INCHIKEY_from_unii, which comes from https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip)
# Only rows that carry a key from both sources take part in the comparison.
both_present = df.INCHIKEY_from_unii.notnull() & df.inchikey_from_wd.notnull()
agree = df[both_present & df.INCHIKEY_from_unii.eq(df.inchikey_from_wd)]
disagree = df[both_present & df.INCHIKEY_from_unii.ne(df.inchikey_from_wd)]
print("number of drugs with exact same inchikey: {}".format(agree.shape[0]))
print("number of drugs with different inchikey (not counting missing): {}".format(disagree.shape[0]))

number of drugs with exact same inchikey: 42304
number of drugs with different inchikey (not counting missing): 825


In [12]:
## Build the consensus `inchikey` column
# Keep a key if it is found in either Wikidata or UNII: take the Wikidata
# value and fall back to the UNII-derived one where Wikidata has none.
# The combined Series is assigned straight onto the frame instead of filling
# `df.inchikey` in place: an in-place fill on a column pulled out of the frame
# is chained assignment, which pandas warns about and which, with
# Copy-on-Write enabled, never writes back to `df`.
df['inchikey'] = df.inchikey_from_wd.fillna(df.INCHIKEY_from_unii)
# Don't use ones with disagreements: when both sources supply a key and the
# two keys differ, neither is trusted and the row is left unmapped.
df.loc[df.INCHIKEY_from_unii.notnull() & df.inchikey_from_wd.notnull() & (df.INCHIKEY_from_unii != df.inchikey_from_wd), 'inchikey'] = None
#df[['INCHIKEY_from_unii', 'inchikey_from_wd', 'inchikey']]

In [17]:
## Some InChIKeys are shared by more than one row
# Count occurrences of each key, then pull out every row whose key occurs
# more than once, ordered by key so the duplicates sit next to each other.
vc = df.inchikey.value_counts()
dupes = df.loc[df.inchikey.isin(vc[vc > 1].index)].sort_values(by="inchikey")
print(dupes.shape[0])
dupes.loc[:, ['UNII', 'CAS', 'INCHIKEY_from_unii', 'inchikey_from_wd', 'inchikey']]

12


Unnamed: 0,UNII,CAS,INCHIKEY_from_unii,inchikey_from_wd,inchikey
17465,TLN93NUR7S,74635-27-1,BAZSXBOAXJLRNH-UHFFFAOYSA-N,,BAZSXBOAXJLRNH-UHFFFAOYSA-N
51251,OI55X42ZZH,70565-74-1,BAZSXBOAXJLRNH-UHFFFAOYSA-N,,BAZSXBOAXJLRNH-UHFFFAOYSA-N
15202,EBV7H5W26H,595-40-4,GCHPUFAZSONQIV-YFKPBYRVSA-N,,GCHPUFAZSONQIV-YFKPBYRVSA-N
23949,JUL973T11C,595-39-1,,GCHPUFAZSONQIV-YFKPBYRVSA-N,GCHPUFAZSONQIV-YFKPBYRVSA-N
13416,G56VK1HF36,92623-85-3,,GJJFMKBJSRMPLA-HIFRSBDPSA-N,GJJFMKBJSRMPLA-HIFRSBDPSA-N
15238,ES1O38J3C4,96847-55-1,GJJFMKBJSRMPLA-HIFRSBDPSA-N,,GJJFMKBJSRMPLA-HIFRSBDPSA-N
23166,1FU18G2315,29804-22-6,,HFOFYNMWYRXIBP-MOPGFXCFSA-N,HFOFYNMWYRXIBP-MOPGFXCFSA-N
55000,E5FWS893X3,54910-52-0,HFOFYNMWYRXIBP-MOPGFXCFSA-N,,HFOFYNMWYRXIBP-MOPGFXCFSA-N
5462,24QAP1VCUX,5655-61-8,KGEKLUUHTZCSIP-HOSYDEDBSA-N,,KGEKLUUHTZCSIP-HOSYDEDBSA-N
48139,213431586X,76-49-3,,KGEKLUUHTZCSIP-HOSYDEDBSA-N,KGEKLUUHTZCSIP-HOSYDEDBSA-N


In [19]:
# Rows that ended up without a consensus InChIKey, either because no source
# supplied one or because the sources disagreed. Saved for manual follow-up.
missing_or_wrong = df.loc[~df.inchikey.notnull()]
missing_or_wrong.to_csv("missing_or_wrong.csv")
missing_or_wrong.shape[0]

1768

In [20]:
# Write the annotated table (all rows, including unmapped ones) as gzipped CSV.
df.to_csv(
    "fullSeedData-2016-06-16_UNII_wikidata.csv.gz",
    compression='gzip',
)

In [21]:
## Summary (chemicals)
# Rows with a consensus InChIKey out of all chemical rows.
print(
    "{} out of {} mapped to inchikey".format(
        df.inchikey.count(),
        df.shape[0],
    )
)

52663 out of 54431 mapped to inchikey


In [22]:
# Preview the first 20 rows, restricted to columns that have more than 200
# non-null values so that nearly-empty columns do not crowd the display.
# (A bare `df` line before this expression was a no-op, since only the last
# expression of a cell is displayed, so it is omitted.)
df[df.columns[df.count() > 200]].head(20)

Unnamed: 0,CAS,CAS_primary,CFR,CODEX ALIMENTARIUS (GSFA),DEA NO.,DRUG BANK,ECHA (EC/EINECS),EMA ASSESSMENT REPORTS,EPA PESTICIDE CODE,EVMPD,...,RXCUI_from_unii,INN_ID_from_unii,MF_from_unii,INCHIKEY_from_unii,SMILES_from_unii,UNII_TYPE_from_unii,qid_cas,qid_unii,inchikey_from_wd,inchikey
0,66537-39-1,66537-39-1,,,,,,,,,...,,,C13H22O,GYUZHTWCNKINPY-WCQYABFASA-N,C[C@H]1CC[C@@]2(O1)C(=CCCC2(C)C)C,INGREDIENT SUBSTANCE,Q27290647,Q27290647,GYUZHTWCNKINPY-WCQYABFASA-N,GYUZHTWCNKINPY-WCQYABFASA-N
1,12068-46-1,12068-46-1,,,,,235-099-1,,,,...,,,2Al.2Ca.3O.O4Si,WMWJKESBRPVNQB-UHFFFAOYSA-N,[O-2].[O-2].[O-2].[O-][Si]([O-])([O-])[O-].[Al...,INGREDIENT SUBSTANCE,Q27291425,Q27291425,WMWJKESBRPVNQB-UHFFFAOYSA-N,WMWJKESBRPVNQB-UHFFFAOYSA-N
2,51599-37-2|43064-17-1,51599-37-2,,,,,,,,,...,,,C19H24N2O4.ClH,,Cc1ccccc1OCC(CNCCOc2ccc(cc2)C(=O)N)O.Cl,INGREDIENT SUBSTANCE,Q27253916,Q27253916,ZATQSCRQPQXBEG-UHFFFAOYSA-N,ZATQSCRQPQXBEG-UHFFFAOYSA-N
3,504-64-3,504-64-3,,,,,,,,,...,,,C3O2,GNEVIACKFGQMHB-UHFFFAOYSA-N,C(=C=O)=C=O,INGREDIENT SUBSTANCE,Q411352,Q411352,GNEVIACKFGQMHB-UHFFFAOYSA-N,GNEVIACKFGQMHB-UHFFFAOYSA-N
4,6362-80-7,6362-80-7,,,,,228-846-8,,,,...,,,C18H20,ZOKCNEIWFQCSCM-UHFFFAOYSA-N,CC(C)(CC(=C)c1ccccc1)c2ccccc2,INGREDIENT SUBSTANCE,Q27255693,Q27255693,ZOKCNEIWFQCSCM-UHFFFAOYSA-N,ZOKCNEIWFQCSCM-UHFFFAOYSA-N
5,446255-20-5,446255-20-5,,,,,,,,,...,,,C12H3Br7O,NLBLNZDNOSSGPW-UHFFFAOYSA-N,c1cc(c(c(c1)Br)Br)Oc2c(c(c(c(c2Br)Br)Br)Br)Br,INGREDIENT SUBSTANCE,Q27282365,Q27282365,NLBLNZDNOSSGPW-UHFFFAOYSA-N,NLBLNZDNOSSGPW-UHFFFAOYSA-N
6,14536-00-6,14536-00-6,,,,,238-569-4,,,,...,,,3C18H35O2.Ce,BTVVNGIPFPKDHO-UHFFFAOYSA-K,CCCCCCCCCCCCCCCCCC(=O)[O-].CCCCCCCCCCCCCCCCCC(...,INGREDIENT SUBSTANCE,Q27260416,Q27260416,BTVVNGIPFPKDHO-UHFFFAOYSA-K,BTVVNGIPFPKDHO-UHFFFAOYSA-K
7,1079400-07-9,1079400-07-9,,,,,,,,,...,,,C17H16ClN5O2,ZRJGMDIPCQOGNI-UHFFFAOYSA-N,Cn1c(ccn1)C(=O)Nc2ccc(c(n2)N)c3cc(ccc3Cl)OC,INGREDIENT SUBSTANCE,Q27258289,Q27258289,ZRJGMDIPCQOGNI-UHFFFAOYSA-N,ZRJGMDIPCQOGNI-UHFFFAOYSA-N
8,7020-55-5,7020-55-5,,,,DB00771,,,,,...,21232.0,,C22H26NO3,,C[N+]12CCC(CC1)C(C2)OC(=O)C(c3ccccc3)(c4ccccc4)O,IONIC MOIETY,Q5132472,Q5132472,HOOSGZJRQIVJSZ-UHFFFAOYSA-N,HOOSGZJRQIVJSZ-UHFFFAOYSA-N
9,19902-91-1,19902-91-1,,,,,,,,,...,,,C15H16O5,RSIWXFIBHXYNFM-NSHDSACASA-N,COC1=CC(=O)O[C@H](C1)CCc2ccc3c(c2)OCO3,INGREDIENT SUBSTANCE,Q5276438,Q5276438,RSIWXFIBHXYNFM-NSHDSACASA-N,RSIWXFIBHXYNFM-NSHDSACASA-N
