## map to wikidata

In [49]:
from wikidataintegrator import wdi_helpers
import pandas as pd
# Load the UNII seed-data export; the first CSV column becomes the index.
df = pd.read_csv(
    'fullSeedData-2016-06-16_UNII.csv',
    index_col=0,
    low_memory=False,
)

In [50]:
# Keep only the rows classified as plain chemicals (this drops e.g. mixtures).
df = df[df['substanceClass'] == 'chemical']

In [69]:
# Fetch the inchikey (property P235) lookup table from Wikidata.
inchi_wdid = wdi_helpers.id_mapper("P235")
# Reverse lookup; when several keys share one value, the last one iterated wins.
wdid_inchi = dict((wdid, inchikey) for inchikey, wdid in inchi_wdid.items())
print(
    "{} inchikeys in wikidata".format(len(inchi_wdid))
)

151546 inchikeys in wikidata


In [51]:
# Lookup table: CAS number (property P231) -> Wikidata QID.
cas_wdid = wdi_helpers.id_mapper("P231")
print(
    "{} cas numbers in wikidata".format(len(cas_wdid))
)

70754 cas numbers in wikidata


In [52]:
# Resolve each row's primary CAS number to a QID; unknown CAS numbers yield None.
df['qid_cas'] = df['CAS_primary'].map(cas_wdid.get)
# Non-null counts: rows with a primary CAS number vs. rows with a matched QID.
df.loc[:, ['CAS_primary', 'qid_cas']].count()

CAS_primary    54190
qid_cas        52632
dtype: int64

In [53]:
# Lookup table: UNII (property P652) -> Wikidata QID.
unii_wdid = wdi_helpers.id_mapper("P652")
print(
    "{} UNII in wikidata".format(len(unii_wdid))
)

58966 UNII in wikidata


In [54]:
# Resolve each row's UNII to a QID; unknown UNIIs yield None.
df['qid_unii'] = df['UNII'].map(unii_wdid.get)
# Non-null counts for the CAS column and both QID columns.
df.loc[:, ['CAS_primary', 'qid_cas', 'qid_unii']].count()

CAS_primary    54190
qid_cas        52632
qid_unii       52627
dtype: int64

In [67]:
# Row-wise agreement between the CAS-derived and the UNII-derived QID
# (the inchikey is added in a later step, only for rows that agree).
# NOTE(review): rows where both QIDs are missing presumably count as False here — confirm.
(df['qid_cas'] == df['qid_unii']).value_counts()

True     52029
False     2402
dtype: int64

In [72]:
# Pull the inchikey of the Wikidata item only for rows where the CAS and UNII
# lookups agree on the QID; the remaining rows stay empty after index alignment.
df['inchikey_from_wd'] = (
    df.loc[df['qid_cas'] == df['qid_unii'], 'qid_cas']
    .map(wdid_inchi.get)
)
df.loc[:, ['CAS', 'UNII', 'qid_cas', 'qid_unii', 'inchikey_from_wd']].count()

CAS                 54431
UNII                54431
qid_cas             52632
qid_unii            52627
inchikey_from_wd    51954
dtype: int64

In [74]:
# Rows that could not be matched to Wikidata through either CAS or UNII.
missing_df = df[df['qid_cas'].isnull() & df['qid_unii'].isnull()]
# Preview only the columns holding more than 200 non-null values among those rows.
missing_df.loc[:, missing_df.count() > 200].head()

Unnamed: 0,CAS,CAS_primary,ECHA (EC/EINECS),MERCK INDEX,NCI_THESAURUS,UNII,names,preferred_names,substanceClass,uuid,PT,RN,EC,NCIT,MF,INCHIKEY,SMILES,UNII_TYPE
19,25394-63-2,25394-63-2,,,,WZS9L0Z92R,"AMFECLORAL, (+)-|(+)-AMFECLORAL|BENZENEETHANAM...","AMFECLORAL, (+)-",chemical,001487f1-962d-4ec0-8052-2d648f8f4cb3,"AMFECLORAL, (+)-",25394-63-2,,,C11H12Cl3N,,CC(Cc1ccccc1)/N=C/C(Cl)(Cl)Cl,INGREDIENT SUBSTANCE
67,1307301-38-7,1307301-38-7,,,C73845,LR57574HN8,DA 1773|LA 391|SODIUM PICOSULFATE|SODIUM PICOS...,SODIUM PICOSULFATE,chemical,005a172f-4991-421b-ac58-3809e0e15dfa,SODIUM PICOSULFATE,1307301-38-7,,C73845,C18H13NO8S2.2Na.H2O,FHYUVJHZGPGDSP-UHFFFAOYSA-L,c1ccnc(c1)C(c2ccc(cc2)OS(=O)(=O)[O-])c3ccc(cc3...,INGREDIENT SUBSTANCE
80,116002-72-3,116002-72-3,,,,4XZ3RL53CZ,"3-FLAVANOL, CIS-|3-FLAVANOL, CIS-(+/-)-|2H-1-B...","3-FLAVANOL, CIS-",chemical,0069697b-4086-4ba2-83fa-0929b6c54225,"3-FLAVANOL, CIS-",116002-72-3,,,C15H14O2,,c1ccc(cc1)[C@@H]2[C@@H](Cc3ccccc3O2)O,INGREDIENT SUBSTANCE
146,66899-87-4,66899-87-4,,,,1214RHT70G,"4H-BENZO(A)QUINOLIZINE, 3-ETHYL-1,6,7,11B-TETR...","DEHYDROISOEMETINE, (+/-)-",chemical,00c30b82-2d63-4876-8350-50832c4e8b36,"DEHYDROISOEMETINE, (+/-)-",66899-87-4,,,C29H38N2O4,,CCC1=C(C[C@H]2c3cc(c(cc3CCN2C1)OC)OC)C[C@H]4c5...,INGREDIENT SUBSTANCE
162,90242-86-7|73365-47-6|72244-00-9,90242-86-7,,,,J51L7OP374,"3-BUTEN-2-ONE, 4-((1R,3R)-2,2,3-TRIMETHYL-6-ME...",".GAMMA.-IRONE, TRANS-(+/-)-",chemical,00d4e6a8-8944-4267-85e1-54274e2aee38,".GAMMA.-IRONE, TRANS-(+/-)-",90242-86-7,,,C14H22O,,C[C@@H]1CCC(=C)[C@H](C1(C)C)/C=C/C(=O)C,INGREDIENT SUBSTANCE


In [75]:
# For the unmatched rows, count how many still carry an inchikey
# that came from the UNII mapping file.
missing_df.loc[:, ['UNII', 'INCHIKEY']].count()

UNII        1410
INCHIKEY     748
dtype: int64

### mixtures

In [76]:
# Reload the seed data from disk and keep only the rows classified as mixtures.
# Note: this rebinds `df`, replacing the chemicals-only frame used above.
df = pd.read_csv(
    'fullSeedData-2016-06-16_UNII.csv',
    index_col=0,
    low_memory=False,
).query("substanceClass == 'mixture'")

### Example
[3-Heptene](https://chem.nlm.nih.gov/chemidplus/rn/592-78-9) is a mixture of
[3-Heptene, (3E)-](https://chem.nlm.nih.gov/chemidplus/rn/14686-14-7) and
[3-Heptene, (3Z)-](https://chem.nlm.nih.gov/chemidplus/rn/7642-10-6)

#### in wikidata:
[(3E)-3-heptene](https://www.wikidata.org/wiki/Q27160264)

[(3Z)-3-heptene](https://www.wikidata.org/wiki/Q27254474)

In [100]:
def mapper(uniis):
    """Return the Wikidata QID (None when unknown) for each UNII in `uniis`."""
    return [unii_wdid.get(unii) for unii in uniis]

# "mixture_UNII" holds the component UNIIs separated by "|"; resolve each to a QID.
wdid_mixture_from_unii = df['mixture_UNII'].str.split("|").map(mapper)
wdid_mixture_from_unii.head()

66                                [Q27254147, Q27271508]
76     [Q52822, Q27103409, Q27103331, Q27281243, Q272...
172                               [Q27279986, Q27275260]
174                               [Q27252036, Q27264264]
186                       [None, None, None, None, None]
Name: mixture_UNII, dtype: object

In [101]:
# Collapse every QID list back into a "|"-separated string; a mixture with
# even one unresolved component becomes None as a whole.
wdid_mixture_from_unii = wdid_mixture_from_unii.map(
    lambda qids: None if None in qids else "|".join(qids)
)
wdid_mixture_from_unii.head()

66                                   Q27254147|Q27271508
76     Q52822|Q27103409|Q27103331|Q27281243|Q27254004...
172                                  Q27279986|Q27275260
174                                  Q27252036|Q27264264
186                                                 None
Name: mixture_UNII, dtype: object

In [103]:
# True = every component of the mixture resolved to a QID, False = at least one did not.
(~wdid_mixture_from_unii.isnull()).value_counts()

True     1115
False     257
Name: mixture_UNII, dtype: int64

In [109]:
def mapper(wdids):
    """Return the inchikey (None when unknown) for each QID in `wdids`."""
    return [wdid_inchi.get(wdid) for wdid in wdids]

# Only mixtures whose components all resolved to a QID are mapped to inchikeys.
inchis = wdid_mixture_from_unii.dropna().str.split("|").map(mapper)
# Same all-or-nothing rule as before: one missing inchikey voids the whole mixture.
inchis = inchis.map(lambda keys: None if None in keys else "|".join(keys))
inchis.head()

66     OUILVKYDBNPYBM-JDOFTSHGSA-N|OUILVKYDBNPYBM-OTK...
76     HGUFODBRKLSHSI-UHFFFAOYSA-N|OTQFXRBLGNEOGH-UHF...
172    QOKMHYUWJKXWOV-PLNGDYQASA-N|QOKMHYUWJKXWOV-SNA...
174    PXGZQGDTEZPERC-IZLXSQMJSA-N|PXGZQGDTEZPERC-OLQ...
249    ZHVUSSKUCZSPFQ-UHFFFAOYSA-N|VICXKFLOMKMGEH-UHF...
Name: mixture_UNII, dtype: object

In [110]:
# True = every component of the mixture has an inchikey, False = at least one is missing.
(~inchis.isnull()).value_counts()

True     1103
False      12
Name: mixture_UNII, dtype: int64

## Summary
### chemicals
51954 out of 54431 chemicals have an inchikey in wikidata, on an item where both the CAS and UNII mappings agree

### mixtures
Out of 1372 mixtures, 1115 have all components of the mixture found in wikidata by their UNII, and 1103 of those have inchikeys for every component