In [1]:
## exploring the ginas-UNII mapping
# run `run.sh` in this folder

In [5]:
import pandas as pd
# Seed data with the ginas-UNII mapping (produced by `run.sh` in this folder).
SEED_DATA_CSV = 'fullSeedData-2016-06-16_UNII.csv'

# The first CSV column becomes the index; low_memory=False parses the file in
# one pass instead of in chunks.
df = pd.read_csv(
    SEED_DATA_CSV,
    index_col=0,
    low_memory=False,
)

In [14]:
# Non-null count per column; for brevity keep only the columns with more than
# 1000 non-null values.
column_counts = df.count()
column_counts[column_counts > 1000]

CAS                   55803
CAS_primary           55531
CFR                    1508
DRUG BANK              1769
ECHA (EC/EINECS)      20376
EPA PESTICIDE CODE     2522
EVMPD                  5196
INN                    7992
IUPHAR                 1101
JECFA EVALUATION       1818
MERCK INDEX           14679
MESH                  10324
NCI_THESAURUS         10545
NDF-RT                 1082
RXCUI_x                6503
UNII                  55803
WHO-ATC                2696
WHO-VATC               2884
WIKIPEDIA              7133
mixture_UNII           1372
names                 55803
preferred_names       55803
substanceClass        55803
uuid                  55803
PT                    55802
RN                    55785
EC                    20390
NCIT                  10574
RXCUI_y                6683
INN_ID                 8045
MF                    54428
INCHIKEY              44673
SMILES                53983
UNII_TYPE             55802
dtype: int64

In [19]:
# How many records fall into each substance class?
df['substanceClass'].value_counts()

chemical    54431
mixture      1372
Name: substanceClass, dtype: int64

In [18]:
# Which substance classes do the records without an InChIKey belong to?
inchi_null = df.loc[df['INCHIKEY'].isna()]
inchi_null['substanceClass'].value_counts()
# ~10k chemicals lack an InChIKey, and so does every mixture

chemical    9758
mixture     1372
Name: substanceClass, dtype: int64

In [34]:
# However, the mixture rows list the UNIIs of their components, so let's check those
is_mixture = df['substanceClass'] == 'mixture'
df.loc[is_mixture, ['CAS_primary', 'mixture_UNII', 'preferred_names']].head()

Unnamed: 0,CAS_primary,mixture_UNII,preferred_names
66,130-81-4,271XMD080E|92K78B4931,QUINDONIUM BROMIDE
76,41903-57-5,DO80M48B6O|P30XU4W6C1|HF5S8P28CC|J62NF50EHT|25...,TETRACHLORODIBENZO-P-DIOXIN (MIXED ISOMERS)
172,32951-19-2,HL8423W31J|C6PL4F56E0,1-(METHYLTHIO)-1-BUTENE
174,1076-97-7,18W55738KH|68QED0R44U,"1,4-CYCLOHEXANEDICARBOXYLIC ACID"
186,102961-72-8,C70V7E56T3|MBP3L2793V|T8KEU6LJBE|4E60GZ933I|6K...,ANTIBIOTIC A-40926


In [31]:
# Component UNIIs of the first mixture (pipe-separated in `mixture_UNII`),
# then the rows of df describing those components.
component_uniis = df.loc[df.substanceClass == 'mixture', 'mixture_UNII'].iloc[0].split("|")
df.loc[df.UNII.isin(component_uniis), ['CAS_primary', 'UNII', 'preferred_names', 'INCHIKEY']]

Unnamed: 0,CAS_primary,UNII,preferred_names,INCHIKEY
5662,10197-93-0,271XMD080E,"QUINDONIUM BROMIDE, CIS-",
40024,14490-58-5,92K78B4931,"QUINDONIUM BROMIDE, TRANS-",


In [35]:
# Same lookup for the fifth mixture (position 4 among the mixture rows).
component_uniis = df.loc[df.substanceClass == 'mixture', 'mixture_UNII'].iloc[4].split("|")
df.loc[df.UNII.isin(component_uniis), ['CAS_primary', 'UNII', 'preferred_names', 'INCHIKEY']]

Unnamed: 0,CAS_primary,UNII,preferred_names,INCHIKEY
4518,110882-85-4,C70V7E56T3,ANTIBIOTIC A-40926 B1,IMGYVEMZPBHISV-PSDJNXLUSA-N
11391,110882-82-1,T8KEU6LJBE,ANTIBIOTIC A-40926 A0,UTUMMENGNWNHJZ-PKPPEPSOSA-N
27941,871088-55-0,6KX5KH4647,ANTIBIOTIC A-40926 B2,OJESFQPLDXJICV-FLAGIKDOSA-N
30199,110882-83-2,4E60GZ933I,ANTIBIOTIC A-40926 A1,KFFFTRROFFANLL-PKPPEPSOSA-N
40783,110882-84-3,MBP3L2793V,ANTIBIOTIC A-40926 B0,PZMMGNLKWHJGSE-PSDJNXLUSA-N


In [None]:
# QUINDONIUM BROMIDE is a mixture of its CIS- and TRANS- forms, neither of which has an InChIKey,
# but the components of ANTIBIOTIC A-40926 do have InChIKeys.

In [100]:
# mix_comp_inchi = True if any component of the mixture has an InChIKey.
# Collect the UNIIs that carry an InChIKey once, so that each mixture only
# needs set lookups instead of a full scan of df per mixture.
unii_with_inchi = set(df.loc[df.INCHIKEY.notnull(), 'UNII'])
# Each mixture row stores its component UNIIs as a pipe-separated string.
mixture_UNII = df.query("substanceClass == 'mixture'").mixture_UNII.str.split("|")
mix_comp_inchi = mixture_UNII.map(
    lambda components: any(unii in unii_with_inchi for unii in components)
)
# Assignment aligns on the index: rows that are not mixtures get NaN here.
df['mix_comp_inchi'] = mix_comp_inchi
mix_comp_inchi.value_counts()
# at least one component has an InChIKey for 880/1372 mixtures

True     880
False    492
Name: mixture_UNII, dtype: int64

In [112]:
# All chemicals, then the subset of them that has no InChIKey
chem = df[df['substanceClass'] == 'chemical']
chem_noinchi = chem.loc[chem['INCHIKEY'].isna()]
# Non-null counts for that subset, limited to columns with more than 1000 values
noinchi_counts = chem_noinchi.count()
noinchi_counts[noinchi_counts > 1000]

CAS                 9758
CAS_primary         9687
ECHA (EC/EINECS)    3221
EVMPD               1352
INN                 1850
MERCK INDEX         2302
MESH                1632
NCI_THESAURUS       2219
RXCUI_x             1254
UNII                9758
WIKIPEDIA           1052
names               9758
preferred_names     9758
substanceClass      9758
uuid                9758
PT                  9757
RN                  9757
EC                  3229
NCIT                2220
RXCUI_y             1292
INN_ID              1852
MF                  9755
SMILES              9558
UNII_TYPE           9757
dtype: int64

In [114]:
# Preview the InChIKey-less chemicals, showing only well-populated columns
# (more than 4000 non-null values).
well_populated = chem_noinchi.count() > 4000
chem_noinchi.loc[:, well_populated].head()

Unnamed: 0,CAS,CAS_primary,UNII,names,preferred_names,substanceClass,uuid,PT,RN,MF,SMILES,UNII_TYPE
2,51599-37-2|43064-17-1,51599-37-2,25619NA95Y,"TOLAMOLOL HYDROCHLORIDE|BENZAMIDE, 4-(2-((2-HY...",TOLAMOLOL HYDROCHLORIDE,chemical,00021c3c-4469-4bfd-8f12-5fee4410f7e6,TOLAMOLOL HYDROCHLORIDE,51599-37-2,C19H24N2O4.ClH,Cc1ccccc1OCC(CNCCOc2ccc(cc2)C(=O)N)O.Cl,INGREDIENT SUBSTANCE
8,7020-55-5,7020-55-5,BO76JF850N,"CLIDINIUM|1-AZONIABICYCLO(2.2.2)OCTANE, 3-((HY...",CLIDINIUM,chemical,000739ec-f968-4cdb-bb8d-bf055c7fac55,CLIDINIUM,7020-55-5,C22H26NO3,C[N+]12CCC(CC1)C(C2)OC(=O)C(c3ccccc3)(c4ccccc4)O,IONIC MOIETY
16,6864-37-5,6864-37-5,3K4H01E55X,"4,4'-METHYLENEBIS(2-METHYLCYCLOHEXYLAMINE)|BIS...",BIS(4-AMINO-3-METHYLCYCLOHEXYL)METHANE,chemical,00122cc3-cceb-4143-b587-abe9a8b19ad3,BIS(4-AMINO-3-METHYLCYCLOHEXYL)METHANE,6864-37-5,C15H30N2,CC1CC(CCC1N)CC2CCC(C(C2)C)N,INGREDIENT SUBSTANCE
17,125109-85-5,125109-85-5,Z92022479Y,"ISOPROPYLPHENYLBUTANAL|BENZENEPROPANAL, .BETA....",ISOPROPYLPHENYLBUTANAL,chemical,00122e91-8cf7-4bec-a463-5c09a5243eab,ISOPROPYLPHENYLBUTANAL,125109-85-5,C13H18O,CC(C)c1cccc(c1)C(C)CC=O,INGREDIENT SUBSTANCE
19,25394-63-2,25394-63-2,WZS9L0Z92R,"AMFECLORAL, (+)-|(+)-AMFECLORAL|BENZENEETHANAM...","AMFECLORAL, (+)-",chemical,001487f1-962d-4ec0-8052-2d648f8f4cb3,"AMFECLORAL, (+)-",25394-63-2,C11H12Cl3N,CC(Cc1ccccc1)/N=C/C(Cl)(Cl)Cl,INGREDIENT SUBSTANCE


In [115]:
# It's not here (which is where the mapping came from):
# https://fdasis.nlm.nih.gov/srs/unii/25619na95y

In [None]:
# It's here: https://chem.nlm.nih.gov/chemidplus/unii/25619na95y
# but the flat files don't contain InChIKeys:
# ftp://ftp.nlm.nih.gov/nlmdata/.chemidlease/