In [7]:
import os
import zipfile
import requests
import pandas as pd

s = requests.Session()


def load_unii():
    url = 'http://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip'
    
    if not os.path.exists('./unii_data'):
        os.makedirs('./unii_data')
        
    s = requests.Session()
    reply = s.get(url, stream=True)
    with open('./unii_data/UNII_Data.zip', 'wb') as f:
        for chunk in reply.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
                #f.flush()
    
    # TODO: UNII records file needs to be renamed to a standard name, it comes in an awkward naming 
    # including the date, so the name will be different every few months/days.
    zf = zipfile.ZipFile('./unii_data/UNII_Data.zip', 'r')
    zf.extractall('./unii_data/')
    zf.close()
        
    

def get_ndfrt_nui(prop_type, prop_value):
    base_url = 'https://rxnav.nlm.nih.gov/REST/Ndfrt/concept.json'
    
    params = {
        'propertyName': prop_type,
        'propertyValue': prop_value
    }
    
    reply = s.get(base_url, params=params).json()
    
    if 'groupConcepts' in reply and reply['groupConcepts'][0]:
        return reply['groupConcepts'][0]['concept'][0]['conceptNui']
    
    return ''


def get_ndfrt_data(nui):
    url = 'https://rxnav.nlm.nih.gov/REST/Ndfrt/allInfo.json'
    params = {
        'nui': nui
    }
    
    results = s.get(url=url, params=params).json()

    props = dict()
    if 'groupProperties' in results['fullConcept']:
        for prop in results['fullConcept']['groupProperties'][0]['property']:
            props[prop['propertyName']] = prop['propertyValue']
            
    return props
    
# nui = get_ndfrt_nui('MeSH_DUI', 'D014406')
# print(nui)

# get_ndfrt_data(nui)

load_unii()

In [4]:
unii_df = pd.read_csv('./unii_data/UNIIs 8Aug2016 Records.txt', sep='\t')
unii_df.head()

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,ITIS,NCBI,PLANTS,GRIN,INN_ID,MF,INCHIKEY,SMILES,UNII_TYPE,Unnamed: 15
0,129526470,"5,8-DIMETHOXY(1,2,4)TRIAZOLO(1,5-C)PYRIMIDIN-2...",219715-62-5,,,,,,,,,C7H9N5O2,DBJPBHJHAPAUQU-UHFFFAOYSA-N,COc1cnc(n2c1nc(n2)N)OC,INGREDIENT SUBSTANCE,
1,258808825,MACROPIPER EXCELSUM LEAF,,,,,,130373.0,,,,,,,INGREDIENT SUBSTANCE,
2,377415922,"N-DESMETHYLVENLAFAXINE, (S)-",392332-59-1,,,,,,,,,C16H25NO2,MKAFOJAJJMUXLW-OAHLLOKOSA-N,CNC[C@H](c1ccc(cc1)OC)C2(CCCCC2)O,INGREDIENT SUBSTANCE,
3,457970679,VERRUCARIN,54018-05-2,,,,,,,,,,,,INGREDIENT SUBSTANCE,
4,480546720,HOMOCYCLOLEUCINE HYDROCHLORIDE,39692-17-6,254-594-3,,,,,,,,C7H13NO2.ClH,GTKXSYHXQSKWNP-UHFFFAOYSA-N,C1CCC(CC1)(C(=O)O)N.Cl,INGREDIENT SUBSTANCE,


In [13]:
for count, unii in enumerate(unii_df['UNII']):
    nui = get_ndfrt_nui('FDA_UNII', unii)
    if nui:
        print(count, 'UNII {} returned NUI:'.format(unii), nui)
        prop_data = get_ndfrt_data(nui)
#         print(prop_data)
        unii_df.loc[count, 'NUI'] = nui
        for p in ['UMLS_CUI', 'RxNorm_CUI', 'MeSH_CUI']:
            try:
                unii_df.loc[count, p] = prop_data[p]
            except KeyError:
                continue
#     else:
#         print('UNII {} not found'.format(unii))
    
#     if count > 300:
#         break
    

16 UNII 2865993309 returned NUI: N0000171746
36 UNII 5438723848 returned NUI: N0000148053
50 UNII 7352665165 returned NUI: N0000191549
53 UNII 7673326042 returned NUI: N0000022032
74 UNII 00072J7XWS returned NUI: N0000176163
88 UNII 001O2254AC returned NUI: N0000147694
89 UNII 0020414E5U returned NUI: N0000146267
106 UNII 003N66TS6T returned NUI: N0000171790
113 UNII 004F72P8F4 returned NUI: N0000147727
125 UNII 0057334FAB returned NUI: N0000022908
127 UNII 005990WHZZ returned NUI: N0000147220
132 UNII 005SYP50G5 returned NUI: N0000146145
181 UNII 00DPD30SOY returned NUI: N0000171765
184 UNII 00FN6IH15D returned NUI: N0000145905
190 UNII 00IBG87IQW returned NUI: N0000147657
196 UNII 00J9J9XKDE returned NUI: N0000148327
222 UNII 00OT1QX5U4 returned NUI: N0000148188
238 UNII 00S42N58OM returned NUI: N0000147153
249 UNII 00UK7646FG returned NUI: N0000147279
258 UNII 00WNZ48OR9 returned NUI: N0000147223
280 UNII 0111871I23 returned NUI: N0000190120
288 UNII 012C11ZU6G returned NUI: N000014

In [14]:
unii_df.to_csv('unii&nui_08-18-2016.csv')
unii_df.head(300)

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,ITIS,NCBI,PLANTS,GRIN,INN_ID,MF,INCHIKEY,SMILES,UNII_TYPE,Unnamed: 15,NUI,UMLS_CUI,RxNorm_CUI
0,0129526470,"5,8-DIMETHOXY(1,2,4)TRIAZOLO(1,5-C)PYRIMIDIN-2...",219715-62-5,,,,,,,,,C7H9N5O2,DBJPBHJHAPAUQU-UHFFFAOYSA-N,COc1cnc(n2c1nc(n2)N)OC,INGREDIENT SUBSTANCE,,,,
1,0258808825,MACROPIPER EXCELSUM LEAF,,,,,,130373.0,,,,,,,INGREDIENT SUBSTANCE,,,,
2,0377415922,"N-DESMETHYLVENLAFAXINE, (S)-",392332-59-1,,,,,,,,,C16H25NO2,MKAFOJAJJMUXLW-OAHLLOKOSA-N,CNC[C@H](c1ccc(cc1)OC)C2(CCCCC2)O,INGREDIENT SUBSTANCE,,,,
3,0457970679,VERRUCARIN,54018-05-2,,,,,,,,,,,,INGREDIENT SUBSTANCE,,,,
4,0480546720,HOMOCYCLOLEUCINE HYDROCHLORIDE,39692-17-6,254-594-3,,,,,,,,C7H13NO2.ClH,GTKXSYHXQSKWNP-UHFFFAOYSA-N,C1CCC(CC1)(C(=O)O)N.Cl,INGREDIENT SUBSTANCE,,,,
5,0909840846,ALLIUM STRACHEYI WHOLE,,,,,,980680.0,,459419.0,,,,,INGREDIENT SUBSTANCE,,,,
6,0979361098,OXALIS CORNICULATA LEAF,,,,,29067.0,212256.0,OXCO,26196.0,,,,,INGREDIENT SUBSTANCE,,,,
7,1521994599,AURASPERONE D,67924-64-5,,,,,,,,,C31H24O10,,Cc1cc(=O)c2c(o1)cc3cc(c(c(c3c2O)OC)c4c5cc(cc(c...,INGREDIENT SUBSTANCE,,,,
8,1597304395,ETANTEROL,93047-39-3,,C65553,,,,,,5707.0,C18H24N2O3,,CC(Cc1ccc(cc1)O)NCC(c2cc(cc(c2)N)CO)O,INGREDIENT SUBSTANCE,,,,
9,1598442456,GIANT CATFISH WHOLE,,,,,639992.0,435128.0,,,,,,,INGREDIENT SUBSTANCE,,,,


In [15]:
unii_df.count()

UNII           81964
PT             81964
RN             60885
EC             21063
NCIT           12890
RXCUI          10026
ITIS           13498
NCBI           15801
PLANTS          7195
GRIN            8858
INN_ID          9016
MF             60756
INCHIKEY       47140
SMILES         57761
UNII_TYPE      81964
Unnamed: 15        0
NUI             4008
UMLS_CUI        4001
RxNorm_CUI      4008
dtype: int64