In [1]:
import os
import zipfile
import requests
import pandas as pd

# Folder that receives the downloaded UNII archive and its extracted files.
unii_folder = './unii_data/'

# HTTP session shared by the NDF-RT lookup helpers defined in this cell.
s = requests.Session()

def load_unii():
    """Download the UNII data archive and unpack it into ``unii_folder``.

    Steps performed:
      1. Create ``unii_folder`` if it does not exist yet.
      2. Delete every regular file already present in that folder, so files
         from an earlier download cannot be mixed with the new ones.
      3. Stream ``UNII_Data.zip`` to disk and extract all of its members.
      4. Rename any extracted file whose name contains ``'Records'`` to
         ``unii_data.txt``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved and unpacked as if it were the archive.
        zipfile.BadZipFile: if the downloaded payload is not a valid zip file.
    """
    url = 'http://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip'
    zip_path = os.path.join(unii_folder, 'UNII_Data.zip')

    os.makedirs(unii_folder, exist_ok=True)

    # remove all old UNII files
    for file in os.listdir(unii_folder):
        file_path = os.path.join(unii_folder, file)
        if os.path.isfile(file_path):
            os.remove(file_path)

    # Reuse the module-level session instead of opening (and leaking) a second one.
    reply = s.get(url, stream=True)
    try:
        reply.raise_for_status()
        with open(zip_path, 'wb') as f:
            # 64 KiB chunks: the archive is large, tiny chunks only add overhead.
            for chunk in reply.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        reply.close()

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(unii_folder)

    for file in os.listdir(unii_folder):
        if 'Records' in file:
            full_file_name = os.path.join(unii_folder, file)
            os.rename(full_file_name, os.path.join(unii_folder, 'unii_data.txt'))
        
    

def get_ndfrt_nui(prop_type, prop_value, session=None):
    """Look up the NDF-RT concept NUI that carries a given property value.

    Args:
        prop_type: property name sent as ``propertyName``
            (e.g. ``'FDA_UNII'`` or ``'MeSH_DUI'``).
        prop_value: value sent as ``propertyValue``.
        session: optional object exposing a requests-style
            ``get(url, params=...)`` method. Defaults to the module-level
            session ``s``.

    Returns:
        The ``conceptNui`` of the first concept in the first concept group of
        the reply, or ``''`` when the reply holds no usable match (missing,
        empty or null ``groupConcepts`` / ``concept`` entries).

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    base_url = 'https://rxnav.nlm.nih.gov/REST/Ndfrt/concept.json'

    params = {
        'propertyName': prop_type,
        'propertyValue': prop_value
    }

    http = s if session is None else session
    response = http.get(base_url, params=params)
    response.raise_for_status()
    reply = response.json()

    # Every level of the reply may be absent, empty or null when nothing
    # matches, so walk it defensively instead of indexing blindly.
    groups = reply.get('groupConcepts') or []
    first_group = (groups[0] if groups else None) or {}
    concepts = first_group.get('concept') or []
    if concepts and concepts[0]:
        return concepts[0].get('conceptNui', '')

    return ''


def get_ndfrt_data(nui, session=None):
    """Fetch the properties of one NDF-RT concept as a flat dictionary.

    Args:
        nui: concept identifier sent as the ``nui`` query parameter.
        session: optional object exposing a requests-style
            ``get(url=..., params=...)`` method. Defaults to the module-level
            session ``s``.

    Returns:
        dict mapping each ``propertyName`` to its ``propertyValue``, taken from
        the first entry of ``fullConcept.groupProperties``. When a property
        name occurs more than once, the last value wins. The dict is empty if
        the reply carries no properties.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    url = 'https://rxnav.nlm.nih.gov/REST/Ndfrt/allInfo.json'
    params = {
        'nui': nui
    }

    http = s if session is None else session
    response = http.get(url=url, params=params)
    response.raise_for_status()
    results = response.json()

    # Any level may be missing, empty or null for an unknown NUI; treat all
    # of those cases as "no properties" instead of raising.
    full_concept = results.get('fullConcept') or {}
    groups = full_concept.get('groupProperties') or []
    first_group = (groups[0] if groups else None) or {}

    props = dict()
    for prop in first_group.get('property') or []:
        props[prop['propertyName']] = prop['propertyValue']

    return props
    
# Example usage of the lookup helpers, left commented out so that running
# this cell sends no NDF-RT requests:
# nui = get_ndfrt_nui('MeSH_DUI', 'D014406')
# print(nui)

# get_ndfrt_data(nui)

# Download and unpack a fresh copy of the UNII data. Note that this first
# deletes every file already present in unii_folder.
load_unii()

In [2]:
# Load the tab-separated UNII records produced by load_unii(). Every column
# is read as object so identifiers are not reinterpreted as numbers.
unii_path = os.path.join(unii_folder, 'unii_data.txt')
unii_df = pd.read_csv(unii_path, sep='\t', dtype=object, low_memory=False)
unii_df.head()

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,UNII_TYPE
0,0739N04X3A,"1,1,2-TRICHLOROTRIFLUOROETHANE",76-13-1,200-936-1,C96191,38585.0,,,,,,,C2Cl3F3,AJDIZQLSFPQPEY-UHFFFAOYSA-N,C(C(F)(Cl)Cl)(F)(F)Cl,INGREDIENT SUBSTANCE
1,18MXK3D6DB,3-STYRENESULFONIC ACID,46060-58-6,,,,,,,,,,C8H8O3S,ATBDZSAENDYQDW-UHFFFAOYSA-N,C=Cc1cccc(c1)S(=O)(=O)O,INGREDIENT SUBSTANCE
2,1Y58DO4MY1,DESIPRAMINE HYDROCHLORIDE,58-28-6,200-373-1,C28979,203174.0,,,,,,,C18H22N2.ClH,XAEWZDYWZHIUCT-UHFFFAOYSA-N,CNCCCN1c2ccccc2CCc3c1cccc3.Cl,INGREDIENT SUBSTANCE
3,6E17K3343P,CHLOROQUINE PHOSPHATE,50-63-5,200-055-2,C47445,20863.0,,,,,,,C18H26ClN3.2H3O4P,,CCN(CC)CCCC(C)Nc1ccnc2c1ccc(c2)Cl.OP(=O)(O)O.O...,INGREDIENT SUBSTANCE
4,9H05937G3X,FLUPREDNISOLONE,53-34-9,200-170-8,C65730,4497.0,,,,,,1352.0,C21H27FO5,MYYIMZRZXIQBGI-HVIRSNARSA-N,C[C@]12C[C@@H]([C@H]3[C@H]([C@@H]1CC[C@@]2(C(=...,INGREDIENT SUBSTANCE


In [3]:
# Non-null entries per column: shows how sparsely each identifier is populated.
unii_df.count()

UNII         91280
PT           91280
RN           63335
EC           21514
NCIT         12944
RXCUI        10500
ITIS         17350
NCBI         21845
PLANTS        9558
GRIN         15642
MPNS         14389
INN_ID        9040
MF           63260
INCHIKEY     49062
SMILES       60086
UNII_TYPE    91280
dtype: int64

In [4]:
# Annotate unii_df in place with NDF-RT identifiers: for every UNII that
# NDF-RT recognises, store the concept NUI plus selected cross-reference
# properties in new columns.
#
# This sends one request per row (two for rows with a match), so a full run
# is slow. To try it on a subset, iterate over unii_df['UNII'].head(300).
#
# Iterate over (index label, value) pairs rather than an enumerate() counter:
# .loc addresses rows by label, so a positional counter would write to the
# wrong rows whenever the index is not exactly 0..n-1.
for row_label, unii in unii_df['UNII'].items():
    nui = get_ndfrt_nui('FDA_UNII', unii)
    if not nui:
        continue

    print(row_label, 'UNII {} returned NUI:'.format(unii), nui)
    prop_data = get_ndfrt_data(nui)
    unii_df.loc[row_label, 'NUI'] = nui
    for p in ['UMLS_CUI', 'RxNorm_CUI', 'MeSH_CUI']:
        # Not every concept carries every property; leave the cell empty then.
        if p in prop_data:
            unii_df.loc[row_label, p] = prop_data[p]
    

2 UNII 1Y58DO4MY1 returned NUI: N0000146921
3 UNII 6E17K3343P returned NUI: N0000146559
6 UNII H9Y79VD43J returned NUI: N0000148019
7 UNII V83O1VOZ8L returned NUI: N0000145890
11 UNII 6K7YS503HC returned NUI: N0000147454
12 UNII O1GX33ON8R returned NUI: N0000146231
13 UNII 0CPP32S55X returned NUI: N0000146440
14 UNII 48U51W007F returned NUI: N0000145884
16 UNII 1806D8D52K returned NUI: N0000147702
18 UNII 30Q7KI53AK returned NUI: N0000146376
19 UNII F446C597KA returned NUI: N0000145978
20 UNII H0C805XYDE returned NUI: N0000148009
21 UNII L64N7M9BWR returned NUI: N0000184202
22 UNII 4O71YT5YB5 returned NUI: N0000147133
23 UNII U0476M545B returned NUI: N0000147849
25 UNII 97C5T2UQ7J returned NUI: N0000146961
28 UNII R4KO0DY52L returned NUI: N0000147765
30 UNII 9JU12S4YFY returned NUI: N0000146093
31 UNII 95URV01IDQ returned NUI: N0000146580
37 UNII 46QG38NC4U returned NUI: N0000147040
38 UNII 9PHQ9Y1OLM returned NUI: N0000146334
39 UNII DAA13NKG2Q returned NUI: N0000147960
40 UNII ITX086

In [5]:
# Persist the annotated table, then preview its first 300 rows.
annotated_csv = 'unii_data_ndfrt_04-2017.csv'
unii_df.to_csv(annotated_csv)
unii_df.head(300)

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,UNII_TYPE,NUI,UMLS_CUI,RxNorm_CUI
0,0739N04X3A,"1,1,2-TRICHLOROTRIFLUOROETHANE",76-13-1,200-936-1,C96191,38585,,,,,,,C2Cl3F3,AJDIZQLSFPQPEY-UHFFFAOYSA-N,C(C(F)(Cl)Cl)(F)(F)Cl,INGREDIENT SUBSTANCE,,,
1,18MXK3D6DB,3-STYRENESULFONIC ACID,46060-58-6,,,,,,,,,,C8H8O3S,ATBDZSAENDYQDW-UHFFFAOYSA-N,C=Cc1cccc(c1)S(=O)(=O)O,INGREDIENT SUBSTANCE,,,
2,1Y58DO4MY1,DESIPRAMINE HYDROCHLORIDE,58-28-6,200-373-1,C28979,203174,,,,,,,C18H22N2.ClH,XAEWZDYWZHIUCT-UHFFFAOYSA-N,CNCCCN1c2ccccc2CCc3c1cccc3.Cl,INGREDIENT SUBSTANCE,N0000146921,C0700529,203174
3,6E17K3343P,CHLOROQUINE PHOSPHATE,50-63-5,200-055-2,C47445,20863,,,,,,,C18H26ClN3.2H3O4P,,CCN(CC)CCCC(C)Nc1ccnc2c1ccc(c2)Cl.OP(=O)(O)O.O...,INGREDIENT SUBSTANCE,N0000146559,C0055447,20863
4,9H05937G3X,FLUPREDNISOLONE,53-34-9,200-170-8,C65730,4497,,,,,,1352,C21H27FO5,MYYIMZRZXIQBGI-HVIRSNARSA-N,C[C@]12C[C@@H]([C@H]3[C@H]([C@@H]1CC[C@@]2(C(=...,INGREDIENT SUBSTANCE,,,
5,O061W4C37H,BUTYL MYRISTATE,110-36-1,203-759-8,,,,,,,,,C18H36O2,DHAZIUXMHRHVMP-UHFFFAOYSA-N,CCCCCCCCCCCCCC(=O)OCCCC,INGREDIENT SUBSTANCE,,,
6,H9Y79VD43J,STRYCHNINE,57-24-9,200-319-7,C84183,66422,,,,,,,C21H22N2O2,QMGVPVSNSZLJIA-FVWCLLPLSA-N,c1ccc2c(c1)[C@]34CCN5[C@H]3C[C@@H]6[C@@H]7[C@@...,INGREDIENT SUBSTANCE,N0000148019,C0202474,66422
7,V83O1VOZ8L,ISONIAZID,54-85-3,200-214-6,C600,6038,,,,,,4188,C6H7N3O,QRXWMOHMRWLFEY-UHFFFAOYSA-N,c1cnccc1C(=O)NN,INGREDIENT SUBSTANCE,N0000145890,C0022209,6038
8,QNT09A162Y,ARSTHINOL,119-96-0,204-361-7,C76411,,,,,,,240,C11H14AsNO3S2,,CC(=O)Nc1cc(ccc1O)[As]2SCC(S2)CO,INGREDIENT SUBSTANCE,,,
9,6V3I57K9UL,ETHYL 10-(4-IODOPHENYL)UNDECANOATE,99-79-6,202-787-8,,,,,,,,,C19H29IO2,,CCOC(=O)CCCCCCCCC(C)c1ccc(cc1)I,INGREDIENT SUBSTANCE,,,


In [6]:
# Restrict the table to substances that have an INN identifier, then show
# how many non-null values each column holds within that subset.
has_inn_id = unii_df['INN_ID'].notnull()
inn_only = unii_df.loc[has_inn_id, :]
inn_only.count()

UNII          9040
PT            9040
RN            9016
EC            2631
NCIT          6932
RXCUI         2651
ITIS             2
NCBI            17
PLANTS           0
GRIN             1
MPNS             1
INN_ID        9040
MF            8274
INCHIKEY      6159
SMILES        7969
UNII_TYPE     9040
NUI           1684
UMLS_CUI      1678
RxNorm_CUI    1684
dtype: int64

In [7]:
# List the INN substances that have no SMILES structure: select those rows
# first, then print the identifying fields of each one.
missing_smiles = inn_only[inn_only['SMILES'].isnull()]
for row_label, row in missing_smiles.iterrows():
    print(row_label, row['UNII'], row['PT'], row['RN'], row['INN_ID'])

332 NIJ123W41V PLICAMYCIN 18378-89-7 2029
448 6Q205EH1VU VANCOMYCIN 1404-90-6 680
1527 I835H2IHHX CETETH-20 9004-95-9 340
1582 04XPW8C0FL APROTININ 9087-70-1 2481
1678 Z1LH97KTRM ALFADEX 10016-20-3 6276
1719 E1NC1JVS3O POLYTETRAFLUOROETHYLENE 9002-84-0 2776
1886 U0JZ726775 DESIRUDIN 120993-53-5 7193
2158 740Y5J48Z8 ULARITIDE 118812-69-4 7107
2213 G8RGG88B68 PEGINTERFERON ALFA-2B 215647-85-1 7909
2439 F60NE4XB53 IMETELSTAT 868169-64-6 9142
2441 F76229E21M CEFILAVANCIN 722454-12-8 9881
2526 85J5ZP6YSL OBLIMERSEN 190977-41-4 8267
2572 P93RUU11P7 INOTUZUMAB OZOGAMICIN 635715-01-4 8574
2965 XT4808181K LOTILIBCIN 169148-84-9 9070
3037 38PLP07BKC EDRATIDE 433922-67-9 8335
3102 361LPM2T56 SUGAMMADEX 343306-71-8 8528
3137 2M3V3B8OEA DEPELESTAT 506433-25-6 8493
3142 JV039JZZ3A BETADEX 7585-39-9 6860
3338 J0K70H3420 ALSACTIDE 34765-96-3 4965
3339 618SLL9VBS OMIGANAN 204248-78-2 8347
3357 XK134822Z0 TELAVANCIN 372151-71-8 8504
3812 GJ2416WK6Y YTTRIUM Y-90 TACATUZUMAB TETRAXETAN 476413-07-7 8484
38

In [8]:
# Non-null entries per column after the NDF-RT annotation.
# NOTE(review): the saved output below reports fewer rows than the earlier
# unii_df.count() cell and has no MPNS column, so it appears to come from a
# run against a different UNII download. Re-run the notebook top to bottom
# to refresh it.
unii_df.count()

UNII          83725
PT            83725
RN            62470
EC            21394
NCIT          12935
RXCUI         10351
ITIS          13573
NCBI          15905
PLANTS         7260
GRIN           8957
INN_ID         9037
MF            62315
INCHIKEY      48340
SMILES        59246
UNII_TYPE     83725
NUI            4017
UMLS_CUI       4000
RxNorm_CUI     4017
dtype: int64