In [116]:
import pubchempy as pcp
import pandas as pd
import requests
import json
import vendor_lookup


In [205]:
def get_sid_set(sources):
    """
    Collect the SIDs of all source entries belonging to one PubChem category.

    `sources` is the list stored under the 'Sources' key of a category entry
    (e.g. 'Chemical Vendors' or 'Legacy Depositors') in a PubChem categories
    JSON document. As currently understood, an SID identifies one
    product+manufacturer record, so the same SID should not appear twice
    within a single category; this is enforced with an assertion.

    Parameters
    ----------
    sources : iterable of dict
        Source entries; each one must provide an 'SID' key.

    Returns
    -------
    set
        The distinct SIDs found in `sources` (empty set for empty input).

    Raises
    ------
    KeyError
        If an entry has no 'SID' key.
    AssertionError
        If the same SID occurs more than once in `sources`.
    """
    sid_list = [source_dict['SID'] for source_dict in sources]
    sid_set = set(sid_list)

    # Compare the raw list against the de-duplicated set: the lengths differ
    # exactly when at least one SID was repeated.
    assert len(sid_list) == len(sid_set), "Duplicate SID detected"
    return sid_set

def get_current_vendors(request_dict):
    """
    Return the SIDs listed under 'Chemical Vendors' but not under 'Legacy Depositors'.

    Parameters
    ----------
    request_dict : dict
        Parsed PubChem categories JSON. It must contain
        request_dict['SourceCategories']['Categories'], a list of dicts with a
        'Category' name and, for the two categories of interest, a 'Sources'
        list of entries carrying an 'SID'.

    Returns
    -------
    set
        Vendor SIDs that do not also appear as legacy depositors. Empty when
        the 'Chemical Vendors' category is missing or every vendor SID is
        also a legacy SID.

    Raises
    ------
    KeyError
        If the expected keys are missing from `request_dict`.
    AssertionError
        Propagated from get_sid_set when a category repeats an SID.
    """
    categories = request_dict['SourceCategories']['Categories']

    # Default both sets to empty so that a response containing only one of the
    # two categories still yields a well-defined difference instead of
    # referencing an unbound local variable.
    vendor_set = set()
    legacy_set = set()
    for cat_item in categories:
        category = cat_item['Category']
        if category == 'Chemical Vendors':
            vendor_set = get_sid_set(cat_item['Sources'])
        elif category == 'Legacy Depositors':
            legacy_set = get_sid_set(cat_item['Sources'])

    # SIDs that are also listed as legacy are treated as no longer current.
    return vendor_set - legacy_set

        

In [184]:
# Load the table of compounds to check; the vendor-status loop further down
# reads each compound's PubChem CID from the 'HBA_cid' column.
chem_df = pd.read_csv('HBA_list.csv')
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# an index that was written into the CSV — consider read_csv(..., index_col=0)
# after confirming nothing relies on that column.
chem_df

Unnamed: 0,HBA,HBA_cid
0,choline chloride,6209
1,choline acetate,187
2,choline bromide,74724
3,choline fluoride,22134097
4,choline nitrate,13646546
5,(2-chloroethyl)trimethylammonium chloride,13836
6,ethyl(2-hydroxyethyl)dimethylammonium chloride,87940
7,benzyl(2-hydroxyethyl)dimethylammonium chloride,3014549
8,acetylcholine chloride,6060
9,tetramethylammonium chloride,6379


In [206]:
# PubChem PUG-View "categories" endpoint for a single compound, keyed by CID.
PUBCHEM_CATEGORIES_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/{cid}/JSON'
# Seconds to wait for PubChem before giving up; without a timeout a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT_S = 30

# One boolean per row of chem_df, in row order: True when the compound has at
# least one current (non-legacy) vendor.
vendor_status = []
for cid in chem_df['HBA_cid'].astype(str):
    target_url = PUBCHEM_CATEGORIES_URL.format(cid=cid)
    response = requests.get(target_url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on HTTP errors instead of trying to parse an error payload.
    response.raise_for_status()
    request_dict = response.json()
    # At this point the categories document for this CID is available as a
    # dictionary. Even when a chemical has no current vendor, old sources can
    # still appear inside the 'Chemical Vendors' category. Non-current sources
    # are filtered out by checking whether the same entry also appears under
    # 'Legacy Depositors'.
    current_vendors = get_current_vendors(request_dict)
    has_current_vendors = bool(current_vendors)
    vendor_status.append(has_current_vendors)


  

6209
[True]
187
[True, True]
74724
[True, True, True]
22134097
[True, True, True, True]
13646546
[True, True, True, True, False]
13836
[True, True, True, True, False, True]
87940
[True, True, True, True, False, True, True]
3014549
[True, True, True, True, False, True, True, True]
6060
[True, True, True, True, False, True, True, True, True]
6379
[True, True, True, True, False, True, True, True, True, True]
11198
[True, True, True, True, False, True, True, True, True, True, True]
68974
[True, True, True, True, False, True, True, True, True, True, True, True]
517011
[True, True, True, True, False, True, True, True, True, True, True, True, True]
82326
[True, True, True, True, False, True, True, True, True, True, True, True, True, True]
5946
[True, True, True, True, False, True, True, True, True, True, True, True, True, True, True]
6285
[True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True]
12203661
[True, True, True, True, False, True, True, True,

In [208]:
# Attach the per-compound flags as a new column. vendor_status was appended to
# once per chem_df row, in row order, so the list lines up positionally.
# Note that this adds the column to chem_df in place.
chem_df['Vendor Status'] = vendor_status
chem_df

Unnamed: 0,HBA,HBA_cid,Vendor Status
0,choline chloride,6209,True
1,choline acetate,187,True
2,choline bromide,74724,True
3,choline fluoride,22134097,True
4,choline nitrate,13646546,False
5,(2-chloroethyl)trimethylammonium chloride,13836,True
6,ethyl(2-hydroxyethyl)dimethylammonium chloride,87940,True
7,benzyl(2-hydroxyethyl)dimethylammonium chloride,3014549,True
8,acetylcholine chloride,6060,True
9,tetramethylammonium chloride,6379,True
