In [201]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [217]:
if False:
    # Fetch Drugs @ FDA by the starting letter, find the HTML table element, and then save it to a csv
    def get_table(letter):
        url = f"https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=browseByLetter.page&productLetter={letter}&ai=0"
        # url = f"https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=browseByLetter.page&productLetter=A&ai=0"
        response = requests.get(url)

        if response.status_code == 200:
            webpage_content = response.text
            soup = BeautifulSoup(webpage_content, 'html.parser')

            # Find the table element
            table = soup.find('table')

            # Extract table data into a DataFrame
            table_data = []
            for row in table.find_all('tr'):
                row_data = [cell.text.strip() for cell in row.find_all('td')]
                table_data.append(row_data)
            return table_data
        else:
            print("Failed to fetch the webpage")
        assert False
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    table_data =[]
    for i in letters:
        print(i)
        table_data.extend(get_table(i))
    df_raw = pd.DataFrame(table_data)
    csv_file = "data/fda-drugs-raw.csv"
    df_raw.to_csv(csv_file, index=False)

In [222]:
# load the csv we saved from above
df2 = pd.read_csv("data/fda-drugs-raw.csv")
table_data = df2.values

In [243]:
table_data[:3]

array([[nan],
       [nan],
       ['A-HYDROCORT\n\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #040666 | INJECTABLE;INJECTION  | HOSPIRA\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #085928 | INJECTABLE;INJECTION  | ABBOTT\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #085929 | INJECTABLE;INJECTION  | HOSPIRA\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #085930 | INJECTABLE;INJECTION  | HOSPIRA\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #085931 | INJECTABLE;INJECTION  | HOSPIRA\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #085932 | INJECTABLE;INJECTION  | HOSPIRA\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #089577 | INJECTABLE;INJECTION  | ABBOTT\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #089578 | INJECTABLE;INJECTION  | ABBOTT\r\n\t\nA-HYDROCORT (HYDROCORTISONE SODIUM SUCCINATE) | ANDA  #089579 | INJECTABLE;INJECTION  | ABBOTT\r\n\t\nA-HYDROCORT (HYDROCORTISO

In [244]:
def process_raw_s(row_s):
    """
    Given a giant table entry from the raw data, parse it out into drug name and active

    The table entry roughly has the format:
    DRUG_NAME (newlines) DRUG_NAME (ACTIVE INGREDS.) | DETAILS.... (newlines) DRUG_NAME (ACTIVE INGREDS)...

    Where the same drug often has many entries I think for the different application methods and different companies that produce it (not sure?)

    Returns drug name, and frozenset of the ingredients
    """
    lines = row_s.split('\n')
    lines = list(map(lambda s : s.strip(), lines))
    lines = [s for s in lines if len(s)>0]
    drug_name = lines[0]
    details = lines[1:]
    # print(drug_name)
    import re
    matches = [re.findall(r'\((.*?)\)', s.replace(drug_name, '', 1).split('|')[0]) for s in details] # take the part before the first '|', then remove the drug name, and then extract the part between parens to get active ingredient(s)
    assert all([len(m)==1 for m in matches])
    flattened_matches = [m[0] for m in matches]
    split_matches = [s.split(';') for s in flattened_matches]
    set_matches = [frozenset([s.strip() for s in m]) for m in split_matches] # this contains the active ingredients for each row for this drug. there can be variants depending on application method, the company that produces it, etc. 
    # print(set_matches)
    active_ingredients = set_matches[0] # for now, we just take the first one
    
    # assert all([m==active_ingredients for m in set_matches])  #TODO: bring back this assert - maybe group by company?
    # print(active_ingredients)
    return drug_name, active_ingredients


In [224]:
print(process_raw_s(table_data[2][0]))

('A-HYDROCORT', frozenset({'HYDROCORTISONE SODIUM SUCCINATE'}))


In [225]:
drugs = []
for row in table_data:
  assert len(row) == 1
  s = row[0]
  if type(s) is float or len(s) == 0:
    continue
  drugs.append(process_raw_s(s))


In [246]:
drugs[:6]

[('A-HYDROCORT', frozenset({'HYDROCORTISONE SODIUM SUCCINATE'})),
 ('A-METHAPRED', frozenset({'METHYLPREDNISOLONE SODIUM SUCCINATE'})),
 ('A-POXIDE', frozenset({'CHLORDIAZEPOXIDE HYDROCHLORIDE'})),
 ('A.P.L.', frozenset({'GONADOTROPIN, CHORIONIC'})),
 ('A/T/S', frozenset({'ERYTHROMYCIN'})),
 ('ABACAVIR AND LAMIVUDINE', frozenset({'ABACAVIR', 'LAMIVUDINE'}))]

In [229]:
def get_SMILES_from_compound_name(compound_name):
    # PubChem PUG-REST URL for compound name search
    pubchem_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/JSON"

    # Send an HTTP GET request to the PubChem API
    response = requests.get(pubchem_url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        data = response.json()
        # print(data)
        if 'PC_Compounds' in data and data['PC_Compounds']:
            props = data['PC_Compounds'][0]['props']
            for i in props:
                if i['urn']['label']  == 'SMILES' and i['urn']['name'] == 'Canonical':
                  return i['value']['sval']

            # smiles_string = data['PC_Compounds'][0]['props'][17]['value']['sval']


            # smiles_string = data['PC_Compounds'][0]['props'][17]['value']['sval']
            # cid = data['PC_Compounds'][0]['id']['cid']
            # print(f"Compound Name: {compound_name}")
            # print(f"PubChem CID: {cid}")
            # print(f"SMILES String: {smiles_string}")
        else:
            print(f"No information found for '{compound_name}' in PubChem.")
    else:
        print(f"Error: Unable to retrieve data for '{compound_name}' from PubChem.")
    return None

In [239]:
# compound_name = "HYDROCORTISONE SODIUM SUCCINATE"  
# compound_name = "GONADOTROPIN, CHORIONIC"
# compound_name = "HUMAN CHORIONIC GONADOTROPIN"
# print(get_SMILES_from_compound_name(compound_name))

In [231]:
# this will take a while.... searching through for every active ingredient. it will report which ones it fails on
drug_smiles = []
for drug_name, active_ingredients in drugs[:5] + drugs[-5:]:
    # print(drug_name, active_ingredients)
    SMILES_strs = []
    for i in active_ingredients:
        SMILES_strs.extend([i, get_SMILES_from_compound_name(i)])
    drug_smiles.append((drug_name, *SMILES_strs))



Error: Unable to retrieve data for 'GONADOTROPIN, CHORIONIC' from PubChem.


In [232]:
drug_smiles[:5]

[('A-HYDROCORT',
  'HYDROCORTISONE SODIUM SUCCINATE',
  'CC12CCC(=O)C=C1CCC3C2C(CC4(C3CCC4(C(=O)COC(=O)CCC(=O)[O-])O)C)O.[Na+]'),
 ('A-METHAPRED',
  'METHYLPREDNISOLONE SODIUM SUCCINATE',
  'CC1CC2C3CCC(C3(CC(C2C4(C1=CC(=O)C=C4)C)O)C)(C(=O)COC(=O)CCC(=O)[O-])O.[Na+]'),
 ('A-POXIDE',
  'CHLORDIAZEPOXIDE HYDROCHLORIDE',
  'CN=C1CN(C(=C2C=C(C=CC2=N1)Cl)C3=CC=CC=C3)O.Cl'),
 ('A.P.L.', 'GONADOTROPIN, CHORIONIC', None),
 ('A/T/S',
  'ERYTHROMYCIN',
  'CCC1C(C(C(C(=O)C(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(O2)C)O)(C)OC)C)OC3C(C(CC(O3)C)N(C)C)O)(C)O)C)C)O)(C)O')]

In [240]:
df = pd.DataFrame(drug_smiles)
columns = ['Drug Name']
for i in range((len(df.columns)-1)//2): 
    columns.extend([f'Active Ingredient {i}', f'SMILES {i}'])
print(columns)
df.columns = columns
df.tail()

['Drug Name', 'Active Ingredient 0', 'SMILES 0', 'Active Ingredient 1', 'SMILES 1']


Unnamed: 0,Drug Name,Active Ingredient 0,SMILES 0,Active Ingredient 1,SMILES 1
5,ZYRTEC ALLERGY,CETIRIZINE HYDROCHLORIDE,C1CN(CCN1CCOCC(=O)O)C(C2=CC=CC=C2)C3=CC=C(C=C3...,,
6,ZYRTEC HIVES RELIEF,CETIRIZINE HYDROCHLORIDE,C1CN(CCN1CCOCC(=O)O)C(C2=CC=CC=C2)C3=CC=C(C=C3...,,
7,ZYRTEC-D 12 HOUR,PSEUDOEPHEDRINE HYDROCHLORIDE,CC(C(C1=CC=CC=C1)O)NC.Cl,CETIRIZINE HYDROCHLORIDE,C1CN(CCN1CCOCC(=O)O)C(C2=CC=CC=C2)C3=CC=C(C=C3...
8,ZYTIGA,ABIRATERONE ACETATE,CC(=O)OC1CCC2(C3CCC4(C(C3CC=C2C1)CC=C4C5=CN=CC...,,
9,ZYVOX,LINEZOLID,CC(=O)NCC1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F,,


In [242]:
df.to_csv("data/drug-actives.csv")