In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import re

In [2]:
from collections import Counter
from itertools import chain

# Preliminaries
## Directories

In [3]:
# create directory for outputs
# exist_ok=True keeps the cell safe to re-run and avoids the separate
# "check, then create" step, which can race with another process.
os.makedirs('maps', exist_ok=True)

## Load dataset
Data downloaded from FDA website: [link](https://verification.fda.gov.ph/drug_productslist.php)

In [4]:
# Read the product list export into a DataFrame; later cells read from `f`.
f = pd.read_csv('drug_products.csv')
# Preview the first three rows.
f.head(3)

Unnamed: 0,Registration Number,Generic Name,Brand Name,Dosage Strength,Dosage Form,Classification,Packaging,Manufacturer,Country of Origin,Trader,Importer,Distributor,Expiry Date
0,BR-041-03,Enoxaparin Sodium,Enoxbicare-40,4000 IU Anti-Factor Xa (equivalent to 40 mg) /...,Solution For Injection (SC),Prescription Drug (RX),0.4 mL in 1 mL pre-filled syringe (Box of 1 Bl...,Brawn Laboratories Ltd.,India,,Ambica International Corporation,Ambicare Pharmaceuticals Inc.,09 December 2025
1,BR-1000,"Japanese Encephalitis Virus (Live, Attenuated,...",IMOJEV,4.0-5.8 log 10 PFU** per dose (0.5mL),Suspension for Injection (S.C),Prescription Drug (RX),Type I borosilicate glass vial with vvm contai...,Government Pharmaceutical Organization-Merieux...,Thailand,Sanofi Pasteur Limited - Thailand,SANOFI PASTEUR,Zuellig Pharma Corporation,16 December 2020
2,BR-1001,Phospholipid Fraction from Bovine Lung,Alveofact,45 mg/mL (54 mg/1.2 mL),Lyophilized Powder for \r\nSuspension for Intr...,Prescription Drug (RX),Blister pack x 10's (Box of 100's),BAG Health Care GmbH,Germany,,"Glorious Dexa Mandaya, Inc.","Glorious Dexa Mandaya, Inc.",06 January 2025


# Drug Classification
OTC vs Rx vs HR

In [5]:
# Tally how often each raw 'Classification' value occurs
# (missing values show up as their own key in the tally).
c = Counter(f['Classification'])
c

Counter({'Prescription Drug (RX)': 19070,
         'Prescription Drug (Rx)': 3783,
         'Prescription Drugs (RX)': 1,
         'Solution for \r\nInjection (SC)': 1,
         'Over-the-Counter (OTC) Drug': 577,
         'Over-the-Counter (OTC)': 2556,
         'Over-The-Counter (OTC)': 1378,
         'Rx': 1,
         'Household Remedy (HR)': 1657,
         'Over-The-Counter (OTC) Drug': 46,
         'Over-the-counter (OTC) Drug': 44,
         nan: 3,
         'Prescription Drug (RX) Drug': 5,
         'Over The Counter (OTC) Drug': 1,
         'Prescription drug (Rx)': 1,
         'prescription Drug (Rx)': 1,
         'Over-the-Counter Drug (OTC)': 2,
         'Over-the Counter (OTC) Drug': 1,
         'Over-The-Counter(OTC)': 16,
         'None': 2,
         'Prescription (Rx) Drug': 1,
         'Over-the-Counter (OTC) drug': 1,
         'Over-the-Counter (OTC) Drug :': 1,
         'Over-The Counter (OTC)': 1,
         'Prescription Drug (Rx': 1,
         'Over-the-counter (OTC)':

In [6]:
def new_class(old_value):
    """Map a raw 'Classification' value onto one of three canonical labels.

    Parameters
    ----------
    old_value : str or null-like
        Raw classification text as found in the dataset.

    Returns
    -------
    str or None
        'Prescription Drug (Rx)', 'Over-the-Counter (OTC) Drug' or
        'Household Remedy (HR)'. None is returned for null input, for text
        containing 'none', and for text that matches no rule.
    """
    if pd.isnull(old_value):
        return None

    lowered = old_value.lower()

    # An explicit "none" wins over every other keyword.
    if re.search(r'.*(none).*', lowered):
        return None

    # Rules are tried in order; the first pattern found decides the label.
    rules = (
        (r'.*(rx|prescr|inj).*', 'Prescription Drug (Rx)'),
        (r'.*(otc).*', 'Over-the-Counter (OTC) Drug'),
        (r'.*(household).*', 'Household Remedy (HR)'),
    )
    for pattern, label in rules:
        if re.search(pattern, lowered):
            return label

    return None

## output map

In [7]:
# Mapping table with one row per distinct raw classification value;
# the 'new' column is filled in by a later cell.
class_map = pd.DataFrame(columns=['original','new'])
class_map['original'] = c.keys()

In [8]:
# Normalise every raw label with new_class, then display the resulting map.
class_map['new'] = class_map['original'].apply(new_class)
class_map

Unnamed: 0,original,new
0,Prescription Drug (RX),Prescription Drug (Rx)
1,Prescription Drug (Rx),Prescription Drug (Rx)
2,Prescription Drugs (RX),Prescription Drug (Rx)
3,Solution for \r\nInjection (SC),Prescription Drug (Rx)
4,Over-the-Counter (OTC) Drug,Over-the-Counter (OTC) Drug
5,Over-the-Counter (OTC),Over-the-Counter (OTC) Drug
6,Over-The-Counter (OTC),Over-the-Counter (OTC) Drug
7,Rx,Prescription Drug (Rx)
8,Household Remedy (HR),Household Remedy (HR)
9,Over-The-Counter (OTC) Drug,Over-the-Counter (OTC) Drug


In [9]:
# Save the classification map without the index column.
class_map.to_csv('maps/classes.csv',index=False)

# Dosage Form
Describe dosage forms. There are too many distinct raw values (2,252) to enumerate every possible combination by hand.

In [10]:
# Number of distinct raw dosage-form strings (a missing value counts as one),
# followed by the tally per string.
print(f['Dosage Form'].nunique(dropna=False))
Counter(f['Dosage Form'])

2252


Counter({'Solution For Injection (SC)': 68,
         'Suspension for Injection (S.C)': 1,
         'Lyophilized Powder for \r\nSuspension for Intratracheal Administration': 1,
         'Solution for Injection (I.V./S.C.)': 13,
         'Solution for Injection (IV/SC)': 131,
         'Solution For Injection (Im/Iv/Sc)': 18,
         'Suspension For Injection (SC)': 7,
         'Suspension For Injection': 11,
         'Solution for injection': 3,
         'Suspension for injection (SC)': 1,
         'Solution For Injection (I.V. / S.C.)': 2,
         'Solution For Injection (I.V./S.C.)': 8,
         'Suspension for Injection (IM)': 22,
         'Concentrated Solution for IV Infusion': 2,
         'Solution for Injection': 103,
         'Solution for Injection (IM/SC)': 8,
         'Solution for Injection (SC)': 80,
         'Concentrate for Solution for Infusion (IV)': 9,
         'Solution for Subcutaneous/Continuous SC Pump Infusion/ \r\nIntravenous Injection': 1,
         'Powder for 

Instead, we "tokenize" the dosage forms then attempt to get the elements by tagging.

In [11]:
# get unique words
def _tokenize_dosage_form(raw_form):
    """Split one raw dosage-form string into lowercase word tokens.

    The text is lowercased and split on whitespace and '/'. Every character
    that is not a word character or a hyphen is dropped, as are digits, and
    tokens left empty by that clean-up are discarded.

    Parameters
    ----------
    raw_form : str or null-like
        Raw 'Dosage Form' value.

    Returns
    -------
    list of str
        Cleaned tokens; an empty list for a missing value.
    """
    if pd.isnull(raw_form):
        return []

    # split then remove special characters
    pieces = re.split(r'\s+|\/', raw_form.lower())
    # Inside a character class '|' and '^' (when not first) are literals, so
    # the previous class [^\w|^\-] let those two characters through.
    cleaned = (re.sub(r'[^\w\-]|\d', '', piece) for piece in pieces)

    # remove blanks
    return [piece for piece in cleaned if piece != '']


# One token list per distinct raw value, in the key order of the Counter.
words_per_form = [
    _tokenize_dosage_form(raw_form)
    for raw_form in Counter(f['Dosage Form']).keys()
]

print(len(words_per_form))
# Preview only; the full list has one entry per distinct dosage form.
words_per_form[:20]

2252


[['solution', 'for', 'injection', 'sc'],
 ['suspension', 'for', 'injection', 'sc'],
 ['lyophilized',
  'powder',
  'for',
  'suspension',
  'for',
  'intratracheal',
  'administration'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['solution', 'for', 'injection', 'im', 'iv', 'sc'],
 ['suspension', 'for', 'injection', 'sc'],
 ['suspension', 'for', 'injection'],
 ['solution', 'for', 'injection'],
 ['suspension', 'for', 'injection', 'sc'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['suspension', 'for', 'injection', 'im'],
 ['concentrated', 'solution', 'for', 'iv', 'infusion'],
 ['solution', 'for', 'injection'],
 ['solution', 'for', 'injection', 'im', 'sc'],
 ['solution', 'for', 'injection', 'sc'],
 ['concentrate', 'for', 'solution', 'for', 'infusion', 'iv'],
 ['solution',
  'for',
  'subcutaneous',
  'continuous',
  'sc',
  'pump',
  'infusion',
  'intravenous',
  'injection'],
 ['powde

## Elements

We use the following to describe the drug format:

- **form**: what type of solid, liquid, etc it is
- **use**: how the drug is used. This is used as the "form" if the above is missing.
- **route**: where the drug is used
- **special**: physical properties of the drug

In [12]:
# Template of the attributes described for every dosage form; each starts as None.
prop_dict = dict.fromkeys(
    (
        'form',     # capsule, solution, concentrate
        'use',      # infusion, injection, etc
        'route',    # oral, topical
        'special',  # modified-release, lyophilized
    )
)

### 1. format

e.g., capsule, solution, concentrate

In [13]:
# Regex fragment per drug format. Order matters: find_format returns the
# first entry whose pattern is found in the tokenized dosage form.
#
# All fragments are raw strings. Previously '\bgas', '\bgel' and '\bpad' were
# ordinary strings, where '\b' is a backspace character rather than a regex
# word boundary, so those three entries could never match.
#
# NOTE(review): most fragments match anywhere inside a word. For example
# 'sus(pension)?' also matches the 'sus' in 'sustained', and suspension comes
# before tablet/capsule in this ordering. Confirm whether that is intended.
_format_fragments = dict(
    # Bounded on both sides: this entry is checked first, and a bare
    # r'\bgas' would also capture words such as 'gastro-resistant'.
    gas = r'\bgas(es)?\b',
    powder = r'pow(der)?',
    # 'concentrate' but not 'concentrated'; bare 'conc' only at a word end.
    concentrate = r'concentrate(?!d)|conc(?!\w)',
    # 'soln' / 'solution', or the stand-alone abbreviation 'sol'. The leading
    # \b stops the 'sol' at the end of 'aerosol' from being tagged as solution.
    solution = r'sol(utio)?n|\bsol(?!\w)',
    suspension = r'sus(pension)?',
    emulsion = r'emuls(ion)?',
    liquid = r'liquid',
    inhaler = r'inhal?',
    syringe = r'syringe',
    syrup = r'syrup',
    elixir = r'elixir',
    tablet = r'tab(let)?',
    capsule = r'cap(sule)?',
    ointment = r'oin(tment)?',
    lyophilisate = r'lyophil',
    patch = r'patch',
    spray = r'spray',
    cream = r'cream',
    lotion = r'lotion',
    paste = r'paste',
    aerosol = r'aerosol',
    shampoo = r'shamp(oo)?',
    jelly = r'jelly',
    alcohol = r'alc(ohol)?',
    protectant = r'protec(tant)?',
    plaster = r'plaster',
    implant = r'implant',
    pessary = r'pessary?',
    troche = r'troche',
    gargle = r'gargle',
    dragee = r'dragee?',
    lozenge = r'lozenge',
    pastille = r'pastille',
    system = r'sys(tem)?',
    granules = r'granul(es)?',
    drops = r'drops?',
    liniment = r'liniment',
    tincture = r'tincture',
    cleanser = r'cleanser',
    sachet = r'sachet',
    embrocation = r'embrocation',
    paint = r'paint',
    douche = r'douche',
    stick = r'stick',
    soap = r'soap',
    strips = r'strips?',
    suppository = r'supp?osit(or)?',
    vial = r'vial',
    mass = r'mass',
    tube = r'tube',
    wipe = r'wipe',
    enema = r'enema',
    insert = r'insert',
    gel = r'\bgel',
    pad = r'\bpad',
    kit = r'kit',
    oil = r'oil',
    gum = r'gum',
    tea = r'tea',
)

# Wrap every fragment in one outer group. The result is built as a new dict
# so the raw fragments above stay readable and self-balanced.
dict_format = {
    name: fr'({fragment})' for name, fragment in _format_fragments.items()
}

In [14]:
def find_format(joined_tokens):
    """Return the name of the first format whose pattern occurs in the text.

    Parameters
    ----------
    joined_tokens : str
        Tokenized dosage form, tokens joined into a single string.

    Returns
    -------
    str or None
        Key of the first entry of ``dict_format`` (in insertion order) whose
        regex is found in ``joined_tokens``; None when no entry matches.
    """
    matching_names = (
        name
        for name, pattern in dict_format.items()
        if re.search(pattern, joined_tokens)
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(matching_names, None)

In [15]:
# One row per distinct raw dosage form, paired with its space-joined tokens.
# Both columns rely on the key order of Counter(f['Dosage Form']), which is
# the order words_per_form was built in.
format_map = pd.DataFrame(columns=['original','tokenized','drug_format'])
format_map['original'] = Counter(f['Dosage Form']).keys()
format_map['tokenized'] = [' '.join(w) for w in words_per_form]

In [16]:
# Tag each tokenized form with the format name returned by find_format
# (None when no pattern matches), then display the map.
format_map['drug_format'] = format_map['tokenized'].apply(find_format)
format_map

Unnamed: 0,original,tokenized,drug_format
0,Solution For Injection (SC),solution for injection sc,solution
1,Suspension for Injection (S.C),suspension for injection sc,suspension
2,Lyophilized Powder for \r\nSuspension for Intr...,lyophilized powder for suspension for intratra...,powder
3,Solution for Injection (I.V./S.C.),solution for injection iv sc,solution
4,Solution for Injection (IV/SC),solution for injection iv sc,solution
...,...,...,...
2247,Test Kits,test kits,kit
2248,Freeze Dried Powder for Suspension (Intraocula...,freeze dried powder for suspension intraocular...,powder
2249,Freeze Dried Powder for Suspension (Oral/Intra...,freeze dried powder for suspension oral intran...,powder
2250,Feed Premix Solution,feed premix solution,solution


In [17]:
# Save the dosage-form map without the index column.
format_map.to_csv('maps/drug_format.csv',index=False)

### 2. use

e.g., for infusion, injection, inhalation

In [None]:
# Regex fragment per "use" keyword (how the drug is administered).
dict_use = {
    'infusion': 'inf(usion)?',
    'injection': 'inj(ection)?',
    'inhalation': 'inhal(ation)?',
    'nebulization': 'nebul?',
    'application': 'applicat?',
    'instillation': 'instill',
    'suspension': 'susp',
}

In [10]:
# NOTE(review): this cell repeats the earlier 'Dosage Form' summary cell and
# carries an out-of-order execution count; consider removing it so the
# notebook reads top to bottom.
# Number of distinct raw dosage-form strings (a missing value counts as one),
# followed by the tally per string.
print(f['Dosage Form'].nunique(dropna=False))
Counter(f['Dosage Form'])

2252


Counter({'Solution For Injection (SC)': 68,
         'Suspension for Injection (S.C)': 1,
         'Lyophilized Powder for \r\nSuspension for Intratracheal Administration': 1,
         'Solution for Injection (I.V./S.C.)': 13,
         'Solution for Injection (IV/SC)': 131,
         'Solution For Injection (Im/Iv/Sc)': 18,
         'Suspension For Injection (SC)': 7,
         'Suspension For Injection': 11,
         'Solution for injection': 3,
         'Suspension for injection (SC)': 1,
         'Solution For Injection (I.V. / S.C.)': 2,
         'Solution For Injection (I.V./S.C.)': 8,
         'Suspension for Injection (IM)': 22,
         'Concentrated Solution for IV Infusion': 2,
         'Solution for Injection': 103,
         'Solution for Injection (IM/SC)': 8,
         'Solution for Injection (SC)': 80,
         'Concentrate for Solution for Infusion (IV)': 9,
         'Solution for Subcutaneous/Continuous SC Pump Infusion/ \r\nIntravenous Injection': 1,
         'Powder for 