In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import json
import os
import re

In [2]:
from collections import Counter
from itertools import chain

# Preliminaries
## Directories

In [3]:
# create directory for outputs
if not(os.path.exists('maps')):
    os.mkdir('maps')

## Load dataset
Data downloaded from FDA website: [link](https://verification.fda.gov.ph/drug_productslist.php)

In [4]:
f = pd.read_csv('drug_products.csv')
f.head(3)

Unnamed: 0,Registration Number,Generic Name,Brand Name,Dosage Strength,Dosage Form,Classification,Packaging,Manufacturer,Country of Origin,Trader,Importer,Distributor,Expiry Date
0,BR-041-03,Enoxaparin Sodium,Enoxbicare-40,4000 IU Anti-Factor Xa (equivalent to 40 mg) /...,Solution For Injection (SC),Prescription Drug (RX),0.4 mL in 1 mL pre-filled syringe (Box of 1 Bl...,Brawn Laboratories Ltd.,India,,Ambica International Corporation,Ambicare Pharmaceuticals Inc.,09 December 2025
1,BR-1000,"Japanese Encephalitis Virus (Live, Attenuated,...",IMOJEV,4.0-5.8 log 10 PFU** per dose (0.5mL),Suspension for Injection (S.C),Prescription Drug (RX),Type I borosilicate glass vial with vvm contai...,Government Pharmaceutical Organization-Merieux...,Thailand,Sanofi Pasteur Limited - Thailand,SANOFI PASTEUR,Zuellig Pharma Corporation,16 December 2020
2,BR-1001,Phospholipid Fraction from Bovine Lung,Alveofact,45 mg/mL (54 mg/1.2 mL),Lyophilized Powder for \r\nSuspension for Intr...,Prescription Drug (RX),Blister pack x 10's (Box of 100's),BAG Health Care GmbH,Germany,,"Glorious Dexa Mandaya, Inc.","Glorious Dexa Mandaya, Inc.",06 January 2025


# Drug Classification
OTC vs Rx vs HR

In [5]:
c = Counter(f['Classification'])
c

Counter({'Prescription Drug (RX)': 19070,
         'Prescription Drug (Rx)': 3783,
         'Prescription Drugs (RX)': 1,
         'Solution for \r\nInjection (SC)': 1,
         'Over-the-Counter (OTC) Drug': 577,
         'Over-the-Counter (OTC)': 2556,
         'Over-The-Counter (OTC)': 1378,
         'Rx': 1,
         'Household Remedy (HR)': 1657,
         'Over-The-Counter (OTC) Drug': 46,
         'Over-the-counter (OTC) Drug': 44,
         nan: 3,
         'Prescription Drug (RX) Drug': 5,
         'Over The Counter (OTC) Drug': 1,
         'Prescription drug (Rx)': 1,
         'prescription Drug (Rx)': 1,
         'Over-the-Counter Drug (OTC)': 2,
         'Over-the Counter (OTC) Drug': 1,
         'Over-The-Counter(OTC)': 16,
         'None': 2,
         'Prescription (Rx) Drug': 1,
         'Over-the-Counter (OTC) drug': 1,
         'Over-the-Counter (OTC) Drug :': 1,
         'Over-The Counter (OTC)': 1,
         'Prescription Drug (Rx': 1,
         'Over-the-counter (OTC)':

In [6]:
def new_class(old_value):
    
    if pd.isnull(old_value):
        return None
    if re.search(r'.*(none).*', old_value.lower()):
        return None
    
    if re.search(r'.*(rx|prescr|inj).*', old_value.lower()):
        return 'Prescription Drug (Rx)'
    
    if re.search(r'.*(otc).*', old_value.lower()):
        return 'Over-the-Counter (OTC) Drug'
    
    if re.search(r'.*(household).*', old_value.lower()):
        return 'Household Remedy (HR)'

## Output map

In [7]:
class_map = pd.DataFrame(columns=['original','new'])
class_map['original'] = c.keys()

In [8]:
class_map['new'] = class_map['original'].apply(new_class)
class_map

Unnamed: 0,original,new
0,Prescription Drug (RX),Prescription Drug (Rx)
1,Prescription Drug (Rx),Prescription Drug (Rx)
2,Prescription Drugs (RX),Prescription Drug (Rx)
3,Solution for \r\nInjection (SC),Prescription Drug (Rx)
4,Over-the-Counter (OTC) Drug,Over-the-Counter (OTC) Drug
5,Over-the-Counter (OTC),Over-the-Counter (OTC) Drug
6,Over-The-Counter (OTC),Over-the-Counter (OTC) Drug
7,Rx,Prescription Drug (Rx)
8,Household Remedy (HR),Household Remedy (HR)
9,Over-The-Counter (OTC) Drug,Over-the-Counter (OTC) Drug


In [9]:
class_map.to_csv('maps/classes.csv',index=False)

# Dosage Form
Describe dosage forms. Too tedious to find all possible combinations.

In [10]:
print(f['Dosage Form'].nunique(dropna=False))
Counter(f['Dosage Form'])

2252


Counter({'Solution For Injection (SC)': 68,
         'Suspension for Injection (S.C)': 1,
         'Lyophilized Powder for \r\nSuspension for Intratracheal Administration': 1,
         'Solution for Injection (I.V./S.C.)': 13,
         'Solution for Injection (IV/SC)': 131,
         'Solution For Injection (Im/Iv/Sc)': 18,
         'Suspension For Injection (SC)': 7,
         'Suspension For Injection': 11,
         'Solution for injection': 3,
         'Suspension for injection (SC)': 1,
         'Solution For Injection (I.V. / S.C.)': 2,
         'Solution For Injection (I.V./S.C.)': 8,
         'Suspension for Injection (IM)': 22,
         'Concentrated Solution for IV Infusion': 2,
         'Solution for Injection': 103,
         'Solution for Injection (IM/SC)': 8,
         'Solution for Injection (SC)': 80,
         'Concentrate for Solution for Infusion (IV)': 9,
         'Solution for Subcutaneous/Continuous SC Pump Infusion/ \r\nIntravenous Injection': 1,
         'Powder for 

Instead, we "tokenize" the dosage forms then attempt to get the elements by tagging.

In [11]:
# sentence format
dosage_forms_toks = []
# word format
dosage_forms_sent = []


for k,i in enumerate(Counter(f['Dosage Form']).keys()):
    
    if pd.isnull(i):
        dosage_forms_toks += [[]]
        dosage_forms_sent += [[]]
    else:
        # split then remove special characters
        s = re.split(r'\s+|\/',i.lower())
        s = [re.sub(r'[^\w|^\-]|\d','',j) for j in s]

        # remove blanks
        s = [j for j in s if j != '']
        
        dosage_forms_toks += [s]
        dosage_forms_sent += [' '.join([j for j in s])]
        

print(len(dosage_forms_toks))

2252


In [12]:
dosage_forms_toks[:10]

[['solution', 'for', 'injection', 'sc'],
 ['suspension', 'for', 'injection', 'sc'],
 ['lyophilized',
  'powder',
  'for',
  'suspension',
  'for',
  'intratracheal',
  'administration'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['solution', 'for', 'injection', 'iv', 'sc'],
 ['solution', 'for', 'injection', 'im', 'iv', 'sc'],
 ['suspension', 'for', 'injection', 'sc'],
 ['suspension', 'for', 'injection'],
 ['solution', 'for', 'injection'],
 ['suspension', 'for', 'injection', 'sc']]

In [13]:
dosage_forms_sent[:10]

['solution for injection sc',
 'suspension for injection sc',
 'lyophilized powder for suspension for intratracheal administration',
 'solution for injection iv sc',
 'solution for injection iv sc',
 'solution for injection im iv sc',
 'suspension for injection sc',
 'suspension for injection',
 'solution for injection',
 'suspension for injection sc']

In [14]:
format_map = pd.DataFrame(columns=['original','tok_sent'])
format_map['original'] = Counter(f['Dosage Form']).keys()
format_map['tok_sent'] = dosage_forms_sent
format_map.head(10)

Unnamed: 0,original,tok_sent
0,Solution For Injection (SC),solution for injection sc
1,Suspension for Injection (S.C),suspension for injection sc
2,Lyophilized Powder for \r\nSuspension for Intr...,lyophilized powder for suspension for intratra...
3,Solution for Injection (I.V./S.C.),solution for injection iv sc
4,Solution for Injection (IV/SC),solution for injection iv sc
5,Solution For Injection (Im/Iv/Sc),solution for injection im iv sc
6,Suspension For Injection (SC),suspension for injection sc
7,Suspension For Injection,suspension for injection
8,Solution for injection,solution for injection
9,Suspension for injection (SC),suspension for injection sc


# Properties

We use the following properties in general to describe the drug:

- **modifier**: these are special physical properties of the drug. e.g., orodispersible, extended-release, cherry-flavored, freeze-dried. These are mostly adjectives.
- **form**: the format of the drug. Is it solid, liquid, gas, solution, emulsion, etc
- **route**: how the drug is administered. This can be through injection, inhalation, topical application, etc. It can be specific or broad.


We see that if we assume that the phrase is of the format:

$$(\text{modifier})-(\text{form})-\text{for}-(\text{route})$$

we can generally approach the problem by splitting the drug description by finding the word "for". For example, we can ideally split *sterile freeze-dried powder for iv infusion or injection* into

$$\text{sterile freeze-dried} + \text{powder} + \text{for} + \text{iv infusion or injection}$$

Note that both *sterile freeze-fried* and *iv infusion or injection* can be further divided by finding any nouns, adjectives, and adverbs within the subphrases.

## 1. Separating the route from the dosage form
Both subphrases can contain modifiers and nouns. Splitting the drug description early on can thus allow us to improve search performance.

In [15]:
def get_subphrases(phrase):
    
    # return subphrases immediate if splitting is success
    try:
        phr = re.split(r'\s+?\bfor\b\s+?', phrase, maxsplit=1)
        if len(phr)==2: return phr
    except TypeError:
        return ['', '']
    
    # check if subphrase is blank invalid or if all splits are blanks
    if phr == ['']:
        return ['', '']
    # check if phrase starts with "for"; return this as the RoA
    else:
        try:
            mat = re.search(r'(?<=for\b\s).+?$', phr[0]).group(0)
            return ['', mat]
        except (TypeError, AttributeError):
            pass
    
    # if neither, treat extracted subphrase as the other subphrase drug form
    return [phr[0], '']

In [16]:
format_map['drug_form_phrase'], format_map['drug_roa_phrase'] = zip(*format_map['tok_sent'].apply(get_subphrases))
format_map.head(10)

Unnamed: 0,original,tok_sent,drug_form_phrase,drug_roa_phrase
0,Solution For Injection (SC),solution for injection sc,solution,injection sc
1,Suspension for Injection (S.C),suspension for injection sc,suspension,injection sc
2,Lyophilized Powder for \r\nSuspension for Intr...,lyophilized powder for suspension for intratra...,lyophilized powder,suspension for intratracheal administration
3,Solution for Injection (I.V./S.C.),solution for injection iv sc,solution,injection iv sc
4,Solution for Injection (IV/SC),solution for injection iv sc,solution,injection iv sc
5,Solution For Injection (Im/Iv/Sc),solution for injection im iv sc,solution,injection im iv sc
6,Suspension For Injection (SC),suspension for injection sc,suspension,injection sc
7,Suspension For Injection,suspension for injection,suspension,injection
8,Solution for injection,solution for injection,solution,injection
9,Suspension for injection (SC),suspension for injection sc,suspension,injection sc


## 2. Extracting the drug formats

In [18]:
# open drug formats file and convert to regex pattern
# \b<>|<>|...\b
with open('json_data/drug_formats.json','r') as file:
    drug_formats = json.load(file)
    
# RoA delivery
roa_nouns = [r'\b(' + '|'.join(v.values()) + ')\\b' for k,v in drug_formats.items()]

In [19]:
def get_drug_format(subphrase):
    
    drug_format = None
    for roa in roa_nouns:
        try:
            drug_format = re.search(roa, subphrase).group(0)
        except (TypeError, AttributeError):
            continue
    
    return drug_format

In [20]:
format_map['form_format'] = format_map['drug_form_phrase'].apply(get_drug_format)
format_map['roa_format'] = format_map['drug_roa_phrase'].apply(get_drug_format)
format_map.head(10)

Unnamed: 0,original,tok_sent,drug_form_phrase,drug_roa_phrase,form_format,roa_format
0,Solution For Injection (SC),solution for injection sc,solution,injection sc,solution,
1,Suspension for Injection (S.C),suspension for injection sc,suspension,injection sc,suspension,
2,Lyophilized Powder for \r\nSuspension for Intr...,lyophilized powder for suspension for intratra...,lyophilized powder,suspension for intratracheal administration,powder,suspension
3,Solution for Injection (I.V./S.C.),solution for injection iv sc,solution,injection iv sc,solution,
4,Solution for Injection (IV/SC),solution for injection iv sc,solution,injection iv sc,solution,
5,Solution For Injection (Im/Iv/Sc),solution for injection im iv sc,solution,injection im iv sc,solution,
6,Suspension For Injection (SC),suspension for injection sc,suspension,injection sc,suspension,
7,Suspension For Injection,suspension for injection,suspension,injection,suspension,
8,Solution for injection,solution for injection,solution,injection,solution,
9,Suspension for injection (SC),suspension for injection sc,suspension,injection sc,suspension,


In [21]:
format_map.to_csv('maps/format_map.csv',index=False)

## 3. Modifiers
Getting the modifiers is a bit different. For the first subphrase (`drug_form_subphrase`), it is rather straightforward and may be just as simple as scanning for a list of words preceding the extracted nouns from step 2. However, routes of administration (`drug_roa_subphrase`) have an additional level of detail. 

### a. Routes of administration

A drug can be described by how it is administered (e.g., orally, gargled, by injection), and if so again, *how*.

In [22]:
# RoA adjectives
roa_adjs = [r'\b(im|iv|id|ia|sc|ivt)\b', r'\b(intra\-?\w+?\b)', r'\b(\w+?ic)\b', r'\b(in\-ovo)\b'
            r'\b(subcut.*?|epidur.*?)\b', r'\b(\w+?al)\b(?!\s+?free$)', r'\b(dilu.*?|reconsti.*?)\b']

# RoA nouns (action verbs)
roa_actions = [r'\b(inf(?:usion)?|inj(?:ection)?|inhal(?:ation)|instillation)\b',
               r'\b(gargl(?:ing|e)|nebuliz(?:ation|ing)?|sus(?:pension)?)\b',
               r'\b(filt(?:ering|ration))\b']

In [25]:
def get_roa_details(phrase):
    
    # capture one word before "for" just in case
    # try:
    #    roa_phrase = re.search(r'\bfor\b.+?$', phrase).group(0)
    #except (TypeError, AttributeError):
    #    roa_phrase = None
    
    
    # get all RoA descriptors
    roa_path = None
    roa_technique = None
    
    if True: # roa_phrase is not None:
        roa_path = []
        for roa in roa_adjs:
            r = re.findall(roa, phrase)
            roa_path += r
        roa_path = list(set(roa_path))
        
        roa_technique = []
        for roa in roa_actions:
            r = re.findall(roa, phrase)
            roa_technique += r
        roa_technique = list(set(roa_technique))
    
    
    return roa_path, roa_technique

In [26]:
format_map['roa_detail_1'], format_map['roa_detail_2'] = zip(*format_map['drug_roa_phrase'].apply(get_roa_details))
format_map.head(10)

Unnamed: 0,original,tok_sent,drug_form_phrase,drug_roa_phrase,form_format,roa_format,roa_detail_1,roa_detail_2
0,Solution For Injection (SC),solution for injection sc,solution,injection sc,solution,,[sc],[injection]
1,Suspension for Injection (S.C),suspension for injection sc,suspension,injection sc,suspension,,[sc],[injection]
2,Lyophilized Powder for \r\nSuspension for Intr...,lyophilized powder for suspension for intratra...,lyophilized powder,suspension for intratracheal administration,powder,suspension,[intratracheal],[suspension]
3,Solution for Injection (I.V./S.C.),solution for injection iv sc,solution,injection iv sc,solution,,"[sc, iv]",[injection]
4,Solution for Injection (IV/SC),solution for injection iv sc,solution,injection iv sc,solution,,"[sc, iv]",[injection]
5,Solution For Injection (Im/Iv/Sc),solution for injection im iv sc,solution,injection im iv sc,solution,,"[im, sc, iv]",[injection]
6,Suspension For Injection (SC),suspension for injection sc,suspension,injection sc,suspension,,[sc],[injection]
7,Suspension For Injection,suspension for injection,suspension,injection,suspension,,[],[injection]
8,Solution for injection,solution for injection,solution,injection,solution,,[],[injection]
9,Suspension for injection (SC),suspension for injection sc,suspension,injection sc,suspension,,[sc],[injection]


In [27]:
format_map.to_csv('maps/format_map.csv',index=False)

### b. Physical format

This refers to the physical properties of the drug. We will use another `json` file for this.