In [2]:
# Web Scraper for https://global.essentialmeds.org/dashboard/countries/
# Created by James Hu with ChatGPT; Rethink Priorities, 2023

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# Build the Firefox options object used by the scraping cells. Every flag in the
# tuple is registered on it, in this order, via `add_argument`.
options = Options()
for browser_flag in ('-headless', '--disable-notifications', '--no-sandbox', '--verbose'):
    options.add_argument(browser_flag)

# Scrape configuration and the containers the scraping cell fills in
NUM_COUNTRIES = 137  # country dashboard pages are numbered 1..NUM_COUNTRIES
SCROLL_HEIGHT = 38   # the scraping cell treats this as the height of one product element
country_to_consistencies = {}
country_to_differences = {}

# One dashboard URL per country id
COUNTRY_URL_PREFIX = 'https://global.essentialmeds.org/dashboard/countries/'
urls = [COUNTRY_URL_PREFIX + str(country_id) for country_id in range(1, NUM_COUNTRIES + 1)]

In [3]:
def _collect_scrolled_products(driver, scrollable_element, scroll_step, pause=0.2):
    """Scroll an inner scrollable element step by step and gather product names.

    After every scroll the page source is re-parsed and the stripped text of every
    `.css-t9ct1g` element is added to a set. Scrolling stops as soon as one scroll
    adds no new name.

    Args:
        driver: the Selenium driver showing the page.
        scrollable_element: the element whose `scrollTop` is advanced.
        scroll_step: amount added to the scroll position on each iteration.
        pause: seconds to sleep after each scroll before reading the page.

    Returns:
        (products, soup): the set of product names, and the BeautifulSoup parse of
        the page source taken after the final scroll.
    """
    products = set()
    scroll_position = 0

    while True:
        driver.execute_script(f"arguments[0].scrollTop = {scroll_position};", scrollable_element)
        scroll_position += scroll_step
        time.sleep(pause)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        count_before = len(products)
        products.update(tag.text.strip() for tag in soup.select('.css-t9ct1g'))

        # A scroll that reveals nothing new is taken to mean the end of the list
        if len(products) == count_before:
            return products, soup


# Locators shared by both tabs of a country page
SCROLLABLE_LIST = (By.CSS_SELECTOR, '.css-o6nvc9 > div:nth-child(2)')
DIFFERENCES_TAB = (By.XPATH, '/html/body/div/div/div[1]/div[2]/div[3]/div[1]/div[2]')

# Start the browser
driver = webdriver.Firefox(options=options)
wait = WebDriverWait(driver, 10)  # Initialize wait object

# Scroll by the height of 10 product elements at a time
scroll_step = SCROLL_HEIGHT*10

try:
    for url in urls:
        # ITEMS CONSISTENT WITH WORLD HEALTH ORGANIZATION LIST
        driver.get(url)
        scrollable_element = wait.until(EC.presence_of_element_located(SCROLLABLE_LIST))
        consistencies, soup = _collect_scrolled_products(driver, scrollable_element, scroll_step)

        # The country name is read from the last parsed snapshot of the page
        country = soup.select('.css-147a4oj > h4:nth-child(1)')[0].text.strip()
        country_to_consistencies[country] = consistencies

        # ITEMS DIFFERENT FROM WORLD HEALTH ORGANIZATION LIST
        # Click the "Differences with the WHO list" button
        element_to_click = wait.until(EC.element_to_be_clickable(DIFFERENCES_TAB))
        element_to_click.click()
        time.sleep(0.2)

        # The list is looked up again after the click, then scrolled from the top
        scrollable_element = wait.until(EC.presence_of_element_located(SCROLLABLE_LIST))
        differences = _collect_scrolled_products(driver, scrollable_element, scroll_step)[0]
        country_to_differences[country] = differences
finally:
    # Close the browser even when the loop is interrupted or a page fails, so
    # no headless Firefox process is left behind
    driver.quit()

KeyboardInterrupt: 

In [56]:
import pandas as pd

# WHO EML reference set: every product that any country page listed as consistent
# with the WHO list. `set().union(...)` also works when nothing was scraped.
who_eml = set().union(*country_to_consistencies.values())

# Per-country product sets (consistent + different). A country whose scrape
# stopped before its differences were stored falls back to an empty set instead
# of raising KeyError.
all_products = {
    country: products | country_to_differences.get(country, set())
    for country, products in country_to_consistencies.items()
}

# Add the WHO EML column
all_products["WHO EML"] = who_eml

# All unique products, sorted alphabetically
unique_products = sorted(who_eml.union(*country_to_differences.values()))

# Column order: product name, WHO EML flag, then countries alphabetically
sorted_countries = sorted(col for col in all_products if col != "WHO EML")
columns = ["Product", "WHO EML"] + sorted_countries

# Build every row first and create the DataFrame once. `DataFrame.append` no
# longer exists in pandas 2.x, and appending row by row is quadratic.
records = []
for product in unique_products:
    row_data = {col: int(product in product_set) for col, product_set in all_products.items()}
    row_data["Product"] = product
    records.append(row_data)

df = pd.DataFrame(records, columns=columns)

# Export DataFrame to an Excel file
df.to_excel("products_eml.xlsx", index=False)

In [2]:
# Analysis (DataFrame reimported from Excel)

import pandas as pd

# Read the Excel file
# 'products_eml.xlsx' is the file name the export cell above writes with
# `df.to_excel`, so the analysis below can run from the saved workbook.
file_path = 'products_eml.xlsx'
data = pd.read_excel(file_path, engine='openpyxl')

In [3]:
# Names of the two non-country columns in `data`
product_column = 'Product'
who_eml_column = 'WHO EML'

# Keep only the rows whose WHO EML flag is not 1
not_on_who_eml = data[who_eml_column].ne(1)
filtered_data = data[not_on_who_eml]

# Every other column is treated as a country column
country_columns = [
    col for col in filtered_data.columns
    if col != product_column and col != who_eml_column
]

# Long format: one row per (product, country) pair, then keep included pairs only
long_table = filtered_data.melt(
    id_vars=product_column,
    value_vars=country_columns,
    var_name='Country',
    value_name='Included',
)
all_products = long_table[long_table['Included'].eq(1)]

# Number of country lists each product appears on, most frequent first
product_counts = all_products[product_column].value_counts().reset_index()
product_counts.columns = ['Product', 'Count']
sorted_products = product_counts.sort_values(by='Count', ascending=False)

# Write the ranking to its own workbook
output_file_path = 'non-WHO_products_by_freq_on_NEMLs.xlsx'
sorted_products.to_excel(output_file_path, index=False, engine='openpyxl')

In [4]:
# Spot check: look up one product by exact name and report whether its WHO EML
# flag is 0.
to_check = 'Glibenclamide (Glyburide)'
product_row = data[data[product_column] == to_check]

# Without this guard a missing name only surfaces as an opaque out-of-bounds
# error from `.iloc[0]`; the exception type is kept, the message names the cause.
if product_row.empty:
    raise IndexError(f"'{to_check}' was not found in the '{product_column}' column")

is_not_on_eml = product_row[who_eml_column].iloc[0] == 0

print(f"Is '{to_check}' not on the WHO EML? {is_not_on_eml}")

Is 'Glibenclamide (Glyburide)' not on the WHO EML? True


In [5]:
# Check that all WHO EML products were covered in previous scrape
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# Set up headless Firefox options
options = Options()
options.add_argument('-headless')
options.add_argument('--disable-notifications')
options.add_argument('--no-sandbox')
options.add_argument('--verbose')

# Initialize constants and dictionaries
SCROLL_HEIGHT = 38
products_to_inclusion = {}  # product name -> True when its marker span contains 'check'

# Single page to scrape: the medicines overview
url = "https://global.essentialmeds.org/dashboard/medicines"

# Start the browser
driver = webdriver.Firefox(options=options)
wait = WebDriverWait(driver, 10)  # Initialize wait object

try:
    # Open the webpage
    driver.get(url)

    # Locate the scrollable element
    scrollable_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.css-o6nvc9 > div:nth-child(2)')))

    # Scroll through the internal scrollable element to load all products
    scroll_step = SCROLL_HEIGHT*10  # Set the scroll_step equal to the height of 10 product elements
    scroll_position = 0
    all_products = set()

    while True:
        # Scroll down
        driver.execute_script(f"arguments[0].scrollTop = {scroll_position};", scrollable_element)
        scroll_position += scroll_step
        time.sleep(0.2)

        # Get the page source after scrolling
        html = driver.page_source

        # Use BeautifulSoup to parse and extract the data
        soup = BeautifulSoup(html, 'html.parser')
        product_list = soup.select('.css-t9ct1g')
        on_eml = soup.select('.css-7s7c1t')

        # Record each visible product and its flag.
        # NOTE(review): assumes the i-th '.css-7s7c1t' span belongs to the i-th
        # '.css-t9ct1g' product - TODO confirm against the page markup.
        current_product_count = len(all_products)
        for i, product in enumerate(product_list):
            product_name = product.text.strip()
            all_products.add(product_name)
            products_to_inclusion[product_name] = 'check' in str(on_eml[i])

        # Break the loop if the product count doesn't change after scrolling
        if len(all_products) == current_product_count:
            break
finally:
    # Close the browser even if a wait times out or the run is interrupted, so
    # no headless Firefox process is left behind
    driver.quit()

In [6]:
# The flags are booleans, so summing only the truthy ones yields how many
# products are marked as on the WHO EML.
count = sum(filter(None, products_to_inclusion.values()))

print("Number of items in the WHO EML:", count)

Number of items in the WHO EML: 414


In [23]:
# NOTE(review): `whether_on_eml_list` is not assigned in any cell visible in this
# notebook, so this cell relies on leftover kernel state. It is presumably the
# list of '.css-7s7c1t' marker spans from the medicines scrape - TODO confirm.
# The expression is True when the text 'check' occurs anywhere in the list's
# string form.
'check' in str(whether_on_eml_list)

True

In [24]:
# Display the marker list for inspection.
# NOTE(review): no visible cell assigns `whether_on_eml_list`; on a fresh kernel
# this cell would raise NameError - TODO confirm where the name came from.
whether_on_eml_list

[<span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"><img src="/fonts/check.svg?b5cb3b7653ef05cd1f1f7c93fa070992"/></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>]

In [31]:


import pandas as pd

# Alias for the scraped {product name: flag} mapping
product_dict = products_to_inclusion

# Products whose scraped flag is truthy
true_products_dict = {name for name, flagged in product_dict.items() if flagged}

# Products carrying a 1 in the "WHO EML" column of `data`
who_eml_rows = data['WHO EML'].eq(1)
true_products_df = set(data.loc[who_eml_rows, 'Product'])

# Differences in both directions between the two sources
products_only_in_dict = true_products_dict.difference(true_products_df)
products_only_in_df = true_products_df.difference(true_products_dict)

print("Products only in the dictionary:")
print(products_only_in_dict)

print("Products only in the DataFrame:")
print(products_only_in_df)

Products only in the dictionary:
{'Pyronaridine', 'Velpatasvir'}
Products only in the DataFrame:
set()


In [11]:
# Names of the products whose scraped flag compares equal to 1
who_eml_web_scraped = {
    product_name
    for product_name, on_who_eml in products_to_inclusion.items()
    if on_who_eml == 1
}

In [12]:
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
import fitz

def process_pdf(file_path, start_page, end_page):
    """Extract item names from a page range of a PDF, grouped by typeface.

    Only text spans whose font name contains "Times New Roman" are read. The
    spans of each line are joined (italic spans wrapped in ``<i>...</i>``), the
    line is cut into items by a regular expression, and each item is filed as:

    * complementary - the item contains an italic marker,
    * core          - no italic marker and no "+" (all-digit items are skipped),
    * combined      - no italic marker but contains a "+".

    Args:
        file_path: path of the PDF, passed to ``fitz.open``.
        start_page: first page index to read; used directly as ``doc[page_num]``.
        end_page: last page index to read, inclusive.

    Returns:
        (core_items, complementary_items, combined_split_items): three sets of
        strings. The third holds the " + "-separated parts of the combined
        items that are in neither of the first two sets.
    """
    core_items = set()
    complementary_items = set()
    combined_items = set()

    # Either one whole italic span, or a run of at least two characters that
    # contains no comma and does not start or end with whitespace or "<"
    item_pattern = re.compile(r'((?:<i>[^<]*</i>|[^<\s,][^,]*[^<\s,]))')

    doc = fitz.open(file_path)
    try:
        for page_num in range(start_page, end_page + 1):
            page = doc[page_num]

            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    line_items = []
                    for span in line["spans"]:
                        if "Times New Roman" not in span["font"]:
                            continue
                        text = span["text"]
                        if "Italic" in span["font"]:
                            text = f"<i>{text}</i>"
                        line_items.append(text)

                    buffer = "".join(line_items)
                    for item in item_pattern.findall(buffer):
                        if "<i>" in item:
                            item = item.replace("<i>", "").replace("</i>", "").strip()
                            if item and item != '+':
                                complementary_items.add(item)
                        elif "+" not in item:
                            item = item.strip()
                            if item and not item.isdigit():
                                core_items.add(item)
                        elif item.strip() != '+':
                            combined_items.add(item.strip())
    finally:
        # Release the file handle even if a page index is out of range
        doc.close()

    combined_split_items = set()
    for combined_item in combined_items:
        for part in combined_item.split(" + "):
            # Strip before the membership tests: the stored core/complementary
            # names are stripped, so a padded part would never match them
            part = part.strip()
            if not part or part == '+':
                continue
            if part not in core_items and part not in complementary_items:
                combined_split_items.add(part)

    return core_items, complementary_items, combined_split_items

# Source PDF and the inclusive page range to read. process_pdf uses these numbers
# directly as indices into the opened document (assumed 0-based).
file_path = "EML-20-eng.pdf"
start_page = 58
end_page = 61
core_items, complementary_items, combined_split_items = process_pdf(
    file_path, start_page, end_page
)

In [13]:
# Pool the three sets returned by process_pdf into one set of names
who_eml_pdf_scraped = set().union(core_items, complementary_items, combined_split_items)

In [18]:
set1 = who_eml_web_scraped
set2 = who_eml_pdf_scraped

# Lower-case each collection so the comparison ignores case
set1_lowered = {elem.lower() for elem in set1}
set2_lowered = {elem.lower() for elem in set2}

# Names found in one source but not in the other
diff1 = set1_lowered.difference(set2_lowered)
diff2 = set2_lowered.difference(set1_lowered)

print("{} elements in set1 but not set2: {}".format(len(diff1),diff1))
print("{} elements in set2 but not set1: {}".format(len(diff2),diff2))

117 elements in set1 but not set2: {'scopolamine', 'stavudine', 'tetanus antitoxin', 'prostaglandin', 'nitroprusside', 'vitamin k (menadione, phytomenadione, phytonadione)', 'cefalexin (cephalexin)', 'vitamin b1 (thiamine )', 'diphtheria anti-toxin', 'teicoplanin', 'fludrocortisone (fluohydrocortisone)', 'piperaquine', 'hydroxyurea (hydroxycarbamide)', 'diethylstilbestrol (stilboestrol)', 'zinc', 'faropenem', 'ferrous fumarate (iron)', 'vitamin a (retinol)', 'copper iud', 'tretinoin', 'pegylated interferon alpha 2a', "iodine (lugol's iodine)", 'fluoride', 'medroxyprogesterone', 'levonorgestrel (emergency contraception)', 'leuprolide (leuprorelin)', 'nitroglycerin (glyceryl trinitrate)', 'levonorgestrel - releasing intrauterine system', 'poliomyelitis vaccine (polio vaccine)', 'valproic acid (sodium valproate, valproate, valproate semisodium)', 'nitrous oxide (dinitrogen oxide)', 'dalteparin', 'lidocaine (lignocaine, xylocaine)', 'progesterone vagnal ring', 'ergonovine (ergometrine)', '

In [21]:
import jellyfish

# Rank every (diff1, diff2) pair by name similarity so near-matches between the
# two sources can be reviewed by eye.

# jellyfish 1.0 removed `jaro_winkler`; use the current name when it exists and
# fall back to the old one on older releases.
_jaro_winkler = getattr(jellyfish, "jaro_winkler_similarity", None) or jellyfish.jaro_winkler


def _name_before_parenthesis(name):
    """Return the part of `name` before the first "(", stripped of whitespace."""
    return name.split("(")[0].strip()


# Compute each diff2 prefix once instead of once per diff1 element. Both sets are
# iterated in sorted order so equal scores print in the same order on every run.
diff2_first_parts = {elem2: _name_before_parenthesis(elem2) for elem2 in sorted(diff2)}

similarity_scores = []
for elem1 in sorted(diff1):
    elem1_first = _name_before_parenthesis(elem1)
    for elem2, elem2_first in diff2_first_parts.items():
        if elem1_first == elem2_first:
            # Identical text before the parenthesis counts as a perfect match
            score = 1.0
        else:
            # Otherwise score the full strings, parenthesised synonyms included
            score = _jaro_winkler(elem1, elem2)
        similarity_scores.append((elem1, elem2, score))

# Sort similarity scores in descending order
sorted_scores = sorted(similarity_scores, key=lambda x: x[2], reverse=True)

# Print ranking of most similar elements
for elem1, elem2, score in sorted_scores:
    print(f"{elem1} vs. {elem2}: {score:.2f}")


cefalexin (cephalexin) vs. cefalexin: 1.00
fludrocortisone (fluohydrocortisone) vs. fludrocortisone: 1.00
poliomyelitis vaccine (polio vaccine) vs. poliomyelitis vaccine: 1.00
valproic acid (sodium valproate, valproate, valproate semisodium) vs. valproic acid (sodium valproate): 1.00
nitrous oxide (dinitrogen oxide) vs. nitrous oxide: 1.00
lidocaine (lignocaine, xylocaine) vs. lidocaine: 1.00
calcium folinate (folinic acid, levoleucovorin, leucovorin) vs. calcium folinate: 1.00
clomifene (clomiphene) vs. clomifene: 1.00
sodium hydrogen carbonate (sodium bicarbonate) vs. sodium hydrogen carbonate: 1.00
sulfasalazine (salazosulfapyridine) vs. sulfasalazine: 1.00
norethisterone (norethindrone) vs. norethisterone: 1.00
lumefantrine (benflumetol) vs. lumefantrine: 1.00
nicotine replacement therapy vs. nicotine replacement therapy (nrt): 1.00
benzylpenicillin (penicillin g) vs. benzylpenicillin: 1.00
anti-d immunoglobulin (rho) vs. anti-d immunoglobulin: 1.00
paracetamol (acetaminophen) vs. 