In [2]:
# Web Scraper for https://global.essentialmeds.org/dashboard/countries/
# Created by James Hu with ChatGPT; Rethink Priorities, 2023

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# Configure the Firefox instance used by the scraper (no visible window).
# Every flag is passed to the browser in the order listed here.
options = Options()
for browser_flag in ('-headless', '--disable-notifications', '--no-sandbox', '--verbose'):
    options.add_argument(browser_flag)

# Scraper constants
NUM_COUNTRIES = 137  # country pages are addressed by the integers 1..NUM_COUNTRIES
SCROLL_HEIGHT = 38   # used by the scraping cell as the height of one product element

# Result containers, keyed by country name and filled in by the scraping cell
country_to_consistencies = {}
country_to_differences = {}

# One dashboard URL per country id
urls = [
    f'https://global.essentialmeds.org/dashboard/countries/{country_id}'
    for country_id in range(1, NUM_COUNTRIES + 1)
]

In [3]:
# Scrape, for every country page, the products consistent with the WHO list
# and the products that differ from it.

def collect_products(driver, scrollable_element, scroll_step, pause=0.2):
    """Scroll the product list step by step and gather every product name seen.

    The list is scrolled by ``scroll_step`` pixels at a time. After each step
    the page source is parsed and the text of every ``.css-t9ct1g`` element is
    added to the result. Scrolling stops at the first step that adds no new
    product name.

    Parameters:
        driver: the Selenium WebDriver showing the page.
        scrollable_element: the element whose ``scrollTop`` is advanced.
        scroll_step: number of pixels to advance per step.
        pause: seconds to sleep after each scroll before reading the page.

    Returns:
        A ``(products, soup)`` tuple: the set of stripped product names and the
        BeautifulSoup parse of the last page source that was read.
    """
    products = set()
    scroll_position = 0

    while True:
        # Scroll down (the first pass sets scrollTop to 0, i.e. the top of the list)
        driver.execute_script(f"arguments[0].scrollTop = {scroll_position};", scrollable_element)
        scroll_position += scroll_step
        time.sleep(pause)

        # Parse whatever is in the page after this scroll step
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        count_before = len(products)
        products.update(product.text.strip() for product in soup.select('.css-t9ct1g'))

        # Stop once a scroll step reveals nothing new
        if len(products) == count_before:
            return products, soup


# Set the scroll_step equal to the height of 10 product elements
scroll_step = SCROLL_HEIGHT*10

# Start the browser
driver = webdriver.Firefox(options=options)
try:
    wait = WebDriverWait(driver, 10)  # Initialize wait object

    for url in urls:
        # ITEMS CONSISTENT WITH WORLD HEALTH ORGANIZATION LIST
        # Open the webpage
        driver.get(url)

        # Locate the scrollable element
        scrollable_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.css-o6nvc9 > div:nth-child(2)')))
        consistencies, soup = collect_products(driver, scrollable_element, scroll_step)

        # The country name is read from the page heading
        country = soup.select('.css-147a4oj > h4:nth-child(1)')[0].text.strip()

        # ITEMS DIFFERENT FROM WORLD HEALTH ORGANIZATION LIST
        # Click the "Differences with the WHO list" button
        element_to_click = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div/div/div[1]/div[2]/div[3]/div[1]/div[2]')))
        element_to_click.click()
        time.sleep(0.2)

        # Locate the scrollable element again after switching views
        scrollable_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.css-o6nvc9 > div:nth-child(2)')))
        differences, _ = collect_products(driver, scrollable_element, scroll_step)

        # Record both results only once both passes have finished, so the two
        # dictionaries always hold the same countries even if the run is
        # interrupted part-way through a page.
        country_to_consistencies[country] = consistencies
        country_to_differences[country] = differences
finally:
    # Close the browser even when the loop fails or is interrupted; otherwise
    # the headless Firefox process is left running.
    driver.quit()

KeyboardInterrupt: 

In [56]:
import pandas as pd

# Build a product-by-country inclusion table (1 = listed, 0 = not listed)
# and export it to Excel.

# Create a set for the WHO EML: every product that at least one country lists
# as consistent with it. A WHO EML product that no scraped country lists will
# therefore be missing from this set.
who_eml = set.union(*country_to_consistencies.values())

# Combine the dictionaries: a country's full list is its consistent items
# plus the items that differ from the WHO list
all_products = {
    country: products | country_to_differences[country]
    for country, products in country_to_consistencies.items()
}

# Add the WHO EML column
all_products["WHO EML"] = who_eml

# Create a list of all unique products and sort it alphabetically
unique_products = sorted(set.union(who_eml, *country_to_differences.values()))

# Desired column order: product name, WHO EML flag, then countries A-Z
sorted_countries = sorted(col for col in all_products if col != "WHO EML")
columns = ["Product", "WHO EML"] + sorted_countries

# Collect every row first and build the DataFrame in one go.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
records = []
for product in unique_products:
    row_data = {col: int(product in product_set) for col, product_set in all_products.items()}
    row_data["Product"] = product
    records.append(row_data)

df = pd.DataFrame(records, columns=columns)

# Export DataFrame to an Excel file
df.to_excel("products_eml.xlsx", index=False)

In [11]:
# Analysis (DataFrame reimported from Excel)

import pandas as pd

# Load the inclusion table from the Excel file written by the export cell
file_path = 'products_eml.xlsx'
data = pd.read_excel(io=file_path, engine='openpyxl')

In [12]:
# Names of the two non-country columns in `data`
product_column = 'Product'
who_eml_column = 'WHO EML'

# Keep only the rows whose WHO EML flag is not 1
filtered_data = data.loc[data[who_eml_column].ne(1)]

# Every column other than the product and WHO EML columns is treated as a country
country_columns = [
    col for col in filtered_data.columns
    if col != product_column and col != who_eml_column
]

# Reshape to one row per (product, country) pair, then keep the pairs flagged 1
all_products = filtered_data.melt(
    id_vars=product_column,
    value_vars=country_columns,
    var_name='Country',
    value_name='Included',
).loc[lambda long_form: long_form['Included'] == 1]

# Number of countries listing each product, most frequent first
product_counts = all_products[product_column].value_counts().reset_index()
product_counts.columns = ['Product', 'Count']
sorted_products = product_counts.sort_values(by='Count', ascending=False)

# Write the frequency table to its own workbook
output_file_path = 'non-WHO_products_by_freq_on_NEMLs.xlsx'
sorted_products.to_excel(output_file_path, index=False, engine='openpyxl')

In [14]:
# Spot check: look up one product and report whether its WHO EML flag is 0
to_check = 'Glibenclamide (Glyburide)'
product_row = data.loc[data[product_column].eq(to_check)]
# Only the first matching row is inspected; raises IndexError when nothing matches
is_not_on_eml = product_row[who_eml_column].iat[0] == 0

print(f"Is '{to_check}' not on the WHO EML? {is_not_on_eml}")

Is 'Glibenclamide (Glyburide)' not on the WHO EML? True


In [28]:
# Check that all WHO EML products were covered in previous scrape
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

# Set up headless Firefox options
options = Options()
options.add_argument('-headless')
options.add_argument('--disable-notifications')
options.add_argument('--no-sandbox')
options.add_argument('--verbose')

# Initialize constants and dictionaries
SCROLL_HEIGHT = 38
products_to_inclusion = {}  # product name -> True when its row shows the check icon

# The medicines overview page to be scraped
url = "https://global.essentialmeds.org/dashboard/medicines"

# Start the browser
driver = webdriver.Firefox(options=options)
try:
    wait = WebDriverWait(driver, 10)  # Initialize wait object

    # Open the webpage
    driver.get(url)

    # Locate the scrollable element
    scrollable_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.css-o6nvc9 > div:nth-child(2)')))

    # Scroll through the internal scrollable element to load all products
    scroll_step = SCROLL_HEIGHT*10  # Set the scroll_step equal to the height of 10 product elements
    scroll_position = 0
    all_products = set()

    while True:
        # Scroll down
        driver.execute_script(f"arguments[0].scrollTop = {scroll_position};", scrollable_element)
        scroll_position += scroll_step
        time.sleep(0.2)

        # Get the page source after scrolling
        html = driver.page_source

        # Use BeautifulSoup to parse and extract the data
        soup = BeautifulSoup(html, 'html.parser')
        product_list = soup.select('.css-t9ct1g')
        on_eml = soup.select('.css-7s7c1t')

        # Add visible products to the all_products set and record their flag.
        # Names and flag cells are paired by position.
        # NOTE(review): assumes both selectors return one element per row, in
        # the same order - confirm against the page markup.
        current_product_count = len(all_products)
        for i, product in enumerate(product_list):
            product_name = product.text.strip()
            all_products.add(product_name)
            # A flag cell whose markup contains "check" counts as included
            products_to_inclusion[product_name] = 'check' in str(on_eml[i])

        # Break the loop if the product count doesn't change after scrolling
        if len(all_products) == current_product_count:
            break
finally:
    # Close the browser even when scraping fails or is interrupted; otherwise
    # the headless Firefox process is left running.
    driver.quit()

{'4-dimethylaminophenol': False, 'Abacavir': True, 'Abatacept': False, 'Abciximab': False, 'Abiraterone Acetate': False, 'Acamprosate': False, 'Acarbose': False, 'Acebutolol': False, 'Aceclidine': False, 'Aceclofenac': False, 'Acefylline heptaminol': False, 'Acemetacin': False, 'Acenocoumarol': False, 'Acetazolamide': True, 'Acetylcholine': False, 'Acetylcysteine': True, 'Acetylleucine': False, 'Acetylsalicylic acid': True, 'Acitretin': False, 'Acrivastine': False, 'Activated charcoal': True, 'Acyclovir': True, 'Adalimumab': False, 'Adapalene': False, 'Adefovir': False, 'Ademetionine': False, 'Adenoprosin': False, 'Adenosine': False, 'Adrenalone': False, 'Aescin': False, 'Agalsidase': False, 'Agomelatine': False, 'Albendazole': True, 'Albumin': False, 'Alcaftadine': False, 'Alclometasone': False, 'Alcuronium': False, 'Alemtuzumab': False, 'Alendronic Acid (Alendronate)': False, 'Alfentanil': False, 'Alfuzosin': False, 'Alglucosidase': False, 'Alimemazine': False, 'Aliskiren': False, 'A

In [29]:
# Tally the products whose inclusion flag is truthy
count = sum(1 for is_included in products_to_inclusion.values() if is_included)

print("Number of items in the WHO EML:", count)

Number of items in the WHO EML: 414


In [23]:
# Scratch check: does the text "check" occur anywhere in the string form of the list?
# NOTE(review): `whether_on_eml_list` is not assigned anywhere in the visible
# notebook, so this cell fails on a fresh kernel - confirm where it came from.
'check' in str(whether_on_eml_list)

True

In [24]:
# Display the list for inspection.
# NOTE(review): `whether_on_eml_list` is not assigned anywhere in the visible
# notebook, so this cell fails on a fresh kernel - confirm where it came from.
whether_on_eml_list

[<span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"><img src="/fonts/check.svg?b5cb3b7653ef05cd1f1f7c93fa070992"/></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>,
 <span class="css-7s7c1t"></span>]

In [31]:


import pandas as pd

# Cross-check the WHO EML flags from the medicines page against the
# "WHO EML" column of the reimported table.

# Mapping of product name -> inclusion flag
product_dict = products_to_inclusion

# Products flagged as included in the mapping
true_products_dict = {name for name, on_list in product_dict.items() if on_list}

# Products carrying a 1 in the table's "WHO EML" column
true_products_df = set(data.loc[data['WHO EML'].eq(1), 'Product'])

# Products present on one side but not on the other
products_only_in_dict = true_products_dict.difference(true_products_df)
products_only_in_df = true_products_df.difference(true_products_dict)

print("Products only in the dictionary:")
print(products_only_in_dict)

print("Products only in the DataFrame:")
print(products_only_in_df)

Products only in the dictionary:
{'Pyronaridine', 'Velpatasvir'}
Products only in the DataFrame:
set()
