In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import re
import time
import os
import numpy as np
import pandas as pd

In [2]:
DATA_DIR = r'C:\Users\cwl8\Desktop\tmp'
DOWNLOADS_PATH = r'C:\Users\cwl8\Downloads'
DOWNLOAD_FILE_NAME = 'United States and Puerto Rico Cancer Statistics, 1999-2021 Incidence.txt'
URL = "https://wonder.cdc.gov/cancer-v2021.HTML"

In [3]:
chrome_options = Options()
service = Service()
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs", {
        # block image loading
        "profile.managed_default_content_settings.images": 2,
    }
)
chrome_options.add_argument("--headless")  
chrome_options.add_argument("--disable-gpu")  
chrome_options.add_argument("--window-size=1920,1080")  
chrome_options.add_argument("--disable-extensions")  
chrome_options.add_argument("--no-sandbox")  
chrome_options.add_argument("--disable-dev-shm-usage") 


In [4]:
MSA_SELECT_ID = "SD198.V3"
MSA_OPTION_ID = "RO_locationD198.V3"
YEAR_SELECT_ID = "SD198.V1"
SEX_SELECT_ID = "SD198.V9"
AGE_GROUP_SELECT_ID = "SD198.V5"
ETHNICITY_SELECT_ID = "SD198.V6"
RACE_SELECT_ID = "SD198.V4"
CANCER_SITES_SELECT_ID = "SD198.V8"
EXPORT_RESULTS_ID = "export-option"
FOOTER_BUTTONS_CLASS = "footer-buttons"
SEND_BUTTON_VALUE = "Send"

In [5]:

DESELECT_DICT = {
    MSA_SELECT_ID: "10900",
    YEAR_SELECT_ID: "2002",
    SEX_SELECT_ID: "F",
    AGE_GROUP_SELECT_ID: "15-19",
    ETHNICITY_SELECT_ID: "2186-5",
    RACE_SELECT_ID: "2054-5",
    CANCER_SITES_SELECT_ID: "20030"
}

In [6]:
def get_select_options(select_id):
    select = driver.find_element(By.ID, select_id)
    options = select.find_elements(By.XPATH, "./*")
    value_options = {option.get_property('value'): option for option in options}
    name_values = {re.sub(r'\(\d+\)', '', option.text).strip(): option.get_property('value') for option in options}
    return name_values 

In [7]:
def select_msa_option():
    msa_option = driver.find_element(By.ID, MSA_OPTION_ID)
    msa_option.click()

In [8]:
driver = webdriver.Chrome(options=chrome_options, service=service)
driver.get(URL)

In [9]:
select_msa_option()
msa_name_values = get_select_options(MSA_SELECT_ID)
year_name_values = get_select_options(YEAR_SELECT_ID)
sex_name_values = get_select_options(SEX_SELECT_ID)
age_group_name_values = get_select_options(AGE_GROUP_SELECT_ID)
ethnicity_name_values = get_select_options(ETHNICITY_SELECT_ID)
race_name_values = get_select_options(RACE_SELECT_ID)
cancer_sites_name_values = get_select_options(CANCER_SITES_SELECT_ID)


In [10]:
driver.quit()

In [11]:
msa_name_values.pop('The United States')
year_name_values.pop("All Years")
sex_name_values.pop("All Sexes")
age_group_name_values.pop("All Ages")
ethnicity_name_values.pop("All Ethnicities")
race_name_values.pop("All Races")
cancer_sites_name_values = {"Gallbladder": cancer_sites_name_values["Gallbladder"], "Lung and Bronchus": cancer_sites_name_values["Lung and Bronchus"]}


In [12]:
total_expected_rows = len(msa_name_values) * len(year_name_values) * len(sex_name_values) * len(age_group_name_values) * len(ethnicity_name_values) * len(race_name_values) * len(cancer_sites_name_values)

In [14]:
tot = pd.DataFrame()

flag = 0
for msa in msa_name_values:
    for year in year_name_values:
        for sex in sex_name_values:
            for age_group in age_group_name_values:
                for ethnicity in ethnicity_name_values:
                    for race in race_name_values:
                        for cancer_site in cancer_sites_name_values:
                            row = pd.Series({
                                'msa': msa,
                                'year': year,
                                'sex': sex,
                                'age_group': age_group,
                                'ethnicity': ethnicity,
                                'race': race,
                                'cancer_site': cancer_site,
                            })
                            tot = pd.concat([tot, row])
                            if flag % 100_000 == 0:
                                print(flag)
                            flag += 1

tot['count'] = np.nan


0


KeyboardInterrupt: 

In [None]:
tot.to_csv(os.path.join(DATA_DIR, 'cancer_incidence.csv'))

In [14]:
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

In [15]:
def select_send():
    footer_buttons = driver.find_element(By.CLASS_NAME, FOOTER_BUTTONS_CLASS)
    # time.sleep(1)
    send_button = footer_buttons.find_element(By.XPATH, f'//*[@value="{SEND_BUTTON_VALUE}"]')
    # time.sleep(1)
    send_button.click()
    # time.sleep(1)

def select_option(select_id, option_value):
    select = driver.find_element(By.ID, select_id)
    option = driver.find_element(By.XPATH, f'//*[@value="{option_value}"]')
    deselect_option = driver.find_element(By.XPATH, f'//*[@value="{DESELECT_DICT[select_id]}"]')

    select.click()
    # time.sleep(1)
    deselect_option.click()
    # time.sleep(1)
    option.click()
    # time.sleep(1)


def one_download(msa, year, sex, age_group, ethnicity, race, cancer_site):
    select_msa_option()
    select_option(MSA_SELECT_ID, msa_name_values[msa])
    select_option(YEAR_SELECT_ID, year_name_values[year])
    select_option(SEX_SELECT_ID, sex_name_values[sex])
    select_option(AGE_GROUP_SELECT_ID, age_group_name_values[age_group])
    select_option(ETHNICITY_SELECT_ID, ethnicity_name_values[ethnicity])
    select_option(RACE_SELECT_ID, race_name_values[race])
    select_option(CANCER_SITES_SELECT_ID, cancer_sites_name_values[cancer_site])
    # select_export_results_option()
    select_send()

    try:
        driver.find_element(By.ID, 'error-messages') 
        count = np.nan
    except Exception as err:
        table = driver.find_element(By.CLASS_NAME, 'response-form')
        count = int(table.find_element(By.TAG_NAME, 'td').text)

    row = pd.Series({
        'msa': msa,
        'year': year,
        'sex': sex,
        'age_group': age_group,
        'ethnicity': ethnicity,
        'race': race,
        'cancer_site': cancer_site,
        'count': count
    })

    driver.get(URL)

    return row

    

In [None]:
driver.quit()

In [None]:
tot.head()