In [1]:
import itertools
import os
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


In [2]:
# Directory that one_download() moves each finished export into, renamed per query.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_DIR = r'C:\Users\cwl8\Desktop\tmp'
# Directory that one_download() watches for the browser's downloaded file.
DOWNLOADS_PATH = r'C:\Users\cwl8\Downloads'
# File name that one_download() waits for inside DOWNLOADS_PATH after a query is sent.
DOWNLOAD_FILE_NAME = 'United States and Puerto Rico Cancer Statistics, 1999-2021 Incidence.txt'

In [3]:
# Page opened with driver.get() before options are scraped and queries are submitted.
URL = "https://wonder.cdc.gov/cancer-v2021.HTML"

In [4]:
# Chrome launch options shared by every webdriver.Chrome(...) call in this notebook.
chrome_options = Options()
# Run Chrome without a visible browser window.
chrome_options.add_argument("--headless")
# Disable Chrome's sandbox.
chrome_options.add_argument("--no-sandbox")
# NOTE(review): this flag targets /dev/shm usage on Linux; presumably a no-op with
# the Windows paths configured above — confirm whether it is still needed.
chrome_options.add_argument("--disable-dev-shm-usage")

In [5]:
# Element ids / class names / values used to locate controls on the page at URL.

# ids passed to get_select_options() / select_option(), one per query dimension.
MSA_SELECT_ID = "SD198.V3"
# id of the element clicked by select_msa_option().
MSA_OPTION_ID = "RO_locationD198.V3"
YEAR_SELECT_ID = "SD198.V1"
SEX_SELECT_ID = "SD198.V9"
AGE_GROUP_SELECT_ID = "SD198.V5"
ETHNICITY_SELECT_ID = "SD198.V6"
RACE_SELECT_ID = "SD198.V4"
CANCER_SITES_SELECT_ID = "SD198.V8"
# id of the element clicked by select_export_results_option().
EXPORT_RESULTS_ID = "export-option"
# Class name and button value that select_send() uses to find the submit button.
FOOTER_BUTTONS_CLASS = "footer-buttons"
SEND_BUTTON_VALUE = "Send"

In [6]:

# For each select id, the option value that select_option() clicks *before*
# clicking the requested option.
# NOTE(review): presumably this clears the page's default selection so only the
# requested option ends up selected — confirm against the live form.
DESELECT_DICT = {
    MSA_SELECT_ID: "10900",
    YEAR_SELECT_ID: "2002",
    SEX_SELECT_ID: "F",
    AGE_GROUP_SELECT_ID: "15-19",
    ETHNICITY_SELECT_ID: "2186-5",
    RACE_SELECT_ID: "2054-5",
    CANCER_SITES_SELECT_ID: "20030"
}

In [7]:
def get_select_options(select_id):
    """Locate an element by id and index its direct children.

    Uses the module-level ``driver``.

    Args:
        select_id: ``id`` attribute of the element to look up.

    Returns:
        A 3-tuple ``(select, value_options, name_values)``:

        * ``select`` - the located element.
        * ``value_options`` - maps each direct child's ``value`` property to
          the child element.
        * ``name_values`` - maps each child's visible text, with any
          parenthesised number such as ``(123)`` removed and surrounding
          whitespace stripped, to that child's ``value`` property. Children
          whose cleaned text collides keep the last one seen.
    """
    select = driver.find_element(By.ID, select_id)
    options = select.find_elements(By.XPATH, "./*")

    value_options = {}
    name_values = {}
    for option in options:
        # Every property/text read is a WebDriver round trip, so fetch the
        # value once and reuse it for both mappings.
        value = option.get_property('value')
        value_options[value] = option
        name_values[re.sub(r'\(\d+\)', '', option.text).strip()] = value
    return select, value_options, name_values

In [8]:
def select_msa_option():
    """Find the element whose id is ``MSA_OPTION_ID`` and click it."""
    driver.find_element(By.ID, MSA_OPTION_ID).click()

def select_export_results_option():
    """Find the element whose id is ``EXPORT_RESULTS_ID`` and click it."""
    driver.find_element(By.ID, EXPORT_RESULTS_ID).click()


def select_send():
    """Click the button whose ``value`` is ``SEND_BUTTON_VALUE`` inside the footer.

    The footer container is the first element with class
    ``FOOTER_BUTTONS_CLASS``, located through the module-level ``driver``.
    """
    footer_buttons = driver.find_element(By.CLASS_NAME, FOOTER_BUTTONS_CLASS)
    # The leading "." anchors the XPath to footer_buttons. A bare "//..."
    # expression is evaluated against the whole document even when called on
    # an element, which would ignore the footer lookup entirely.
    send_button = footer_buttons.find_element(By.XPATH, f'.//*[@value="{SEND_BUTTON_VALUE}"]')
    send_button.click()

In [9]:

def select_option(select_id, option_value):
    """Click one option of the element identified by ``select_id``.

    Three clicks are issued in order: the containing element, the option
    listed for ``select_id`` in ``DESELECT_DICT``, and finally the option
    whose value is ``option_value``.

    Raises:
        KeyError: if ``option_value`` or the ``DESELECT_DICT`` entry is not
            among the element's option values (nothing is clicked then).
    """
    container, options_by_value, _ = get_select_options(select_id)
    wanted = options_by_value[option_value]
    to_deselect = options_by_value[DESELECT_DICT[select_id]]
    # Click order matters: container, then the deselect option, then the target.
    for element in (container, to_deselect, wanted):
        element.click()


def one_download(msa, year, sex, age_group, ethnicity, race, cancer_site, timeout=600):
    """Submit one query on the already-loaded page and file the exported result.

    Selects one option per dimension, ticks the export option, clicks Send,
    waits for ``DOWNLOAD_FILE_NAME`` to appear in ``DOWNLOADS_PATH`` and moves
    it into ``DATA_DIR`` under a name built from the seven option values.

    Args:
        msa, year, sex, age_group, ethnicity, race, cancer_site: option
            *values* (not display labels) passed to ``select_option``.
        timeout: maximum number of seconds to wait for the download to
            appear; ``None`` waits indefinitely.

    Raises:
        TimeoutError: if the file has not appeared within ``timeout`` seconds.
        KeyError: propagated from ``select_option`` for an unknown value.
    """
    downloaded_path = os.path.join(DOWNLOADS_PATH, DOWNLOAD_FILE_NAME)
    target_path = os.path.join(DATA_DIR, f"{msa},_{year},_{sex},_{age_group},_{ethnicity},_{race},_{cancer_site}.txt")

    # A leftover export from an interrupted or failed earlier attempt would be
    # found immediately by the wait loop and saved under this query's name.
    if os.path.exists(downloaded_path):
        os.remove(downloaded_path)

    select_msa_option()
    select_option(MSA_SELECT_ID, msa)
    select_option(YEAR_SELECT_ID, year)
    select_option(SEX_SELECT_ID, sex)
    select_option(AGE_GROUP_SELECT_ID, age_group)
    select_option(ETHNICITY_SELECT_ID, ethnicity)
    select_option(RACE_SELECT_ID, race)
    select_option(CANCER_SITES_SELECT_ID, cancer_site)
    select_export_results_option()
    select_send()

    # Poll for the finished download, but give up eventually so one bad query
    # cannot hang the whole crawl.
    deadline = None if timeout is None else time.monotonic() + timeout
    while not os.path.exists(downloaded_path):
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"{DOWNLOAD_FILE_NAME!r} did not appear in {DOWNLOADS_PATH!r} within {timeout} seconds")
        time.sleep(0.1)

    # os.replace overwrites an existing target on every platform; os.rename
    # raises on Windows when the target exists, stranding the download.
    os.replace(downloaded_path, target_path)


In [10]:
# First browser session: used only to read the available options off the page.
# `driver` is the module-level handle that the helper functions above rely on.
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

In [11]:
# Click the MSA option first, then read each select's label -> value mapping
# (the third item returned by get_select_options).
select_msa_option()
msa_name_values = get_select_options(MSA_SELECT_ID)[2]
year_name_values = get_select_options(YEAR_SELECT_ID)[2]
sex_name_values = get_select_options(SEX_SELECT_ID)[2]
age_group_name_values = get_select_options(AGE_GROUP_SELECT_ID)[2]
ethnicity_name_values = get_select_options(ETHNICITY_SELECT_ID)[2]
race_name_values = get_select_options(RACE_SELECT_ID)[2]
cancer_sites_name_values = get_select_options(CANCER_SITES_SELECT_ID)[2]


In [12]:
# Close the option-scraping browser session; the crawl below starts its own.
driver.quit()

In [13]:
# Drop the aggregate ("all ...") entry from each dimension so that only the
# individual options are iterated. A missing label raises KeyError.
del msa_name_values['The United States']
del year_name_values["All Years"]
del sex_name_values["All Sexes"]
del age_group_name_values["All Ages"]
del ethnicity_name_values["All Ethnicities"]
del race_name_values["All Races"]
# Restrict the cancer sites to the two of interest.
cancer_sites_name_values = {
    site: cancer_sites_name_values[site]
    for site in ("Gallbladder", "Lung and Bronchus")
}


In [14]:
# Number of option combinations the crawl will attempt: the product of the
# sizes of all seven dimensions.
total_expected_files = (
    len(msa_name_values)
    * len(year_name_values)
    * len(sex_name_values)
    * len(age_group_name_values)
    * len(ethnicity_name_values)
    * len(race_name_values)
    * len(cancer_sites_name_values)
)

In [15]:
# Crawl: one export per combination of the seven query dimensions.
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

# itertools.product walks the label combinations in the same order as nesting
# the loops msa -> year -> sex -> age_group -> ethnicity -> race -> cancer_site
# (the right-most dimension varies fastest).
label_combinations = itertools.product(
    msa_name_values,
    year_name_values,
    sex_name_values,
    age_group_name_values,
    ethnicity_name_values,
    race_name_values,
    cancer_sites_name_values,
)

# `flag` is the 1-based position of the current combination.
for flag, labels in enumerate(label_combinations, start=1):
    msa, year, sex, age_group, ethnicity, race, cancer_site = labels
    try:
        print(f"{flag}/{total_expected_files}", ' - '.join(labels), end=' . . . ')
        one_download(
            msa_name_values[msa],
            year_name_values[year],
            sex_name_values[sex],
            age_group_name_values[age_group],
            ethnicity_name_values[ethnicity],
            race_name_values[race],
            cancer_sites_name_values[cancer_site],
        )
        print("Finished")
    except Exception as error:
        # Best-effort crawl: report the failed combination and keep going.
        print("Failed", error)
    # Reload the form so the next combination starts from the page defaults.
    driver.refresh()


1/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - American Indian or Alaska Native - Gallbladder . . . Finished
2/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - American Indian or Alaska Native - Lung and Bronchus . . . Finished
3/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - Asian or Pacific Islander - Gallbladder . . . Finished
4/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - Asian or Pacific Islander - Lung and Bronchus . . . Finished
5/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - Black or African American - Gallbladder . . . Finished
6/2910420 Akron, OH - 1999 - Female - < 1 year - Hispanic - Black or African American - Lung and Bronchus . . . 

KeyboardInterrupt: 

In [16]:
# Shut down the crawl's browser session (also needed after an interrupted run).
driver.quit()