In [37]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from pathlib import Path
import re
import time
from tqdm import tqdm

In [38]:
# Home URL of the Health Service Area (HSA) profiles site, and the HSA names
# that are looked up on that page.
HSA_URL = 'http://communityhealth.phsa.ca/CHSAHealthProfiles'
HSA_REGIONS = [
    'Fraser Health',
    'Interior Health',
    'Island Health',
    'Northern Health',
    'Vancouver Coastal Health',
]

In [39]:
# Download the HSA home page and parse it into a BeautifulSoup tree.
# The timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# silently parsing an error page.
hsa_page = requests.get(HSA_URL, timeout=30)
hsa_page.raise_for_status()
soup = BeautifulSoup(hsa_page.content,'html.parser')

In [40]:
# HSAs whose data gets downloaded by this notebook.
# NOTE(review): 'Interior Health' appears in HSA_REGIONS but not here --
# confirm the omission is intentional.
HSA_REGIONS_DOWNLOAD = [
    'Fraser Health',
    'Island Health',
    'Northern Health',
    'Vancouver Coastal Health',
]

In [41]:
# Build {HSA name: {subregion label: absolute report URL}} from the parsed home page.
hsa_subregions_links = dict()
root = 'http://communityhealth.phsa.ca'
for region in tqdm(HSA_REGIONS):
    # Locate the anchor whose text matches the HSA name, then climb three
    # parents to reach the element that also holds the subregion list.
    region_anchor = soup.find('a',attrs={'class':'collapsed currentLHA'},text=re.compile(region))
    region_container = region_anchor.parent.parent.parent
    subregion_list = region_container.find('ul',attrs={'class':'col-xs-12'})
    # One <a> per list item: its text is the label, its href is site-relative.
    anchors = [item.find('a') for item in subregion_list.findAll('li')]
    hsa_subregions_links[region] = {a.text: root + a['href'] for a in anchors}

100%|██████████| 5/5 [00:00<00:00, 85.88it/s]


In [21]:
# Create the data folder (a sibling of the current working directory) and,
# inside it, one folder per HSA selected for download.
code_path = Path.cwd()
print(code_path)
data_path = code_path.parent.joinpath('data')
download_dirs = [data_path.joinpath(name) for name in HSA_REGIONS_DOWNLOAD]
# data_path comes first so each HSA folder's parent exists before it is created.
for folder in [data_path, *download_dirs]:
    folder.mkdir(exist_ok=True)

/Users/shreygrover/Desktop/Hackathons/Vancouver_DataJam_2021/src


In [15]:
# Visit each HSA subregion weblink and download the socioeconomic data csv.
# Store it in the respective HSA folder: every HSA gets its own Chrome session
# because the download directory is fixed through the session's preferences.
DOWNLOAD_WAIT_SECONDS = 4  # pause after each click before moving to the next page
for region in tqdm(HSA_REGIONS_DOWNLOAD):
    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory' : str(data_path/region)}
    chrome_options.add_experimental_option('prefs', prefs)
    # NOTE(review): the positional driver path and find_element_by_xpath below
    # are the older Selenium API; confirm the installed Selenium still accepts them.
    browser = webdriver.Chrome('/usr/local/bin/chromedriver',options=chrome_options)

    try:
        subregion2link = hsa_subregions_links[region]
        for subregion, subregion_link in subregion2link.items():
            print(subregion_link)
            browser.get(subregion_link)

            # Presumably the anchor with class 'button' triggers the csv download -- confirm.
            browser.find_element_by_xpath("//a[@class='button']").click()
            time.sleep(DOWNLOAD_WAIT_SECONDS)
    finally:
        # quit() ends the whole driver session (close() only closes the window),
        # and the finally clause guarantees it runs even if a page fails to load.
        browser.quit()

  0%|          | 0/4 [00:00<?, ?it/s]

http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Abbotsford%20Rural
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Agassiz_Harrison
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Aldergrove_Otter
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Brookswood_Murrayville
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burnaby%20Northeast
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burnaby%20Northwest
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burnaby%20Southeast
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burnaby%20Southwest
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Central%20Abbotsford
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/City%20of%20Langley
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Cloverdale
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAH

 25%|██▌       | 1/4 [03:12<09:38, 192.70s/it]

http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Alberni%20Valley_Bamfield
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Campbell%20River
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Campbell%20River%20Rural
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Cedar_Wellington
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Central%20Cowichan
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Central%20Saanich
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Chemainus
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Colwood
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Comox
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Comox%20Valley%20Rural
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Courtenay
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Cowichan%20Valley

 50%|█████     | 2/4 [06:21<06:20, 190.44s/it]

http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burns%20Lake%20North
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Burns%20Lake%20South
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Chetwynd
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Dawson%20Creek
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Fort%20Nelson%20Population%20Centre
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Fort%20St%20John
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Fraser%20Lake
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Houston
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Hudson's%20Hope
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Kitimat
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Mackenzie
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/McBride_Valemount
htt

 75%|███████▌  | 3/4 [08:29<02:41, 161.75s/it]

http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Blundell
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Bowen%20Island_Lions%20Bay
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Broadmoor
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Cedar%20Cottage
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Downtown%20Eastside
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Downtown%20Vancouver
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/East%20and%20West%20Cambie_Bridgeport
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Fairview
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Gibsons
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Gilmore_Shellmont_East_Hamilton
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSAHealthReport/Grandview-Woodland
http://communityhealth.phsa.ca/CHSAHealthProfiles/CHSA

100%|██████████| 4/4 [11:35<00:00, 173.84s/it]


In [42]:
def clean_subregions(val):
    """Split a subregion label of the form '<code> <name>' into its two parts.

    Surrounding whitespace is ignored and the code is separated from the name
    by any run of whitespace, so labels scraped with stray spaces or newlines
    still yield a clean code and name. Every '/' in the name is replaced by
    '_' (presumably so the name is safe to use in file names -- confirm).

    Args:
        val: Label string whose first whitespace-delimited token is the code.

    Returns:
        Tuple ``(code, mod_name)`` where ``mod_name`` is the name with '/'
        replaced by '_'.

    Raises:
        ValueError: If ``val`` does not contain both a code and a name.
    """
    parts = val.strip().split(None, 1)
    if len(parts) != 2:
        raise ValueError(f"expected a label like '<code> <name>', got {val!r}")
    code, name = parts
    mod_name = name.replace('/','_')
    return code, mod_name

# Flatten the per-HSA link tables into one {cleaned subregion name: code} lookup.
# Iterating a link table yields its keys, i.e. the raw subregion labels.
subregion2code = {
    mod_name: code
    for link_table in hsa_subregions_links.values()
    for code, mod_name in map(clean_subregions, link_table)
}

In [43]:
# Save the subregion -> code lookup as a two-column csv in a 'processed'
# folder located next to the data folder.
processed_data_path = data_path.parent.joinpath('processed')
processed_data_path.mkdir(exist_ok=True)

# Each (name, code) pair becomes one row.
subregion2code_rows = list(subregion2code.items())
subregion2code_df = pd.DataFrame(subregion2code_rows, columns=['subregion','code'])
subregion2code_df.to_csv(processed_data_path.joinpath('subregion2code.csv'), index=False)