# Features to add
- Parallelize the requests to speed up scraping
- Handle repeated municipality names more intelligently
- Add documentation
- Change `.items()` to `.values()` in the `pd.read_csv` cell

In [69]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd

In [70]:
# Capture the current date once and split it into the parts used to build URLs.
today = datetime.now()
day_of_month, month, year = today.day, today.month, today.year

In [71]:
# First page of the station search (100 rows per page). EndYear follows the
# current year instead of a hard-coded 2022 so the searched year range always
# covers the Year/Month/Day values placed in the same query.
URL = (
    "https://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html"
    "?searchType=stnProv&timeframe=1&lstProvince=&optLimit=yearRange"
    f"&StartYear=1840&EndYear={year}"
    f"&Year={year}&Month={month}&Day={day_of_month}&selRowPerPage=100"
)
# Browser-like User-Agent sent with the search request.
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"}
# Filled by get_page_links: municipality name -> station info dict.
municipal_urls = {}
# Province codes whose station data should be downloaded.
chosen_provinces = ['BC']

In [72]:
URL

'https://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html?searchType=stnProv&timeframe=1&lstProvince=&optLimit=yearRange&StartYear=1840&EndYear=2022&Year=2022&Month=7&Day=7&selRowPerPage=100'

In [73]:
# Fetch the first page of the station search results.
r = requests.get(URL, headers=headers)
# Fail fast on an HTTP error instead of parsing an error page as if it were
# the result list.
r.raise_for_status()
soup = BeautifulSoup(r.content)

In [74]:
# getting next button
def is_last_page(soup, page):
    """Return True when ``page`` is the final page of the result listing.

    The pager is read from the ``<li>`` items inside the ``div.pull-left``
    element of ``soup``. The last item's link has an empty ``href`` when
    there is nothing further to navigate to; in that case the second-to-last
    item's link text is the highest page number, which is compared to
    ``page``. When the last link still has a target, the result is False.
    """
    pager_items = soup.find('div', {'class': 'pull-left'}).findAll('li')
    trailing_link = pager_items[-1].find('a')
    if trailing_link['href'] != '':
        return False
    highest_page = int(pager_items[-2].find('a').text)
    # Same test as comparing the first row number of both pages.
    return highest_page == page

In [75]:
def get_page_links(soup, start_index):
    """Record every station listed on one results page in ``municipal_urls``.

    Each ``<form>`` inside ``div.historical-data-results`` is treated as one
    station. Its hidden inputs supply the data ranges, station id and
    province; the last ``<option>`` of its year and month selects supplies
    the year and month stored for the station.

    soup: parsed results page.
    start_index: number added to the form's position on the page to build
        the ids of its ``Year<n>`` / ``Month<n>`` select elements.

    Entries are keyed by municipality name, so a later station with the same
    name replaces an earlier one. Forms with an empty name are skipped.
    """
    def _input_value(form, field_name):
        # Value of the form's <input> with the given name.
        return form.find('input', {'name': field_name})['value']

    def _last_option_value(form, select_id):
        # Value of the final <option> in the form's <select> with that id.
        return form.find('select', {'id': select_id}).findAll('option')[-1]['value']

    results = soup.find('div', {'class': 'historical-data-results'})
    for offset, form in enumerate(results.findAll('form')):
        hourly = _input_value(form, 'hlyRange')
        daily = _input_value(form, 'dlyRange')
        monthly = _input_value(form, 'mlyRange')
        station_id = _input_value(form, 'StationID')
        prov = _input_value(form, 'Prov')
        mun_name = form.find('div', {'class': 'col-lg-3'}).text
        row_number = offset + start_index
        latest_year = _last_option_value(form, f'Year{row_number}')
        latest_month = _last_option_value(form, f'Month{row_number}')
        station_url = (
            "https://climate.weather.gc.ca/climate_data/daily_data_e.html"
            f"?timeframe=2&Year={latest_year}&Month={latest_month}&Day={day_of_month}"
            f"&hlyRange={hourly}&dlyRange={daily}&mlyRange={monthly}"
            f"&StationID={station_id}&Prov={prov}"
            "&urlExtension=_e.html&searchType=stnProv&optLimit=yearRange"
            f"&StartYear=1840&EndYear={year}&selRowPerPage=100&Line=0&lstProvince="
        )
        if not mun_name:
            continue
        municipal_urls[mun_name] = {
            'url': station_url,
            'id': station_id,
            'prov': prov,
            'year': latest_year,
            'month': latest_month,
        }

In [78]:
# Follow-up pages reuse the first-page query (URL) so every page searches the
# same date, then add the paging parameters; the startRow value is appended
# per request below.
base_url = URL + "&txtCentralLatMin=0&txtCentralLatSec=0&txtCentralLongMin=0&txtCentralLongSec=0&startRow="
page = 1
# `soup` starts as the already-parsed first page; each iteration collects the
# current page's stations and then loads the next page.
while not is_last_page(soup, page):
    print(page)
    get_page_links(soup, (page - 1) * 100)
    # Pages hold 100 rows, so page N + 1 starts at row N * 100 + 1.
    next_url = base_url + str(page * 100 + 1)
    # Send the same headers as the first request.
    r = requests.get(next_url, headers=headers)
    # Stop on an HTTP error rather than parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content)
    page += 1
# The loop exits before processing the last page, so collect it here.
get_page_links(soup, (page - 1) * 100)
print(page)

88


In [87]:
municipal_urls['VANCOUVER UBC']

{'url': 'https://climate.weather.gc.ca/climate_data/daily_data_e.html?timeframe=2&Year=1995&Month=6&Day=7&hlyRange=|&dlyRange=1957-09-01|1995-06-30&mlyRange=1957-01-01|1995-06-01&StationID=903&Prov=BC&urlExtension=_e.html&searchType=stnProv&optLimit=yearRange&StartYear=1840&EndYear=2022&selRowPerPage=100&Line=0&lstProvince=',
 'id': '903',
 'prov': 'BC',
 'year': '1995',
 'month': '6'}

https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=10700&Year=2007&Month=11&Day=1&time=&timeframe=2&submit=Download+Data

Note: this takes about 18 minutes when run single-threaded.

In [81]:
# Download the bulk CSV for each collected station and keep the resulting
# DataFrames. An empty chosen_provinces list selects every station; otherwise
# only stations in the listed provinces are downloaded.
list_of_data = []
for info in municipal_urls.values():
    if chosen_provinces and info['prov'] not in chosen_provinces:
        continue
    csv_url = (
        "https://climate.weather.gc.ca/climate_data/bulk_data_e.html"
        f"?format=csv&stationID={info['id']}&Year={info['year']}&Month={info['month']}"
        "&Day=1&time=&timeframe=2&submit=Download+Data"
    )
    df = pd.read_csv(csv_url)
    # Append on every path: previously the "all provinces" case downloaded
    # each file but never stored it.
    list_of_data.append(df)

In [82]:
# Stack the per-station frames into one table (each frame keeps its own index).
final_df = pd.concat(objs=list_of_data)

In [86]:
# Write the combined table to a CSV file in the working directory.
output_path = 'BC_weather_data.csv'
final_df.to_csv(output_path)