In [2]:
# import standard libraries
from time import sleep
from typing import Text

import numpy as np
import pandas as pd

# import selenium libraries
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Web Scraper

Due to the lack of available data regarding general avalanche incidents, I had to create a web scraper to pull the data from the `Mountain Information Network`, which is part of Avalanche Canada. The records only go back to 2016, so my search range for the data will be from January 1, 2016 to July 30, 2021. The purpose of this data gathering is to get the date and location of each avalanche, which will later be used to gather the weather data in an attempt to create a model that can predict avalanches in an area.

In order to do this I used the Selenium library, which is able to create an automated web browser that is capable of navigating websites and pulling HTML elements. Due to the design of the website, the bot will have to click several elements to set the search parameters and then navigate through several pages to collect a list of links for each avalanche incident.

Once all of the URLs have been collected, the bot will then go to each URL and collect the location and date information.

## Setting up Selenium
Since Selenium is an automated web browser, it requires a web driver; for this notebook I will be using Chrome. Selenium needs to be pointed to the location of the driver. When the driver is initialized with `webdriver.Chrome`, Selenium opens a browser window, and `driver.get` then loads the requested page. I have directed Selenium to open the MIN website so I can begin to gather the required information.

In [3]:
# Location of the ChromeDriver executable.
# A raw string is used so the Windows backslashes are never treated as escape
# sequences ('\P' and '\c' are invalid escapes that newer Python versions
# warn about); the resulting path value is unchanged.
PATH = r'C:\Program Files (x86)\chromedriver.exe'

# Starts a Chrome session controlled by Selenium (this opens the browser window)
driver = webdriver.Chrome(PATH)

# Loads the Mountain Information Network submissions page in that session
driver.get('https://www.avalanche.ca/mountain-information-network/submissions')
# Prints the page title as a quick check that the page was reached
print(driver.title)

Avalanche Canada


In [4]:
# Expands the 'Report Type' dropdown so that its options can be clicked
dropdown = driver.find_element_by_class_name('Dropdown_Container__2ZbXz')
dropdown.click()

# Picks the 'Avalanche' entry from the expanded option list
avalanche_option = driver.find_element_by_xpath('//div[@title="Avalanche"]')
avalanche_option.click()

In [5]:
# Caption the calendar must show before the start date can be selected
TARGET_MONTH = 'JANUARY 2016'
# Safety cap on "previous month" clicks (20 years' worth) so that a caption
# that never matches cannot turn the loop below into an infinite loop
MAX_MONTH_CLICKS = 240

# Opens the calendar for the beginning of the date range
datebox = driver.find_element_by_class_name('DayPickerInput')
datebox.click()

# Locates the "previous month" navigation button of the calendar
date_change_button = driver.find_element_by_xpath("//div[@class='DayPicker-NavBar']/span[@class='DayPicker-NavButton DayPicker-NavButton--prev']")

# Reads the month/year caption currently shown by the calendar
datebox_month = driver.find_element_by_class_name("DayPicker-Caption").text

# Steps back one month at a time until the calendar shows the target month.
# The caption is re-read after every click, so the comparison is always made
# against the month that is actually displayed.
month_clicks = 0
while datebox_month != TARGET_MONTH:
    if month_clicks >= MAX_MONTH_CLICKS:
        raise RuntimeError(
            "Calendar never reached " + TARGET_MONTH
            + " (last caption seen: " + repr(datebox_month) + ")"
        )
    date_change_button.click()
    month_clicks += 1
    datebox_month = driver.find_element_by_class_name("DayPicker-Caption").text

# With the correct month displayed, selects the first day of that month
select_date_button = driver.find_element_by_xpath("//div[@class='DayPicker-Week']/div[@aria-label='Friday, January 1, 2016']")
select_date_button.click()

In [6]:
# Page numbers whose pagination buttons Selenium will click, in order
pages = range(2, 5)
list_pages = [*pages]

In [7]:
# Accumulates the href of every incident report that is found
report_url_list = []
# Seconds to wait for a results page to render before giving up on it
delay = 30

# For each entry in list_pages: wait for the results table, harvest the report
# links that are currently shown, then click the pagination button for that entry
for next_page in list_pages:

    try:
        # Blocks until the first table row exists; the returned element itself is not used
        myElem = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div/div/div/div/div/div/div/main/div[2]/table/tbody/tr[1]')
            )
        )

        # Every anchor whose href points at a MIN submission panel
        reports = driver.find_elements_by_xpath("//a[contains(@href, '/map?panel=mountain-information-network-submissions/')]")

        # Records the URL of each report on the current page
        report_url_list.extend(report.get_attribute('href') for report in reports)

        # Moves on by clicking the pagination button labelled with the next page number
        next_page_xpath = "//div[@class='Pagination_Container__2Z9O-']/button[contains(text(), '{}')]".format(next_page)
        driver.find_element_by_xpath(next_page_xpath).click()
        num_of_urls = len(set(report_url_list))

    # Reached when the table row does not appear within 'delay' seconds
    except TimeoutException:
        print ("Loading took too much time!")

In [9]:
# Counts the distinct report URLs that were collected

unique_report_urls = set(report_url_list)
len(unique_report_urls)

150

In [10]:
# Prints how many report URLs were collected in total (duplicates included)

total_report_urls = len(report_url_list)
print(total_report_urls)

150


In [11]:
# Wraps the collected URLs in a single-column DataFrame for readability

incident_links = pd.DataFrame(report_url_list, columns=['links'])

In [12]:
# Writes the URLs to disk as a CSV, leaving out the index column.
# Having them on disk means the kernel can be restarted without losing progress.

incident_links.to_csv(path_or_buf='data/avanlanche_links.csv', index=False)

In [13]:
# Reloads the saved links from disk (e.g. after a kernel restart)

avalanche_links_df = pd.read_csv(filepath_or_buffer='data/avanlanche_links.csv')

In [14]:
# Pulls the 'links' column out of the URL DataFrame as a plain Python list

test_avalanche = avalanche_links_df.loc[:, 'links'].tolist()

In [15]:
# Initializes an empty list for the avalanche location and time of occurrence
avalanche_location_time = list()

# XPaths of the report fields that are scraped from every report page
LOCATION_XPATH = "//*[@id='root']/div/div/div/main/section[2]/div/div[2]/dl/div[3]/dd"
DATE_XPATH = "//*[@id='root']/div/div/div/main/section[2]/div/div[2]/dl/div[2]/dd"

# Opens a new webdriver using chrome
driver = webdriver.Chrome(PATH)

try:
    # Iterates through all the links to the avalanche reports
    for link in test_avalanche:

        try:
            # Navigates selenium to the current page in the list
            driver.get(link)

            # Waits (up to 10 seconds) for the location field to be present
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, LOCATION_XPATH)))

            # Collects the location data from the report
            location = driver.find_element_by_xpath(LOCATION_XPATH).get_attribute('innerHTML')

            # Collects the time of occurrence from the report
            time_occured = driver.find_element_by_xpath(DATE_XPATH).get_attribute('innerHTML')

            # Stores the record as one dictionary per report
            avalanche_location_time.append({'location':location,'date_occured':time_occured})

        except TimeoutException:
            # A report that does not load in time is skipped; the run continues
            print ("Loading took too much time!")

finally:
    # Runs even when the loop aborts (for example when the browser becomes
    # unreachable), so the records scraped so far are written to disk instead
    # of being lost; the original exception still propagates afterwards.
    try:
        # Converts the information to a DataFrame
        avalanche_df = pd.DataFrame(avalanche_location_time)

        # Saves the information as a csv with an index
        avalanche_df.to_csv('data/avalanche.csv',index=True)
    finally:
        # Always closes the browser session so Chrome processes are not leaked
        try:
            driver.quit()
        except WebDriverException:
            # Best-effort cleanup: the browser may already be gone
            pass

WebDriverException: Message: chrome not reachable
  (Session info: chrome=91.0.4472.114)
