In [2]:
!pip install selenium

Collecting selenium
  Using cached selenium-4.0.0-py3-none-any.whl (954 kB)
Collecting trio~=0.17
  Using cached trio-0.19.0-py3-none-any.whl (356 kB)
Collecting trio-websocket~=0.9
  Using cached trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting outcome
  Using cached outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Using cached wsproto-1.0.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Using cached h11-0.12.0-py3-none-any.whl (54 kB)
Installing collected packages: outcome, h11, wsproto, trio, trio-websocket, selenium
Successfully installed h11-0.12.0 outcome-1.1.0 selenium-4.0.0 trio-0.19.0 trio-websocket-0.9.2 wsproto-1.0.0


In [89]:
#Import relevant packages

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd 
import re

In [74]:
# USGS earthquake map page used as the scraping entry point.
# Per the original note: live updates for the last 24 hours, earthquakes with magnitude above 2.5.
MAIN_URL = "https://earthquake.usgs.gov/earthquakes/map"

In [86]:
def get_urls_from_main_page(MAIN_URL):
    """Collect event-page URLs from the earthquake updates page.

    Opens the page in a new Chrome session, clicks every ``mat-list-item``
    entry through JavaScript and, after each click, reads the href of the
    last ``<a>`` element currently on the page.

    Parameters:
        MAIN_URL: address of the earthquake updates page to open.

    Returns:
        list of href strings that contain 'eventpage', in the order they were
        found (duplicates are kept). Empty when no list items are present.
    """
    # Setting webdriver to Chrome
    driver = webdriver.Chrome()
    urls = []

    try:
        driver.get(MAIN_URL)
        elements = driver.find_elements(By.TAG_NAME, 'mat-list-item')

        if elements:
            # The first entry is clicked and given a longer pause, but its link
            # is not collected (unchanged from the original behaviour).
            # NOTE(review): confirm that skipping the first entry is intentional.
            driver.execute_script('arguments[0].click()', elements[0])
            sleep(3)

            for element in elements[1:]:
                driver.execute_script('arguments[0].click()', element)
                sleep(0.2)
                anchors = driver.find_elements(By.TAG_NAME, 'a')
                if not anchors:
                    continue
                href = anchors[-1].get_attribute('href')
                # to exclude hyperlinks not related to earthquake event (like tsunami gov);
                # href can be None for anchors without the attribute
                if href and 'eventpage' in href:
                    urls.append(href)
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    print(urls)
    return urls


In [87]:
# Smoke test: scrape the event URLs from the main page
urls = get_urls_from_main_page(MAIN_URL)

['https://earthquake.usgs.gov/earthquakes/eventpage/ak021e0ky9z3/pager', 'https://earthquake.usgs.gov/earthquakes/eventpage/hv72777902/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqgj/dyfi', 'https://earthquake.usgs.gov/earthquakes/eventpage/hv72777822/dyfi', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqgf/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqgd/', 'https://earthquake.usgs.gov/earthquakes/eventpage/hv72777717/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqfv/', 'https://earthquake.usgs.gov/earthquakes/eventpage/pr2021305002/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqf0/pager', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqez/dyfi', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqew/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqet/', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqeh/shakemap', 'https://earthquake.usgs.gov/earthquakes/eventpage/pr20

In [103]:
def get_detail_url(urls):
    """Prepare the "origin/detail" page URL for every event URL.

    For URLs containing ``/eventpage/<event id>`` everything after the event
    id (a trailing slash or a sub-page such as ``/pager`` or
    ``/ground-failure``) is replaced by ``/origin/detail``. Other URLs fall
    back to the simpler rule: append to a trailing slash, otherwise replace
    the last path segment.

    Parameters:
        urls: iterable of event URL strings; empty/None entries are skipped.

    Returns:
        list of unique detail URLs, in first-seen order.
    """
    # dict keys give de-duplication while keeping a reproducible order
    # (a set's order changes between interpreter runs for strings)
    detail_urls = {}
    for url in urls:
        if not url:
            continue

        event_match = re.match(r'(.*/eventpage/[^/?#]+)', url)
        if event_match:
            # keep the URL up to and including the event id
            detail_url = event_match.group(1) + '/origin/detail'
        elif url.endswith('/'):
            detail_url = url + 'origin/detail'
        else:
            detail_url = re.sub(r'/(\w*)$', '/origin/detail', url)

        detail_urls[detail_url] = None

    return list(detail_urls)

In [111]:
def _extract_event_record(soup, url):
    """Build one {label: value} record from the <dt>/<dd> elements of a parsed page."""
    record = {'url': url}  # first element will be url itself

    # Pair the i-th <dt> with the i-th <dd> and split each pair on its own, so
    # one label/value that does not follow the "uncertainty"/"±" convention
    # cannot shift every later column.
    for label_tag, value_tag in zip(soup.find_all("dt"), soup.find_all("dd")):
        label = label_tag.text.strip()
        value = value_tag.text.strip()

        if 'uncertainty' in label:
            # strip so column names carry no trailing whitespace
            base_label = label[:label.index("uncertainty")].strip()
            uncertainty_label = base_label + ' uncertainty'
            if '±' in value:
                split_at = value.index('±')
                record[base_label] = value[:split_at].strip()
                record[uncertainty_label] = value[split_at:].strip()
            else:
                # label announces an uncertainty but the value carries none
                record[base_label] = value
                record[uncertainty_label] = None
        else:
            record[label] = value

    return record


def get_event_details(detail_urls, driver=None):
    """
    Scrape the detail page of every earthquake and return one table row per event.

    Each page is loaded in the browser, parsed with BeautifulSoup ('lxml'
    parser) and its <dt> labels are paired with its <dd> values. Labels that
    contain "uncertainty" produce two columns: the value before '±' and the
    '± ...' part under "<label> uncertainty".

    Parameters:
        detail_urls: list of urls providing details for specific earthquakes.
        driver: optional Selenium WebDriver to reuse. When omitted, a Chrome
            session is started for the call and quit afterwards.

    Returns:
        pandas DataFrame where each row provides information on an individual
        earthquake; the first column, 'url', is the page the row came from.
        Empty DataFrame when detail_urls is empty.
    """
    if not detail_urls:
        # nothing to scrape: do not start a browser
        return pd.DataFrame([])

    owns_driver = driver is None
    if owns_driver:
        driver = webdriver.Chrome()

    list_events = []
    try:
        for url in detail_urls:
            driver.get(url)
            sleep(2.5)  # fixed pause after navigation before the page source is read
            soup = BeautifulSoup(driver.page_source, 'lxml')
            list_events.append(_extract_event_record(soup, url))
            print(f'Status: events from url {url} added to dict')
    finally:
        # only close a browser this function opened itself
        if owns_driver:
            driver.quit()

    return pd.DataFrame(list_events)

In [112]:
# Testing two functions: build the detail-page URLs, scrape them, preview the table

detail_urls = get_detail_url(urls)
event_table = get_event_details(detail_urls)
# bare last expression -> rich DataFrame rendering instead of plain-text print
event_table.head(10)

events from url https://earthquake.usgs.gov/earthquakes/eventpage/pr2021305000/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/ak021e0ky9z3/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/hv72777902/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqgj/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fq9q/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqew/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqcg/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqf0/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthquakes/eventpage/us7000fqgd/origin/detail added to dict
events from url https://earthquake.usgs.gov/earthqu

In [113]:
# TODO: put in main function
# `df` is not defined in the visible notebook; the scraped table built above is `event_table`.

event_table.to_csv('earthquakes.csv', index=False, encoding="utf-8-sig")