In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

firefoxdriver = "/usr/local/bin/geckodriver" # path to the chromedriver executable
os.environ["webdriver.firefox.driver"] = firefoxdriver


%config InlineBackend.figure_format = 'png' # ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’

%matplotlib inline
mpl.rcParams['figure.dpi']= 300

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

In [443]:
url = 'https://www.wta.org/go-hiking/hikes/wilderness-peak'
pacurl = 'https://www.wta.org/go-hiking/hikes/pacific-northwest-trail'
response = requests.get(url)
response2 = requests.get(pacurl)

page = response.text
page2 = response2.text

In [444]:
pac = BeautifulSoup(page2, 'lxml')

In [7]:
soup = BeautifulSoup(page, 'lxml')

In [546]:
headers = [
    'name',
    'region',
    'subregion',
    'votes',
    'rating',
    'length',
    'gain',
    'hpoint',
    'fee',
    'lat',
    'long',
    'trailhead1',
    'trailhead2',
    'author1',
    'author2',
    'countreports',
          ]

binary_headers = [
    'Wildflowers/Meadows',
    'Ridges/passes',
    'Wildlife',
    'Waterfalls',
    'Old growth',
    'Summits',
    'Good for kids',
    'Dogs allowed on leash',
    'Fall foliage',
    'Lakes',
    'Rivers',
    'Coast',
    'Mountain views',
    'Established campsites',
]

# Helper Functions

In [536]:
def get_soup_hike_stat(soup, stat):
    obj = soup.find(text=re.compile(stat))
    if not obj:
        return None
    else:
        return obj

# Retrieval Functions

In [526]:
def get_name(soup):
    return soup.find(class_='documentFirstHeading').text

def get_region(soup):
    regions = get_soup_hike_stat(soup, 'Location').find_next().text
    regions = regions.split(sep="--")
    return regions[0].strip()

def get_subregion(soup):
    regions = get_soup_hike_stat(soup, 'Location').find_next().text
    regions = regions.split(sep="--")
    return regions[1].strip()

def get_length(soup):
    length = get_soup_hike_stat(soup, 'Length').find_next().find('span').text
    return length.split()[0]

def get_gain(soup):
    return get_soup_hike_stat(soup, 'Gain').find_next().text

def get_hpoint(soup):
    return get_soup_hike_stat(soup, 'Highest Point').find_next().text

def get_fee(soup):
    return get_soup_hike_stat(soup, 'Parking Pass/Entry Fee').find_next().text

def get_lat(soup):
    return get_soup_hike_stat(soup, 'Co-ordinates').find_next().text

def get_long(soup):    
    return get_soup_hike_stat(soup, 'Co-ordinates').find_next().find_next().text

def get_trailhead1(soup):
    obj = soup.find(id='trailhead-details').find_all('p')[1].text
    if "weather" in obj:
        return np.nan
    return obj
def get_trailhead2(soup):
    obj = soup.find(id='trailhead-details').find_all('p')[2].text
    if "weather" in obj:
        return np.nan
    return obj

def get_author1(soup):
    return soup.find(class_='authorship sidebar-section').find('p').find_all('span')[0].text

def get_author2(soup):
    return soup.find(class_='authorship sidebar-section').find('p').find_all('span')[1].text

def get_countreports(soup):
    return soup.find(class_='ReportCount').text

def get_votes(soup):
    return soup.find(class_='rating-count').text.split()[0][1:]

def get_rating(soup):
    return soup.find(class_='current-rating').text.split()[0]

# Function to mark if certain categorical features are present

## Only run this after having created the dictionary for the current webpage

In [556]:
def append_features(soup, headers):
    '''
    soup: beautifulsoup object of web page
    headers: list of names of the particular wta.org trail page features we want.
    ----
    output is defaultdict
    '''
    data = defaultdict()
    for i in headers:
        try:
            # allows to call methods by inserting a string into the func name
            data[i] = [globals()["get_" + i](soup)]
        except:
            data[i] = np.nan
    return data

def append_binary_feats(soup, binary_headers, feat_dict):
    '''
    soup: beautifulsoup object of web page
    datadict: this is the dictionary used for each webpage to store the variables.
    Thus this function should be called only after that dict has been created.
    ----
    Product: this outcome appends binary categorical features present as a "1" or "0" to an existing
    dictionary
    '''
    bin_features = binary_headers.copy()
    for i in soup.find_all(class_='feature'):
        label = i.attrs['data-title']
        if label in bin_features:
            feat_dict[label] = 1
            bin_features.remove(label)
    for remainder in bin_features:
        feat_dict[remainder] = 0

In [479]:
# Rough Debugging
for i in headers:
    print('get_'+i+':', locals()["get_"+i](soup))

get_name: Wilderness Peak Loop
get_region: Issaquah Alps
get_subregion: Cougar Mountain
get_votes: 19
get_rating: 3.00
get_length: 4.0
get_gain: 1200
get_hpoint: 1598
get_fee: None
get_lat: 47.5093
get_long: -122.0904
get_trailhead1: Whittaker Wilderness Peak (#COUGAR-E4), Gombu Wilderness Cliffs (#)
get_trailhead2: King County Parks
get_author1: WTA Correspondents
get_author2: Alan Gibbs
get_countreports: 177


KeyError: 'get_Wildflowers/Meadows'

In [445]:
# Rough Debugging
print('get_name:', get_name(pac))
print('get_region:', get_region(pac))
print('get_subregion:', get_subregion(pac))
print('get_gain:', get_gain(pac))
print('get_hpoint:', get_hpoint(pac))

get_name: Pacific Northwest Trail
get_region: Puget Sound and Islands


IndexError: list index out of range

# Creating Dictionary

In [547]:
data = append_features(soup, headers)
data

defaultdict(None,
            {'name': nan,
             'region': nan,
             'subregion': nan,
             'votes': nan,
             'rating': nan,
             'length': nan,
             'gain': nan,
             'hpoint': nan,
             'fee': nan,
             'lat': nan,
             'long': nan,
             'trailhead1': nan,
             'trailhead2': nan,
             'author1': nan,
             'author2': nan,
             'countreports': nan})

In [575]:
# so for one webpage:
feat_dict = append_features(soup, headers)
append_binary_feats(soup, binary_headers, feat_dict)
# Create pandas DataFrame from dictionary
df_page = pd.DataFrame.from_dict(feat_dict)
df_page

Unnamed: 0,name,region,subregion,votes,rating,length,gain,hpoint,fee,lat,...,Old growth,Summits,Good for kids,Dogs allowed on leash,Fall foliage,Lakes,Rivers,Coast,Mountain views,Established campsites
0,Wilderness Peak Loop,Issaquah Alps,Cougar Mountain,19,3.0,4.0,1200,1598,,47.5093,...,1,1,1,1,1,0,0,0,0,0


In [582]:
# so for one webpage:
feat_dict = append_features(soup, headers)
append_binary_feats(soup, binary_headers, feat_dict)
# Create pandas DataFrame from dictionary
df_page = pd.DataFrame.from_dict(feat_dict)
df_page

Unnamed: 0,name,region,subregion,votes,rating,length,gain,hpoint,fee,lat,long,trailhead1,trailhead2,author1,author2,countreports,Wildflowers/Meadows,Ridges/passes,Wildlife,Waterfalls,Old growth,Summits,Good for kids,Dogs allowed on leash,Fall foliage,Lakes,Rivers,Coast,Mountain views,Established campsites
0,Wilderness Peak Loop,Issaquah Alps,Cougar Mountain,19,3.0,4.0,1200,1598,,47.5093,-122.0904,"Whittaker Wilderness Peak (#COUGAR-E4), Gombu ...",King County Parks,WTA Correspondents,Alan Gibbs,177,1,1,1,1,1,1,1,1,1,0,0,0,0,0


In [580]:
df = pd.concat([df, df_page], join='inner')
df

Unnamed: 0,name,region,subregion,votes,rating,length,gain,hpoint,fee,lat,long,trailhead1,trailhead2,author1,author2,countreports,Wildflowers/Meadows,Ridges/passes,Wildlife,Waterfalls,Old growth,Summits,Good for kids,Dogs allowed on leash,Fall foliage,Lakes,Rivers,Coast,Mountain views,Established campsites
0,Wilderness Peak Loop,Issaquah Alps,Cougar Mountain,19,3.0,4.0,1200.0,1598,,47.5093,-122.0904,"Whittaker Wilderness Peak (#COUGAR-E4), Gombu ...",King County Parks,WTA Correspondents,Alan Gibbs,177,1,1,1,1,1,1,1,1,1,0,0,0,0,0
0,Pacific Northwest Trail,Puget Sound and Islands,,2,3.5,1200.0,,7580,,48.6451,-122.3563,Various,,,,27,1,1,1,1,1,1,0,0,1,1,1,1,1,1
0,Pacific Northwest Trail,Puget Sound and Islands,,2,3.5,1200.0,,7580,,48.6451,-122.3563,Various,,,,27,1,1,1,1,1,1,0,0,1,1,1,1,1,1


In [5]:
# TESTING only - resets my dfs
df_page = None
df = None

# Combine into one scrape function

In [7]:
def scrape_page(soup, headers, binary_headers, df):
    feat_dict = append_features(soup, headers)
    append_binary_feats(soup, binary_headers, feat_dict)
    # Create pandas DataFrame from dictionary
    df_page = pd.DataFrame.from_dict(feat_dict)
    newdf = pd.concat([df, df_page], join='inner')
    return newdf

In [8]:
babydf = pd.DataFrame()

In [9]:
scrape_page(soup, headers, binary_headers, babydf)

NameError: name 'soup' is not defined

# Selenium

In [2]:
driver = webdriver.Firefox()
driver.get("https://www.wta.org/go-outside/hikes/")
time.sleep(1);

WebDriverException: Message: Reached error page: about:neterror?e=dnsNotFound&u=https%3A//www.wta.org/go-outside/hikes/&c=UTF-8&f=regular&d=We%20can%E2%80%99t%20connect%20to%20the%20server%20at%20www.wta.org.


In [None]:
# ROUND TRIP OR ONE WAY?!?!

In [None]:
# wtadf = pd.DataFrame()
count = 0
mamadf = pd.DataFrame()
while True:
    all_page_hikes = driver.find_elements_by_class_name('listitem-title')
    for hike in all_page_hikes:
        driver.click()
        time.sleep(1);
        driver.switch_to_window(driver.window_handles[1])
        # Insert all scraping code here
        # scrape_page(soup, headers, binary_headers, mamadf)

        # then close page and head back to list?!
        driver.close()
        count += 1
        # driver.switch_to_window(driver.window_handles[0])
        driver.switch_to.window('main')
    # then hit the next page button
    try:
        next_button = driver.find_element_by_class_name('next')
        next_button.click()
    except:
        print(f"End of program. Scraped {count} hike pages")
        print("See df: ")
        return mamadf
print(f"End of program. Scraped {count} hike pages")
print("See df: ")
return mamadf