In [1]:
import random
import csv
import bs4
import pandas as pd
import numpy as np
import time
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from datetime import datetime
from IPython.display import clear_output, display

<font color='black' size=6><u> **Navigate AllTrails.com and extract all CA hikes** </u></font>

Exploratory code for a <font color='green'>**single webpage**</font>.

In [29]:
option = webdriver.ChromeOptions()
option.add_argument(' — incognito')

In [78]:
browser = webdriver.Chrome(executable_path='/Users/stefanheinze/Desktop/chromedriver', options=option)
browser.get('https://www.alltrails.com/us/california')

In [84]:
load_more_button = browser.find_element_by_xpath('//*[@id="load_more"]/a/div/h3')
centered_button = browser.execute_script("arguments[0].scrollIntoView()", load_more_button)
time.sleep(0.25)
load_more_button.click()

In [31]:
# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="load_more"]/a/div/h3')))
except TimeoutException:
    print('Timed out waiting for page to load')
    browser.quit()

In [43]:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//*[@id="load_more"]/a/div/h3')))
browser.find_element_by_xpath('//*[@id="load_more"]/a/div/h3').click()

In [15]:
soup = bs4.BeautifulSoup(browser.page_source)
html = soup.find_all('h3', class_='name xlate-none short')
hike_names = [ hike.text for hike in html ]
hrefs = [ hike.find('a')['href'] for hike in html ]

Now we implement the code to functions and automatically  <font color='green'> **extract all hike urls in CA**</font>.

In [133]:
agent_list = []
with open('agent_list.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for r in reader:
        for agent in r:
            agent_list.append(agent)

def open_chrome():
    option = webdriver.ChromeOptions()
    option.add_argument(' — incognito')
    user_agent = random.choice(agent_list)
    option.add_argument(f'user-agent={user_agent}')
    browser = webdriver.Chrome(executable_path='/Users/stefanheinze/Desktop/chromedriver', options=option)
    
    return browser

In [134]:
def progress_display(current_value, iterable, start_time):
    current_index = iterable.index(current_value)
    current_time = datetime.now()
    delta_time = (current_time - start_time).total_seconds()
    clear_output(wait=True)
    print(str(current_index) + '/' + str(len(iterable)) + ' (' + str(round(current_index/len(iterable)*100, 1)) + '%)')  # use display(f) if you encounter performance issues
    print(str(int(delta_time)) + ' seconds have elapsed.')
    print('Expected time remaining is ' + str(int((len(iterable) - current_index) * (delta_time / current_index))) + ' seconds.')

In [135]:
def load_all_hikes():
    '''Loads the California section of AllTrails.com and clicks through
    the "Load More Hikes" button until it no longer appears (i.e. all
    hikes are loaded.
    
    Returns HTML soup.'''
    
    browser = open_chrome()
    browser.get('https://www.alltrails.com/us/california')
    
    # Wait 20 seconds for page to load
    timeout = 20
    try:
        WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="load_more"]/a/div/h3')))
    except TimeoutException:
        print('Timed out waiting for page to load')
        browser.quit()
    
    count = 0
    start_time = datetime.now()
    while True:
        try:
            WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//*[@id="load_more"]/a/div/h3')))
            load_more_button = browser.find_element_by_xpath('//*[@id="load_more"]/a/div/h3')
            browser.execute_script("arguments[0].scrollIntoView()", load_more_button)
            time.sleep(1)
            load_more_button.click()
            count += 24
            progress_display(count, range(8200), start_time=start_time)
            time.sleep(random.uniform(0,2))
        except Exception as e:
            print(e)
            break

    soup = bs4.BeautifulSoup(browser.page_source)
    
    return soup

In [136]:
def extract_hike_urls(soup):
    '''Extracts hike links from the fully loaded AllTrails.com California webpage.
    
    Returns hike urls in a list.'''
    
    hikes_html = soup.find_all('h3', class_='name xlate-none short')
    #hike_names = [ hike.text for hike in hikes_html ]
    hike_hrefs = [ hike.find('a')['href'] for hike in hikes_html ]
    hike_urls = [ 'https://www.alltrails.com' + href for href in hike_hrefs ]
    
    return hike_urls

In [137]:
def save_list(listname, filename):
    '''Saves list to csv, with each item in the same row separated by a comma.'''
    
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for item in listname:
            writer.writerow([str(item)])

In [139]:
soup = load_all_hikes()
hike_urls = extract_hike_urls(soup)
save_list(hike_urls, 'hike_urls.csv')

8184/8200 (99.8%)
3629 seconds have elapsed.
Expected time remaining is 7 seconds.
8208 is not in range


# Extract data from hike

## Single hike

In [453]:
browser = webdriver.Chrome(executable_path='/Users/stefanheinze/Desktop/chromedriver', options=option)
browser.get('https://www.alltrails.com/trail/us/california/potato-chip-rock-via-mt-woodson-trail')

In [454]:
# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="main-photo"]/div[1]')))
except TimeoutException:
    print('Timed out waiting for page to load')
    browser.quit()

In [457]:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH,'//*[@id="load_more"]/div[2]/h3')))
browser.find_element_by_xpath('//*[@id="load_more"]/div[2]/h3').click()

In [458]:
soup = bs4.BeautifulSoup(browser.page_source)

In [372]:
hike_name = soup.find('title').text

In [430]:
difficulty = soup.find('div', id='difficulty-and-rating').find('span').text

In [374]:
dist_elev_type = soup.find_all('div', class_='detail-data')
distance = dist_elev_type[0].text
elevation = dist_elev_type[1].text
hike_type = dist_elev_type[2].text

In [375]:
tag_data = soup.find_all('span', class_='big rounded active')
tags = []
for tag in tag_data:
    tags.append(tag.text)

In [376]:
description = soup.find('p', class_='xlate-google').text

In [377]:
reviewer_info = soup.find_all('span', itemprop='author')
user_name = [ item.text for item in reviewer_info ]
user_href = []
for item in reviewer_info:
    try:
        user_href.append(item.parent['href'])
    except:
        user_href.append(None)

In [441]:
reviews_html = soup.find_all('meta', itemprop='ratingValue')
user_rating = [ int(item['content']) for item in reviews_html[1:] ]

In [403]:
publish_html = soup.find_all('meta', itemprop='datePublished')
user_date = [ datetime.strptime(item['content'], '%Y-%m-%d') for item in publish_html ]

In [406]:
text_html = soup.find_all('p', itemprop='reviewBody')
user_text = [ item.text for item in text_html ]

Extracted items:
 - hike_name
 - hike_difficulty
 - hike_distance
 - hike_elevation
 - hike_type
 - hike_tags
 - hike_description
 - user_names
 - user_hrefs
 - user_texts
 - user_ratings
 - user_dates

## Automate all hikes

In [27]:
def load_hike_page(hike_link):
    '''
    Loads webpage for a given hike on AllTrails.com and loads all reviews by 
    clicking through the "Load More Reviews" button.
    
    Returns HTML of webpage.
    '''
    
    browser = open_chrome()
    browser.get(hike_link)

    # Wait 20 seconds for page to load
    timeout = 20
    try:
        WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="main-photo"]/div[1]')))
    except TimeoutException:
        print('Timed out waiting for page to load')
        browser.quit()

    # Load all reviews
    while True:
        try:
            WebDriverWait(browser, 1).until(EC.visibility_of_element_located((By.XPATH,'//*[@id="load_more"]/div[2]/h3')))
            load_more_button = browser.find_element_by_xpath('//*[@id="load_more"]/div[2]/h3')
            browser.execute_script("arguments[0].scrollIntoView()", load_more_button)
            time.sleep(1)
            load_more_button.click()
            time.sleep(random.uniform(0,1))
        except:
            break

    soup = bs4.BeautifulSoup(browser.page_source)
    browser.close()

    return soup

In [25]:
def extract_hike_data(soup):
    '''
    Extract the following data for a single hike on AllTrails.com:
    
       Variable (type)
     - hike_name (str)
     - hike_difficulty (str)
     - hike_distance (str)
     - hike_elevation (str)
     - hike_type (str)
     - hike_tags (list of str)
     - hike_description (str)
     - user_names (list of str)
     - user_hrefs (list of str)
     - user_texts (list of str)
     - user_ratings (list of int)
     - user_dates (list of datetimes)
     
     Returns a dict with the above keys.
    '''
    
    # Get hike name
    try:
        hike_name = soup.find('title').text
    except:
        hike_name = None
    
    # Get hike difficulty
    try:
        hike_difficulty = soup.find('div', id='difficulty-and-rating').find('span').text
    except:
        hike_difficulty = None
    
    # Get hike distance, elevation, and type
    try:
        dist_elev_type = soup.find_all('div', class_='detail-data')
        hike_distance = dist_elev_type[0].text
        hike_elevation = dist_elev_type[1].text
        hike_type = dist_elev_type[2].text
    except:
        hike_distance = None
        hike_elevation = None
        hike_type = None
    
    # Get tags associated with hike
    try:
        tag_data = soup.find_all('span', class_='big rounded active')
        hike_tags = []
        for tag in tag_data:
            hike_tags.append(tag.text)
    except:
        hike_tags = []
    
    # Get hike description
    try:
        hike_description = soup.find('p', class_='xlate-google').text
    except:
        hike_description = None
    
    # Get user names and hrefs (which serve as unique ID)
    try:
        reviewer_info = soup.find_all('span', itemprop='author')
        user_names = [ item.text for item in reviewer_info ]
        user_hrefs = []
        for item in reviewer_info:
            # Some users don't have an ID
            try:
                user_hrefs.append(item.parent['href'])
            except:
                user_hrefs.append(None)
    except:
        user_names = None
        user_hrefs = None
    
    # Get user ratings
    try:
        reviews_html = soup.find_all('meta', itemprop='ratingValue')
        user_ratings = [ int(item['content']) for item in reviews_html[1:] ]
    except:
        user_ratings = None
    
    # Get user review publish dates
    try:
        publish_html = soup.find_all('meta', itemprop='datePublished')
        user_dates = [ datetime.strptime(item['content'], '%Y-%m-%d') for item in publish_html ]
    except:
        user_dates = None
    
    # Get user review text
    try:
        text_html = soup.find_all('p', itemprop='reviewBody')
        user_texts = [ item.text for item in text_html ]
    except: user_texts = None
    
    # Create dictionary with all scraped information
    hike_info_dict = {}
    hike_info_dict['hike_name'] = hike_name
    hike_info_dict['hike_difficulty'] = hike_difficulty
    hike_info_dict['hike_distance'] = hike_distance
    hike_info_dict['hike_elevation'] = hike_elevation
    hike_info_dict['hike_type'] = hike_type
    hike_info_dict['hike_tags'] = hike_tags
    hike_info_dict['hike_description'] = hike_description
    hike_info_dict['user_names'] = user_names
    hike_info_dict['user_hrefs'] = user_hrefs
    hike_info_dict['user_texts'] = user_texts
    hike_info_dict['user_ratings'] = user_ratings
    hike_info_dict['user_dates'] = user_dates

    return hike_info_dict

In [187]:
def save_progress(hike_info_dict, filename):
    '''
    Saves scraped dictionary (hike_info_dict) to a csv file (filename) using pandas
    '''
    
    try:
        hike_df = pd.read_csv(filename, index_col=False)
        hike_df = hike_df.append(pd.DataFrame([hike_info_dict], columns=hike_info_dict.keys()))
        hike_df.to_csv(filename, index=False)
    except FileNotFoundError:
        hike_df = pd.DataFrame([hike_info_dict], columns=hike_info_dict.keys())
        hike_df.to_csv(filename, index=False)

In [186]:
def progress_display(current_value, iterable, start_time):
    current_index = iterable.index(current_value)
    current_time = datetime.now()
    delta_time = (current_time - start_time).total_seconds()
    clear_output(wait=True)
    print(str(current_index) + '/' + str(len(iterable)) + ' (' + str(round(current_index/len(iterable)*100, 1)) + '%)')  # use display(f) if you encounter performance issues
    print(str(int(delta_time)) + ' seconds have elapsed.')
    print('Expected time remaining is ' + str(int((len(iterable) - current_index) * (delta_time / current_index))) + ' seconds.')

In [188]:
def get_all_hikes_data(hike_urls, filename):
    '''
    Extract data for all hikes with urls in hike_links. Saves data to filename after
    every successful webpage scrape.
    '''
    
    start_time = datetime.now()
    for hike_url in hike_urls:
        try:
            soup = load_hike_page(hike_url)
            hike_info_dict = extract_hike_data(soup)
            save_progress(hike_info_dict, filename)
            progress_display(hike_url, hike_urls, start_time)
        except:
            print('Error! Last hike saved was ' + hike_info_dict['hike_name'] + '!')
            break

In [None]:
# get_all_hikes_data(hike_links, 'hike_data.csv')

## Make agent list

In [711]:
browser = webdriver.Chrome(executable_path='/Users/stefanheinze/Desktop/chromedriver', options=option)
browser.get('http://useragentstring.com/pages/useragentstring.php?name=All')

In [716]:
soup = bs4.BeautifulSoup(browser.page_source)
user_agents = [ agent.text for agent in soup.find_all('a') ][5:]

In [767]:
save_list(user_agents, 'agent_list.csv')

## example plot

In [6]:
hike_link = 'https://www.alltrails.com/trail/us/california/san-antonio-ski-hut'

In [37]:
soup = load_hike_page(hike_link)

In [125]:
hike_data_dict = extract_single_hike_data(soup)

TypeError: extract_single_hike_data() got an unexpected keyword argument 'index_col'

In [143]:
hike_data_dict.keys()

dict_keys(['hike_name', 'hike_difficulty', 'hike_distance', 'hike_elevation', 'hike_type', 'hike_tags', 'hike_description', 'user_names', 'user_hrefs', 'user_texts', 'user_ratings', 'user_dates'])

In [58]:
hike_data_dict['hike_name']

'  San Antonio Ski Hut - California\n | AllTrails'

In [None]:
test = extract_single_hike_data(soup)
test_df = pd.DataFrame(test['user_ratings'], columns=['user_ratings'])

ratings = test_df['user_ratings'].value_counts()
for index in range(6):
    if index not in ratings:
        ratings[index] = 0

ratings.sort_index().plot.bar(rot=0);
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Ratings on Potato Chip Rock Hike')
plt.savefig('ExampleDistRatings.png', dpi=300, bbox_inches='tight')

<font size=4 color='green'><u>Many Tests</u></font>  
This is a **test!**  
This is *another* test!  
And ***another!***

Question: This markdown is a...?  
(a) test  
(b) *test*  
(c) **test**  
(d) ***test***

It's obviously an ~~exam~~

...one last test...

`Test == Test`