The purpose of this notebook is to use the Mountain Project API to collect route data for later analysis. Route IDs are input in batches, and a number of useful features are provided by the API.

### Mount Google Drive for file access and storage

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


### Define functions for data collection

In [0]:
# credit to github.com/sailskisurf23 for the functions in this cell and a list of route ids

# This script is used to compile a database of useful route features by querying the Mountain Project API.

import requests, random, time, sys, json
from pymongo import MongoClient

# Input file with route ids:
loc_r = '/content/gdrive/My Drive/Python/Mtn_proj_model/data/route_ids.csv'

# Establish output destinations:
client = MongoClient('localhost:27017')
db = client.routesAPI_db

def get_API_routes(route_IDs):
    """
    Returns list of roudids that a given userid has ticked
    Parameters:
    route_IDs: list of str
    Returns:
    routes: list of json str
    """
    time.sleep(2+.5*random.random())
    # build URL
    base_url = 'https://www.mountainproject.com/'
    query_str = 'data/get-routes?'
    route_str = ','.join(route_IDs)
    credfile = '/content/gdrive/My Drive/Python/Mtn_proj_model/mtn_proj_cred_file.txt'
    with open(credfile) as f:
        key = f.read().strip()
    url = base_url + query_str + 'routeIds=' + route_str + '&key=' + key
    # parse json and return 'ticks'
    print('GET routes: {}'.format(route_str))
    r = requests.get(url)
    parsed_json = json.loads(r.content)
    routes = parsed_json['routes']
    return routes

def write_routes_db(routes):
    """
    Writes list of users to database
    Parameters:
    routes: list of json str
    """
    print('writing to db, routes: ', end='')
    for route in routes:
        print('{},'.format(route['id']), end='')
        db.routesAPI_db.insert_one(route)
    print('')
    pass

def chunks(list, n=50):
    """
    Generator function: break list into chunks of len(n)
    """
    for i in range(0, len(list), n):
        yield list[i:i+n]


def get_routes_data():
    # grab routes in chunks
    with open(loc_r) as f:
        routes = [line.strip() for line in f]
    batch_size = 100
    routes_chunks = chunks(routes,batch_size)
    for i,chunk in enumerate(routes_chunks):
        try:
            print('starting chunk {} of {}'.format(i+1,int(len(routes)/batch_size)))
            routes_data = get_API_routes(chunk)
            write_routes_db(routes_data)
        except:
            print('error while retreiving routes: {}'.format(chunk))
            time.sleep(10)

### Collect route data using API and save to csv file

In [0]:
# getRoutes data and store in mongodb
get_routes_data()

# open data in dataframe
routesdb = db.routesAPI_db
df = pd.DataFrame(list(routesdb.find()))

# save data to csv
df.to_csv('/content/gdrive/My Drive/Python/Mtn_proj_model/data/getRoutesData.csv', index=False)

### Reload API data as dataframe and initialize new data columns

In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import html5lib

# reload data into dataframe
df_r = pd.read_csv('/content/gdrive/My Drive/Python/Mtn_proj_model/data/route_ids.csv', names=['routeid'], dtype='str')

# initialize new columns to be filled by scraping route webpages
df_r['routeName'] = ''
df_r['scoreStars'] = np.nan
df_r['scoreStarsVotes'] = np.nan
df_r['r_type'] = ''
df_r['r_length'] = ''
df_r['FA_data'] = ''
df_r['page_views'] = np.nan
df_r['page_view_rate'] = ''
df_r['desc'] = ''
df_r['locInfo'] = ''
df_r['pro'] = ''
df_r['gradeYDS'] = ''
df_r['gradeFr'] = ''
df_r['gradeEwb'] = ''
df_r['gradeUIAA'] = ''
df_r['gradeZA'] = ''
df_r['gradeBr'] = ''
df_r['commentList'] = ''
df_r['apPopularityScore'] = ''
df_r['apScoreCount'] = ''
df_r['apScoreAvg'] = ''

### Reload most recent checkpoint in case of Colab timeout

In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import html5lib

# last index of most recently saved csv
last_idx = 60000

# reload dataframe and specify dtypes
df_r = pd.read_csv('/content/gdrive/My Drive/Python/Mtn_proj_model/data/scapedData_0-' + last_idx + '.csv', dtype='str')
df_r['scoreStars'] = pd.to_numeric(df_r['scoreStars'])
df_r['scoreStarsVotes'] = pd.to_numeric(df_r['scoreStarsVotes'])
df_r['page_views'] = pd.to_numeric(df_r['page_views'])

### Scrape additional data from route pages, add to dataframe, and save as csv

In [0]:
# define starting index; in case of colab timeout run above cell first
try: last_idx
except NameError: b = None
if last_idx is not None:
  idx_start = last_idx + 1
else:
  idx_start = 0

# Input file paths:
loc_r = '/content/gdrive/My Drive/Python/Mtn_proj_model/data/route_ids.csv'
base_url = 'https://www.mountainproject.com/'

# create temp dataframe from current row onward; might remove this
df_temp = df_r.loc[idx_start:]

# fill routeid rows with scraped data
for index, row in df_temp.iterrows():

  print('processing index {} of {}'.format(index, len(df_temp)-1))

  try:
    # build URLs
    routeid = row['routeid']
    url = base_url+'route/'+routeid
    # retrieve URLs
    time.sleep(2+random.random())
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')


    # route name
    routeName = soup.find('h1').get_text().strip()

    # collecting this value since it is different than the value from the API, which seems to use 'apScoreCount'
    scoreStarsData = soup.find('span', {'id':'starsWithAvgText-'+str(routeid)}).get_text().replace('\\n','').strip().split(' ')
    scoreStars = scoreStarsData[1]
    scoreStarsVotes = scoreStarsData[3]

    # route type and length; some routes have multiple types (e.g. Trad, Alpine) 
    detail = soup.find('table', {'class':'description-details'}).findChildren('td')
    detail_txt = detail[1].get_text().strip().split(', ')

    r_length = []
    r_type = []
    for item in detail_txt:
      if item[0].isdigit():
        r_length = item
      else:
        r_type.append(item)
    r_type = ', '.join(r_type)

    # FA data
    FA_data = detail[3].get_text().strip().split(', ') # some data has data, some doesn't; sort it out later; will probably not use this data anyway

    # page view and view rate
    page_views = detail[5].get_text().strip().split(' ')[0].replace(',','')
    page_view_rate = detail[5].get_text().strip().split(' ')[-1].replace(',','')

    # description/location/protection info
    desc = []
    locInfo = []
    pro = []
    desc_all = soup.findAll('div', {'class':'fr-view'})
    for item in desc_all:
      if 'Description' in item.previousSibling.previousSibling.get_text(): desc = item.get_text()
      if 'Location' in item.previousSibling.previousSibling.get_text(): locInfo = item.get_text()
      if 'Protection' in item.previousSibling.previousSibling.get_text(): pro = item.get_text()

    # route grades in various formats (collect them all!)
    gradeObj = soup.find('h2', {'class':'inline-block mr-2'}).findChildren('span')
    gradeObjTxt = []
    gradeYDS = []
    gradeFr = []
    gradeEwb = []
    gradeUIAA = []
    gradeZA = []
    gradeBr = []

    for i in range(len(gradeObj)):
      gradeObjTxt.append(gradeObj[i].get_text())

    gradeObjTxt = gradeObjTxt[::2]
    for item in gradeObjTxt:
      if 'YDS' in item: gradeYDS = item.split(' ')[0]
      elif 'French' in item: gradeFr = item.split(' ')[0]
      elif 'Ewbanks' in item: gradeEwb = item.split(' ')[0]
      elif 'UIAA' in item: gradeUIAA = item.split(' ')[0]
      elif 'ZA' in item: gradeZA = item.split(' ')[0]
      elif 'British' in item: gradeBr = item.split(' British')[0]

    # comments
    commentObj = soup.findAll('span', {"class" : 'comment-time'})
    commentList = [item.previousSibling.strip() for item in commentObj]

    # get hidden info (some numbers differ from info shown on page; not sure why)
    extraList = soup.findAll('script')[-1].get_text().split('\n')

    apPopularityScore = []
    apScoreCount = []
    apScoreAvg = []
    for line in extraList:
      if 'apPopularityScore' in line: apPopularityScore = line.strip().split("'")[1]
      if 'apScoreCount' in line: apScoreCount = line.strip().split("'")[1]
      if 'apScoreAvg' in line: apScoreAvg = line.strip().split("'")[1]

    # add scaped values to dataframe
    df_r.at[index, 'routeName'] = routeName
    df_r.at[index, 'scoreStars'] = scoreStars
    df_r.at[index, 'scoreStarsVotes'] = scoreStarsVotes
    df_r.at[index, 'r_type'] = r_type
    df_r.at[index, 'r_length'] = r_length
    df_r.at[index, 'FA_data'] = FA_data
    df_r.at[index, 'page_views'] = page_views
    df_r.at[index, 'page_view_rate'] = page_view_rate
    df_r.at[index, 'desc'] = desc
    df_r.at[index, 'locInfo'] = locInfo
    df_r.at[index, 'pro'] = pro
    df_r.at[index, 'gradeYDS'] = gradeYDS
    df_r.at[index, 'gradeFr'] = gradeFr
    df_r.at[index, 'gradeEwb'] = gradeEwb
    df_r.at[index, 'gradeUIAA'] = gradeUIAA
    df_r.at[index, 'gradeZA'] = gradeZA
    df_r.at[index, 'gradeBr'] = gradeBr
    df_r.at[index, 'commentList'] = commentList
    df_r.at[index, 'apPopularityScore'] = apPopularityScore
    df_r.at[index, 'apScoreCount'] = apScoreCount
    df_r.at[index, 'apScoreAvg'] = apScoreAvg

    # save df every so often in case Colab times out or crashes since disk is non-persistent
    if index % 1000 == 0:
      df_r.to_csv('/content/gdrive/My Drive/Python/Mtn_proj_model/data/scapedData_0-'+str(index)+'.csv', index=False)


  except:
    print('***failed on index # {}'.format(index))

processing index 79501 of 7390
processing index 79502 of 7390
processing index 79503 of 7390
processing index 79504 of 7390
processing index 79505 of 7390
processing index 79506 of 7390
processing index 79507 of 7390
processing index 79508 of 7390
processing index 79509 of 7390
processing index 79510 of 7390
processing index 79511 of 7390
processing index 79512 of 7390
processing index 79513 of 7390
processing index 79514 of 7390
processing index 79515 of 7390
processing index 79516 of 7390
processing index 79517 of 7390
processing index 79518 of 7390
processing index 79519 of 7390
processing index 79520 of 7390
processing index 79521 of 7390
processing index 79522 of 7390
processing index 79523 of 7390
processing index 79524 of 7390
processing index 79525 of 7390
processing index 79526 of 7390
processing index 79527 of 7390
processing index 79528 of 7390
processing index 79529 of 7390
processing index 79530 of 7390
processing index 79531 of 7390
processing index 79532 of 7390
processi