# Moonboard Webscraper

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup

# from requests_html import AsyncHTMLSession

## Figuring out the right HTTP headers for BeautifulSoup

In [2]:
def soup_filter(html, tag=None, _class=None, multiple=True):
    if tag is None:
        return BeautifulSoup(html, 'html.parser')
    elif multiple:
        return BeautifulSoup(html, 'html.parser').findAll(tag, _class)
    elif not multiple:
        return BeautifulSoup(html, 'html.parser').find(tag, _class)

In [40]:
def holdset_angle_mapping(holdset, angle):

    if holdset == "MoonBoard 2016" and angle == "":
        return "1", "0"
    elif holdset == "MoonBoard Masters 2017" and angle == "40":
        return "15", "1"
    elif holdset == "MoonBoard Masters 2017" and angle == "25":
        return "15", "2"
    elif holdset == "MoonBoard Masters 2019" and angle == "40":
        return "17", "1"
    elif holdset == "MoonBoard Masters 2019" and angle == "25":
        return "17", "2"
    elif holdset == "Mini MoonBoard 2020" and angle == "40":
        return "19", "1"
    else:
        raise ValueError(holdset, angle)


## Payload Testing

In [None]:
URL = 'https://www.moonboard.com/Account/Login'
dashboard_URL = 'https://www.moonboard.com/Dashboard/Index'
test_URL = 'https://www.moonboard.com/Problems/View/358034/9circle'
# test_URL = 'https://restapimoonboard.ems-x.com/v1/_moonapi/problems/v3/19/40/1?v=8.3.4'

########################################
#
#  Enter your username and password here!
#
########################################

payload = {'user': '', 'pass' : ''}

with requests.Session() as session:
    headers = {'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) """
               """AppleWebKit/537.36 (KHTML, like Gecko) """
               """Chrome/58.0.3029.110 Safari/537.36"""}
    
    raw_token = session.get(URL, headers=headers)

    token = soup_filter(raw_token.text, tag='input', _class={'name': '__RequestVerificationToken'}, multiple=False)['value']
    
    payload = {'authenticity_token': token, 
            'user[login]': '',
            'user[password]': '',
            'user[locale]': 'en',
            'user[otp_attempt]': 'step_1',
            'utf8': '✓'}
    
    login = session.post(URL,
                         headers=headers,
                         data=payload)
    
    page = session.get(test_URL, headers=headers, cookies=session.cookies)
    my_soup = BeautifulSoup(page.text, features='lxml')
    print(my_soup)

In [67]:
json_str = str(script_arr[-1]).split('\n')[3].strip('\r')[30:-3]
json_str

'{"Method":"Feet follow hands","Name":"9CIRCLE","Grade":"6A+","UserGrade":"6A+","MoonBoardConfiguration":{"Id":1,"Description":"40° MoonBoard","LowGrade":null,"HighGrade":null},"MoonBoardConfigurationId":0,"Setter":{"Id":"24473fff-738f-4b75-a62c-1ac1f67037ce","Nickname":"episkeptis0","Firstname":"vvvv","Lastname":"kkkk","City":"athens","Country":"Greece","ProfileImageUrl":"/Content/Account/Images/default-profile.png?638412317637153659","CanShareData":true},"FirstAscender":false,"Rating":0,"UserRating":4,"Repeats":0,"Attempts":0,"Holdsetup":{"Id":17,"Description":"MoonBoard Masters 2019","Setby":null,"DateInserted":null,"DateUpdated":null,"DateDeleted":null,"IsLocked":false,"IsMini":false,"Active":false,"Holdsets":null,"MoonBoardConfigurations":null,"HoldLayoutId":0,"AllowClimbMethods":true},"IsBenchmark":true,"IsMaster":false,"IsAssessmentProblem":false,"ProblemType":null,"Moves":[{"Id":2043503,"Description":"B5","IsStart":true,"IsEnd":false},{"Id":2043504,"Description":"A9","IsStart":

In [91]:
import json 

json_dict = json.loads(json_str)
color_dict = {
    '0x0000FF' : 'hold',
    '0xFF0000' : 'end',
    '0x00FF00' : 'start'
}
[(x['Description'], color_dict[x['Color']]) for x in json_dict['Locations']]

[('A9', 'hold'),
 ('A14', 'hold'),
 ('A18', 'end'),
 ('B11', 'hold'),
 ('B5', 'start'),
 ('C12', 'hold'),
 ('D15', 'hold')]

In [83]:
list(json_dict.keys())[-1]

'DateTimeString'

## Getting the holds for a given problem URL

In [258]:
import json 
from time import sleep


def get_holds(test_URL: str, sleepTime=1., username: str = '', password: str = ''):
        """
        Returns a list of tuples `holds` for the url `test_URL`.
        
        Arguments
        test_url  -- the URL from which to strip holds. 
        sleepTime -- number of seconds to wait between HTTP requests.
        """
        sleep(sleepTime)
        URL = 'https://www.moonboard.com/Account/Login'
        dashboard_URL = 'https://www.moonboard.com/Dashboard/index'
        
        ########################################
        #
        #  Enter your username and password here!
        #
        ########################################
        payload = {'user': username, 'pass' : password}
        
        # Start an HTTP session
        with requests.Session() as session:
                headers = {'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) """
                        """AppleWebKit/537.36 (KHTML, like Gecko) """
                        """Chrome/58.0.3029.110 Safari/537.36"""} # spoofed headers
                
                raw_token = session.get(URL, headers=headers)

                # getting verification token from HTML
                token = soup_filter(raw_token.text, tag='input', _class={'name': '__RequestVerificationToken'}, multiple=False)['value']
                
                
                payload = {'authenticity_token': token, 
                        'user[login]': username,
                        'user[password]': password,
                        'user[locale]': 'en',
                        'user[otp_attempt]': 'step_1',
                        'utf8': '✓'}
                
                # post payload to log into Moonboard
                login = session.post(URL,
                                headers=headers,
                                data=payload)
                
                # scrape page using BeautifulSoup
                page = session.get(test_URL, headers=headers, cookies=session.cookies)
                my_soup = BeautifulSoup(page.text, features='lxml')
                
                # route holds are a JSON in a script tag!
                script_arr = my_soup.find_all('script', attrs={'type' : 'text/javascript'})
        
        # parse script tag to get holds as JSON
        json_str = str(script_arr[-1]).split('\n')[3].strip('\r')[30:-3]
        json_dict = json.loads(json_str)
        
        # Parse JSON to get holds in a desired format
        color_dict = {
        '0x0000FF' : 'hold',
        '0xFF0000' : 'end',
        '0x00FF00' : 'start'
        }
        return [(x['Description'], color_dict[x['Color']]) for x in json_dict['Locations']]

In [93]:
get_holds("https://www.moonboard.com/Problems/View/411086/kushnir")

[('B15', 'hold'),
 ('C3', 'start'),
 ('C18', 'end'),
 ('D8', 'hold'),
 ('H13', 'hold'),
 ('K11', 'hold')]

## Tables were scraped from local snapshots of Moonboard Dashboard

In [279]:
local_file = './tables/2019_25/2019_25_2.html'

In [210]:
def strip_local_page(filename: str):
    with open(filename) as f:
        soup = BeautifulSoup(f)
        benchMark_table = soup.find('div', {'id': 'grdBenchmarks'})
        rows = benchMark_table.find_all('tr')
        row_data = [row.find_all('td') for row in rows[1:]]
        y = [[cell.text for cell in row][1:2] for row in row_data]
        links = [row[0].find('a', href=True) for row in row_data]
        links = [link['href'] for link in links if link]
        return list(list(zip(*y))[0])[:-1], links

In [283]:
with open(local_file) as f:
    soup = BeautifulSoup(f)
    benchMark_table = soup.find('div', {'id': 'grdBenchmarks'})
    rows = benchMark_table.find_all('tr')
    row_data = [row.find_all('td') for row in rows[1:]]
    print(row_data)

[[<td class="" role="gridcell" style="width:50%;"><a href="https://www.moonboard.com/Problems/View/412516/geralt-of-rivia-25" target="_blank">GERALT OF RIVIA 25°</a></td>, <td class="" role="gridcell" style="width:15%;">6C+</td>, <td class="" role="gridcell" style="width:15%;"> 0 </td>, <td class="" role="gridcell" style="width:20%;">703</td>], [<td class="" role="gridcell" style="width:50%;"><a href="https://www.moonboard.com/Problems/View/412520/i-hate-gaston-25" target="_blank">I HATE GASTON 25°</a></td>, <td class="" role="gridcell" style="width:15%;">6C+</td>, <td class="" role="gridcell" style="width:15%;"> 0 </td>, <td class="" role="gridcell" style="width:20%;">703</td>], [<td class="" role="gridcell" style="width:50%;"><a href="https://www.moonboard.com/Problems/View/368172/jazz-hands" target="_blank">JAZZ HANDS</a></td>, <td class="" role="gridcell" style="width:15%;">6C+</td>, <td class="" role="gridcell" style="width:15%;"> 0 </td>, <td class="" role="gridcell" style="width

In [218]:
from time import sleep 

y, links = strip_local_page('./tables/2016_40.html')
get_holds(links[32])

[('C13', 'hold'),
 ('D18', 'end'),
 ('E6', 'start'),
 ('F10', 'hold'),
 ('H5', 'start')]

## Parsing local snapshot into `DataFrame`

In [234]:
import pandas as pd

def local_file_to_df(filename: str):
    df = pd.DataFrame(columns=['holds', 'grade'])
    y, links = strip_local_page(filename)
    df['grade'] = y
    holds = [get_holds(link) for link in links]
    df['holds'] = holds 
    return df

In [235]:
df_2020_40 = local_file_to_df(local_file)
df_2020_40

Unnamed: 0,holds,grade
0,"[(C12, end), (E3, start), (H11, hold), (H10, h...",7A+
1,"[(A12, end), (B12, end), (H9, hold), (I5, star...",7A
2,"[(A9, hold), (B12, end), (C11, hold), (C12, en...",6A+
3,"[(D9, hold), (E4, start), (G7, hold), (G12, ho...",6B
4,"[(B6, start), (G9, hold), (H12, end)]",7C+
5,"[(D12, end), (E10, hold), (F8, hold), (G6, sta...",6C
6,"[(A6, start), (G7, hold), (G11, hold), (I12, e...",7A+
7,"[(B12, end), (C9, hold), (C12, end), (D5, star...",6C
8,"[(A12, end), (D4, hold), (E10, hold), (E12, ho...",7A+
9,"[(A6, start), (B7, hold), (F6, hold), (G12, en...",6A+


## Saving `DataFrame` of holds to CSV to train ML models later

In [259]:
import pathlib 
import glob 

data_dir = './tables/'
path = pathlib.Path(data_dir)

for x in path.iterdir():
    if x.is_dir():
        df_filename = x.name 
        holds, grades = [], []
        
        year_path = pathlib.Path(x)
        for file in year_path.iterdir():
            filename = str(file)
            current_grades, links = strip_local_page(filename)
            grades.extend(current_grades)
            for link in links:
                holds.append(get_holds(link))
        
        df = pd.DataFrame(columns=['holds', 'grade'])
        df['grade'] = grades 
        df['holds'] = holds 
        
        df.to_csv(f'{df_filename}.csv')
    

IndexError: list index out of range

In [307]:
def get_data(year: int, angle: int):
    data_dir = f'./tables/{year}_{angle}/'
    path = pathlib.Path(data_dir)
    holds, grades = [], []
    for file in path.iterdir():
        if not file.is_dir() and file.suffix == '.html':
            print(f'\tGetting {file}')
            page_grades, page_links = strip_local_page(str(file))
            page_holds = []
            grades.extend(page_grades)
            for link in page_links:
                try:
                    page_holds.append(get_holds(link,sleepTime=0.5))
                except:
                    page_holds.append(None)
            holds.extend(page_holds)
    
    df = pd.DataFrame(columns=['holds', 'grades'])
    df['holds'], df['grades'] = holds, grades 
    return df

In [308]:
for (year, angle) in [(2017, 40), (2016, 40)]:
    print(f'Scraping year={year}, angle={angle}...')
    df = get_data(year, angle)
    df.to_csv(f'{year}_{angle}.csv')
    print(f'Finished year={year}, angle={angle}!')

Scraping year=2017, angle=40...
	Getting tables/2017_40/2017_40_3.html
	Getting tables/2017_40/2017_40_2.html
	Getting tables/2017_40/2017_40_5.html
	Getting tables/2017_40/2017_40_9.html
	Getting tables/2017_40/2017_40_8.html
	Getting tables/2017_40/2017_40_4.html
	Getting tables/2017_40/2017_40_7.html
	Getting tables/2017_40/2017_40_6.html
	Getting tables/2017_40/2017_40_1.html
Finished year=2017, angle=40!
Scraping year=2016, angle=40...
	Getting tables/2016_40/2016_40_5.html
	Getting tables/2016_40/2016_40_9.html
	Getting tables/2016_40/2016_40_8.html
	Getting tables/2016_40/2016_40_4.html
	Getting tables/2016_40/2016_40_3.html
	Getting tables/2016_40/2016_40_10.html
	Getting tables/2016_40/2016_40_11.html
	Getting tables/2016_40/2016_40_2.html
	Getting tables/2016_40/2016_40_12.html
	Getting tables/2016_40/2016_40_1.html
	Getting tables/2016_40/2016_40_13.html
	Getting tables/2016_40/2016_40_14.html
	Getting tables/2016_40/2016_40_7.html
	Getting tables/2016_40/2016_40_6.html
Fini