# Data collection London runs

### Note
The data files associated with this notebook are not included in this repository, largely because they are too large.  If you are interested in further information on this project please do get in touch via Github or Linkedin and I would be very pleased to discuss with you.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
import bs4
from bs4 import BeautifulSoup
import time
from time import sleep
from random import randint
import re
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from matplotlib.colors import ListedColormap
cmap = ListedColormap(sns.color_palette("husl", 3))

In [2]:
# define parameters for scraping
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
encoding = "gzip, deflate, br"
lang = "en-GB,en;q=0.9"
# headers = {'user-agent': user_agent}
headers = {'accept': accept, 'accept-encoding': encoding, 'accept-language': lang, 'user-agent': user_agent}
URL_event_template = "https://www.parkrun.org.uk/{}/results/eventhistory/"
URL_run_template = 'https://www.parkrun.org.uk/{}/results/weeklyresults/?runSeqNumber={}'

In [3]:
headers

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'accept-encoding': 'gzip, deflate, br',
 'accept-language': 'en-GB,en;q=0.9',
 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}

In [4]:
# load event list
london_links = pd.read_csv('london_links.csv')
event_names = london_links.event_name
event_names[0]

'allypally'

In [5]:
len(event_names)

52

In [15]:
# divide list into tranches for scraping
# first event run as test
event_group_1 = event_names[0]
# second partially scraped only (blocked)
event_group_2 = event_names[1]
event_group_3 = event_names[2:6]
event_group_4 = event_names[6]
# group 5 failed 'bromley'
event_group_5 = event_names[7]
event_group_6 = event_names[8:17] # all ok
event_group_7 = event_names[17:]
failed_events_7 = ['greenwich', 'tootingcommon']
failed_events_all = failed_events_7 + [event_group_5] + [event_group_2]
failed_events_all

['greenwich', 'tootingcommon', 'bromley', 'barking']

In [7]:
# Define function to extract latest run number from event history
def find_latest(name):
    URL_event_template = "https://www.parkrun.org.uk/{}/results/eventhistory/"
    r_EH = requests.get(URL_event_template.format(name), headers=headers)
    EH_soup = BeautifulSoup(r_EH.text, 'html.parser')
    
    latest_row = EH_soup.find_all('tr', attrs={'class': "Results-table-row"})[0]
    latest_run = latest_row.find('td', attrs={'class': "Results-table-td Results-table-td--position"}).text
    
    return latest_run

In [8]:
# Define function to parse data using Beautiful soup
def data_cleaner(r):
    event_soup = BeautifulSoup(r.text, 'html.parser')
    results_1 = event_soup.find_all('tr', attrs={'class':"Results-table-row"})
    
    date = event_soup.find_all('h3')[0].text.split(" | #")[0]
    event_no = event_soup.find_all('h3')[0].text.split(" | #")[1]
    event_name = event_soup.find_all('h1')[0].text
    
    positions = []
    names = []
    total_parkruns = []
    run_time = []
    event_PB = []
    clubs = []
    age_groups = []
    age_grades = []
    athlete_no = []
    
    for result in results_1:
        try:
            positions.append(result.find("td", attrs={"class":"Results-table-td Results-table-td--position"}).text)
        except:
            positions.append(np.nan)
        try:
            names.append(result.find("div", attrs={"class":"compact"}).text)
        except:
            names.append(np.nan)
        try:
            tp = re.split(r'\xa0\n', result.find("div", attrs={"class":"detailed"}).text)[0]
            total_parkruns.append(tp)
        except:
            total_parkruns.append(np.nan)
        try:
            time = result.find("td", attrs={"class":"Results-table-td--time"}).text
            if time[-2] == 'r':
                run_time.append(re.split(r'First Timer!', time)[0])
                event_PB.append(re.split(r'First Timer!', time)[1])
            elif time[-2] == 'B':
                run_time.append(re.split(r'New PB!', time)[0])
                event_PB.append(re.split(r'New PB!', time)[1])            
            else:
                run_time.append(re.split(r'PB\xa0', time)[0])
                event_PB.append(re.split(r'PB\xa0', time)[1])
        except:
            run_time.append(np.nan)
            event_PB.append(np.nan)
        try:
            club = result.find("td", attrs={"class":"Results-table-td Results-table-td--club"}).text
            if "\n" in club or "\xa0" in club:
                clubs.append(np.nan)
            else:
                clubs.append(club)
        except:
            clubs.append(np.nan)
        try:
            grading = result.find("td", attrs={"class":"Results-table-td Results-table-td--ageGroup"}).text
            if len(grading) < 5:
                age_groups.append(np.nan)
                age_grades.append(np.nan)
            else:
                age_groups.append(grading.split('%')[0][:7])
                age_grades.append(grading.split('%')[0][7:])
        except:
            age_groups.append(np.nan)
            age_grades.append(np.nan)
        try:
            ref = result.find_all(href=True)[0]
            athlete_no.append(str(ref).split('"')[1].split('=')[-1])
        except:
            athlete_no.append(np.nan)
            
    df = pd.DataFrame({'event_name': event_name,
                       'event_no': event_no,
                       'date': date,
                       'positions': positions,
                       'athlete_no': athlete_no,
                       'names': names,
                       'total_parkruns': total_parkruns,
                       'run_time': run_time,
                       'event_PB': event_PB,
                       'club': clubs,
                       'age_groups': age_groups,
                       'age_grades': age_grades})
    
    return df

In [16]:
failed_events_all

['greenwich', 'tootingcommon', 'bromley', 'barking']

In [17]:
event_list = failed_events_all
failed_events = []

for event in event_list:
    
    try:
        latest_run = int(find_latest(event))
        
        # define start to get max 2 years data
        if latest_run > 204:
            latest_run_list = list(range(latest_run+1))
            latest_run_start = latest_run_list[-205]
        else:
            latest_run_start = 1
        
        URL_run_template = 'https://www.parkrun.org.uk/{}/results/weeklyresults/?runSeqNumber={}'
        count = 0
        
        # keep track of current event name in case of error
        working_event = event
        
        for run in tqdm(list(range(latest_run_start, latest_run+1))):
            
            # get run data with requets
            URL_run = URL_run_template.format(event, run)
            r_run = requests.get(URL_run, headers=headers)
            
            df_run = data_cleaner(r_run)
    
            if count == 0:
                df = df_run
            else:
                df = df.append(df_run, ignore_index=True)
            
            count += 1
        
            # pause to slow scraping
            sleep(randint(3,5))
    
        file_name = 'all_runs_{}.csv'
        df.to_csv(file_name.format(event), index=False)
        
        # extra sleep between events
        sleep(randint(60,65))
        
    except:
        failed_events.append(event)
        sleep(randint(60,65))

 88%|████████▊ | 181/205 [20:41<02:44,  6.86s/it]
 80%|████████  | 164/205 [15:32<03:53,  5.69s/it]
  8%|▊         | 17/205 [02:41<29:42,  9.48s/it]
100%|██████████| 205/205 [20:47<00:00,  6.09s/it]


In [18]:
count, working_event

(205, 'barking')

In [22]:
failed_events_8 = ['greenwich', 'tootingcommon', 'bromley']
failed_events_8

['greenwich', 'tootingcommon', 'bromley']

In [13]:
failed_events_7 = ['greenwich', 'tootingcommon']

In [26]:
#
# for failed events try in reverse order, saving after each run
# the create 2nd file and start forwards, to converge on bug, saving as all_runs_{event}_2
#

In [27]:
event_list = failed_events_8
failed_events = []

for event in event_list:
    
    try:
        latest_run = int(find_latest(event))
        
        # define start to get max 2 years data
        if latest_run > 204:
            latest_run_list = list(range(latest_run+1))
            latest_run_start = latest_run_list[-205]
        else:
            latest_run_start = 1
        
        URL_run_template = 'https://www.parkrun.org.uk/{}/results/weeklyresults/?runSeqNumber={}'
        count = 0
        
        # keep track of current event name in case of error
        working_event = event
        
        for run in tqdm(list(range(latest_run_start, latest_run+1))):
            
            # get run data with requets
            URL_run = URL_run_template.format(event, run)
            r_run = requests.get(URL_run, headers=headers)
            
            df_run = data_cleaner(r_run)
    
            if count == 0:
                df = df_run
            else:
                df = df.append(df_run, ignore_index=True)
            
            count += 1
        
            # pause to slow scraping
            sleep(randint(3,5))
    
            file_name = 'all_runs_{}_2.csv'
            df.to_csv(file_name.format(event), index=False)
        
        # extra sleep between events
        sleep(randint(60,65))
        
    except:
        failed_events.append(event)
        sleep(randint(60,65))

 88%|████████▊ | 181/205 [15:09<02:00,  5.02s/it]
 80%|████████  | 164/205 [15:14<03:48,  5.58s/it]
  8%|▊         | 17/205 [01:41<18:45,  5.99s/it]


In [9]:
#
# single event scrape below
#

In [20]:
# remove comments to test
event = event_names[0]
latest_run = find_latest(event)
URL_run_template = 'https://www.parkrun.org.uk/{}/results/weeklyresults/?runSeqNumber={}'
count = 0

#for run in tqdm(list(range(1,int(latest_run)+1))):
    
    # get run data with requets
    URL_run = URL_run_template.format(event, run)
    #r_run = requests.get(URL_run, headers=headers)
    
    df_run = data_cleaner(r_run)
    
    if count == 0:
        df = df_run
    else:
        df = df.append(df_run, ignore_index=True)
    
    count += 1
    
    # pause to slow scraping
    sleep(randint(3,5))
    
file_name = 'all_runs_{}.csv'
#df.to_csv(file_name.format(event), index=False)

100%|██████████| 415/415 [32:23<00:00,  4.68s/it]


In [21]:
count

415

In [24]:
df.shape

(66662, 12)

In [2]:
# end