In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [117]:
# Accumulator for the scraped results: a dict of lists, one list per output
# column. Every runner row scraped later appends one value to each list, so
# the dict can be handed straight to DataFrame().
lst = {
    column_name: []
    for column_name in (
        'RACE_NUMBER',
        'FIRST_NAME',
        'SURNAME',
        'GENDER',
        'GENDER_POSITION',
        'CATEGORY',
        'CATEGORY_POSITION',
        'CLUB',
        '5K',
        '10K',
        '15K',
        '20K',
        'OVERALL_POSITION',
        'CHIP_TIME',
        'CHIP_POSITION',
    )
}


In [135]:
# 2022 Cardiff University Cardiff Half Marathon
#   Event 3829 October 2022 Cardiff Half Marathon (130 pages)
#   Event 3738 March   2022 Cardiff Half Marathon (118 pages)
#
# Download every results page of one event and append each runner's row to the
# per-column lists in `lst`.
#
# NOTE: `lst` is only ever appended to here, so re-running this cell without
# first re-running the cell that creates `lst` stores every runner twice.

lastPageNo  = 130      # use a small value (e.g. 5) for a quick trial run
eventNo     = 3829

# Without a timeout a single stalled connection would block the cell forever.
REQUEST_TIMEOUT_SECS = 30

# Position of each wanted value within a result row's <td> cells.
# Cells 9, 11, 13, 15 and 16 are deliberately not stored.
TD_INDEX = {
    'RACE_NUMBER'       : 0,
    'FIRST_NAME'        : 1,
    'SURNAME'           : 2,
    'GENDER'            : 3,
    'GENDER_POSITION'   : 4,
    'CATEGORY'          : 5,
    'CATEGORY_POSITION' : 6,
    'CLUB'              : 7,
    '5K'                : 8,
    '10K'               : 10,
    '15K'               : 12,
    '20K'               : 14,
    'OVERALL_POSITION'  : 17,
    'CHIP_TIME'         : 18,
    'CHIP_POSITION'     : 19,
}

for pageNo in range(1, lastPageNo + 1):

    URL = "https://www.tdleventservices.co.uk/event-results/events?event={eventNo}&sort=categorypos&page={pageNo}"\
           .format(eventNo=eventNo, pageNo=pageNo)

    page = requests.get(URL, timeout=REQUEST_TIMEOUT_SECS)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # In each page there is currently only one table (one tbody element).
    results = soup.find('tbody')
    if results is None:
        raise RuntimeError(
            'No results table found on page {p}: {u}'.format(p=pageNo, u=URL)
        )

    for tr_element in results.find_all('tr'):
        cells = [td.text for td in tr_element.find_all('td')]
        if not cells:
            # Row without any <td> cells carries no runner data.
            continue

        # Read every value before appending anything, so a malformed row
        # cannot leave the column lists with unequal lengths.
        row_values = [cells[idx] for idx in TD_INDEX.values()]
        for column_name, value in zip(TD_INDEX, row_values):
            lst[column_name].append(value)

    # Report progress only once the page has actually been processed.
    if (pageNo % 5) == 0:
        print('Processed {p} pages'.format(p=pageNo))
    

Processed 5 pages
Processed 10 pages
Processed 15 pages
Processed 20 pages
Processed 25 pages
Processed 30 pages
Processed 35 pages
Processed 40 pages
Processed 45 pages
Processed 50 pages
Processed 55 pages
Processed 60 pages
Processed 65 pages
Processed 70 pages
Processed 75 pages
Processed 80 pages
Processed 85 pages
Processed 90 pages
Processed 95 pages
Processed 100 pages
Processed 105 pages
Processed 110 pages
Processed 115 pages
Processed 120 pages
Processed 125 pages
Processed 130 pages


In [136]:
# Turn the scraped column lists into a DataFrame (one row per runner, one
# column per key of `lst`) and show its (rows, columns) shape.
df = pd.DataFrame(data=lst)
df.shape

(24679, 15)

In [137]:
# Data cleaning
# Keep only runners with a usable finish time and a full set of split times:
#   * drop rows whose CHIP_TIME is 'DQ' or 'DNF'
#     (presumably disqualified / did not finish - confirm against the site);
#   * drop rows where any split or the chip time is not clock-formatted,
#     i.e. does not contain a ':'.
df = df[~df['CHIP_TIME'].isin(['DQ', 'DNF'])]

for time_col in ('5K', '10K', '15K', '20K', 'CHIP_TIME'):
    # regex=False: ':' is matched as a literal character, not as a pattern.
    df = df[df[time_col].str.contains(':', regex=False)]

# Boolean filtering returns a slice of the original frame; take an explicit
# copy so that assigning columns in later cells does not raise
# SettingWithCopyWarning.
df = df.copy()


In [138]:

# Parse the split and finish time strings (HH:MM:SS) into datetime values,
# one column at a time.
for time_col in ('5K', '10K', '15K', '20K', 'CHIP_TIME'):
    df[time_col] = pd.to_datetime(df[time_col], format='%H:%M:%S')

In [139]:
# Convert times to datetime objects and
# calculate mins per km for the whole course and for each race section.

# Half-marathon distance in km. The whole-course pace previously divided by
# 21.075 (a typo) while the final-section pace used 21.0975; both now share
# this one constant.
HALF_MARATHON_KM = 21.0975
SECTION_KM       = 5      # length of each timed section (0-5, 5-10, 10-15, 15-20)
LAST_SPLIT_KM    = 20     # distance at which the last split time is recorded


def _elapsed_minutes(timestamps):
    """Return the clock part of a datetime Series as fractional minutes.

    Only the hour, minute and second components are used, so the date part
    that to_datetime attaches to a bare HH:MM:SS value is ignored.
    """
    return (timestamps.dt.hour * 60
            + timestamps.dt.minute
            + timestamps.dt.second / 60)


# Converting again is harmless if the columns are already datetimes; it keeps
# this cell runnable on its own.
for time_col in ('5K', '10K', '15K', '20K', 'CHIP_TIME'):
    df[time_col] = pd.to_datetime(df[time_col], format='%H:%M:%S')

# Cumulative elapsed minutes at the finish and at each split.
# (Columns are created in the same order as before so the CSV layout is unchanged.)
df['CHIP_TIME_MINS'] = _elapsed_minutes(df['CHIP_TIME'])
df['5K_MINS']        = _elapsed_minutes(df['5K'])
df['10K_MINS']       = _elapsed_minutes(df['10K'])
df['15K_MINS']       = _elapsed_minutes(df['15K'])
df['20K_MINS']       = _elapsed_minutes(df['20K'])

# Minutes spent inside each 5 km section (difference of consecutive splits).
df['0K_5K_SECTION_TIME']   = df['5K_MINS']
df['5K_10K_SECTION_TIME']  = df['10K_MINS'] - df['5K_MINS']
df['10K_15K_SECTION_TIME'] = df['15K_MINS'] - df['10K_MINS']
df['15K_20K_SECTION_TIME'] = df['20K_MINS'] - df['15K_MINS']

# Pace in minutes per km: overall, then per section.
df['min_per_km_course']   = df['CHIP_TIME_MINS']        / HALF_MARATHON_KM
df['min_per_km_0_5']      = df['5K_MINS']               / SECTION_KM
df['min_per_km_6_10']     = df['5K_10K_SECTION_TIME']   / SECTION_KM
df['min_per_km_11_15']    = df['10K_15K_SECTION_TIME']  / SECTION_KM
df['min_per_km_16_20']    = df['15K_20K_SECTION_TIME']  / SECTION_KM

# Final stretch: from the 20 km split to the finish line.
df['min_per_km_21_FINISH'] = (df['CHIP_TIME_MINS'] - df['20K_MINS']) / (HALF_MARATHON_KM - LAST_SPLIT_KM)


In [129]:
# Write the cleaned results, without the index column, to the Data folder.
# Raw string: the path contains backslashes, and '\D' / '\M' are invalid
# escape sequences in a normal string literal (SyntaxWarning on Python 3.12+).
# The resulting path is unchanged.
# NOTE(review): the file is named March_2022 while the scraping cell is set to
# event 3829, which its comment labels as the October race - confirm the
# intended file name before overwriting.
df.to_csv(r'..\Data\March_2022.csv', index=False)