Let's take all the results from the Severn Bridge Half and do some analysis! This is an example of basic scraping and data analysis.

In [1]:
# for easy import/export
import json
from datetime import time, timedelta, datetime

# for parsing HTML page
import requests
from bs4 import BeautifulSoup

# for analysis
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

In [2]:
%matplotlib
get_ipython().magic(u'pylab inline')
mpl.rcParams['figure.figsize'] = (14.0, 6.0)  # default size of plots
mpl.rcParams['axes.labelsize'] = 20
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['xtick.major.size'] = 10
mpl.rcParams['ytick.major.size'] = 10
mpl.rcParams['xtick.minor.size'] = 5
mpl.rcParams['ytick.minor.size'] = 5
mpl.rcParams['legend.framealpha'] = 0.6
mpl.rcParams['legend.fontsize'] = 16
mpl.rcParams['legend.scatterpoints'] = 1

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


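`%matplotlib` on its own only selects a plotting backend. It is the `%pylab` magic that imports * from pylab and numpy into the notebook namespace, which can shadow names imported earlier (for example `datetime`).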


# Scraping the data 

In [3]:
# Get the page HTML, and the table element
RESULTS_URL = r"https://www.fullonsport.com/event/5834/results"

# A timeout stops the cell hanging forever if the server never answers.
page = requests.get(RESULTS_URL, timeout=30)
# Fail loudly on a 4xx/5xx response instead of trying to parse an error page.
page.raise_for_status()

# lxml parser; 'html.parser' is the built-in alternative if lxml is not installed
soup = BeautifulSoup(page.text, 'lxml')
raw_table = soup.find(id='my-results')
if raw_table is None:
    # Without this guard the failure only shows up later as an opaque
    # "'NoneType' object has no attribute ..." error in a following cell.
    raise ValueError("No element with id 'my-results' found at %s" % RESULTS_URL)

In [4]:
# Get the table column names
columns = [str(x.string) for x in raw_table.find('thead').findAll('th')]
print 'Columns:', columns

Columns: ['Position', 'Race No', 'Firstname', 'Lastname', 'Club', 'Club Pos', 'Category', 'Cat Pos', 'Time', 'Chip Time', 'Status']


In [5]:
# Make a class to hold the runner info

class Runner(object):
    """One row of the results table.

    All constructor arguments are the raw cell strings scraped from the page.

    Attributes set on the instance:
        gun_position   -- int, finishing position based off gun time
        race_num       -- int, race number
        firstname, surname, name, club, category, status -- text, stored as given
        club_pos       -- int, or None when the cell is blank
        category_pos   -- int, or None when the cell is blank
        gun_datetime / gun_time   -- parsed gun time (datetime / time)
        chip_datetime / chip_time -- parsed chip time (datetime / time)
        gun_chip_delta -- timedelta, gun time minus chip time

    A blank time is stored as ``datetime.max`` so the attribute always exists;
    note that ``gun_chip_delta`` is then dominated by that sentinel and is not
    a meaningful duration.

    Raises:
        ValueError: if ``gun_position`` or ``race_num`` is not an integer, or a
            non-blank time does not match ``TIME_FORMAT``.
    """

    TIME_FORMAT = '%H:%M:%S'

    def __init__(self, gun_position, race_num, firstname, surname, club, club_pos,
                 category, category_pos, gun_time, chip_time, status):
        self.gun_position = int(gun_position)  # based off gun time
        self.race_num = int(race_num)
        self.firstname = firstname
        self.surname = surname
        self.name = "%s %s" % (self.firstname, self.surname)
        self.club = club
        # Blank position cells become None instead of crashing int('').
        self.club_pos = self._optional_int(club_pos)
        self.category = category
        # Previously accepted but silently dropped; keep it so it reaches the DataFrame.
        self.category_pos = self._optional_int(category_pos)
        # Gun time is the time difference between the gun firing and the runner's finish
        self.gun_datetime = self._parse_time(gun_time)
        self.gun_time = self.gun_datetime.time()
        # Chip time is the time difference between the runner crossing the start and finish lines
        self.chip_datetime = self._parse_time(chip_time)
        self.chip_time = self.chip_datetime.time()
        # difference between gun time and chip time
        self.gun_chip_delta = self.gun_datetime - self.chip_datetime
        self.status = status

    @staticmethod
    def _optional_int(value):
        """Return ``value`` as an int, or None when it is None or blank."""
        if value is None:
            return None
        text = value.strip() if hasattr(value, 'strip') else value
        if text == '':
            return None
        return int(text)

    @classmethod
    def _parse_time(cls, value):
        """Parse an ``HH:MM:SS`` string; a missing or blank entry gives ``datetime.max``."""
        text = value.strip() if value else value
        if not text:
            return datetime.max  # in case there is a missing entry
        return datetime.strptime(text, cls.TIME_FORMAT)

    def json(self):
        """Return the runner's attributes as a JSON object string.

        datetime, time and timedelta values are not JSON types, so they are
        written using their ``str()`` form.
        """
        return json.dumps(self.__dict__, default=str, sort_keys=True)

In [6]:
# Let's get all the rows!
runner_table = []
for row_tag in raw_table.find_all('tr'):
    # find_all also reaches rows nested in a <tbody>; header/footer rows are not runners.
    if row_tag.find_parent(['thead', 'tfoot']) is not None:
        continue
    # Use only the cell tags: iterating .children can also yield the whitespace text
    # nodes between cells, which would pair values with the wrong column names.
    cells = row_tag.find_all(['td', 'th'], recursive=False)
    if not cells:
        continue
    # get_text() keeps the text as unicode (str() raises on non-ASCII names under
    # Python 2), and strip=True makes a padded blank cell a genuinely empty string.
    row = {k: v.get_text(strip=True) for k, v in zip(columns, cells)}
    runner = Runner(gun_position=row['Position'], race_num=row['Race No'], firstname=row['Firstname'],
                    surname=row['Lastname'], club=row['Club'], club_pos=row['Club Pos'], category=row['Category'],
                    category_pos=row['Cat Pos'], gun_time=row['Time'], chip_time=row['Chip Time'], status=row['Status'])
    runner_table.append(runner)
    

AttributeError: 'module' object has no attribute 'strptime'

In [None]:
print runner_table[0].name

In [None]:
# Show every attribute stored on one runner (index 683 is an arbitrary spot check)
vars(runner_table[683])

# Analysis

So now we have all the data from the table in our objects, stored in the list `runner_table`. This isn't very convenient for analysis though - let's use a `pandas` DataFrame instead!

In [None]:
print runner_table[0].__dict__.keys()
df = pd.DataFrame(data=[x.__dict__ for x in runner_table], columns=runner_table[0].__dict__.keys())

In [None]:
num_runners = len(df.index)
print num_runners, 'runners'

One thing we need to recalculate is position. The default 'position' we scraped is based off the _gun_ time, not the _chip_ time (which is much fairer).

In [None]:
# Histogram of finishing position, split by sex.
# The scraped position is stored on each Runner as `gun_position`; the DataFrame
# has no 'position' column, so that is the name to plot.
pos = 'gun_position'
nbins = 30
lw = 2
x_range = [0, num_runners]
normed = False

fig, ax = plt.subplots()
# NOTE(review): the groups are picked by a letter appearing anywhere in the category
# code - confirm no female category code also contains an 'M' (or vice versa).
for letter, colour, label in [('M', 'blue', 'Male'), ('F', 'green', 'Female')]:
    in_group = df.category.str.contains(letter)
    df.loc[in_group, pos].plot(kind='hist', bins=nbins, color=colour, ax=ax, normed=normed,
                               histtype='step', label=label, lw=lw, range=x_range)
ax.set_xlabel('Gun position')
ax.set_xlim(right=num_runners)
ax.legend(loc=0);