# Scrape British Cycling

First get rider ids from ranking lists
https://www.britishcycling.org.uk/ranking/regional/?rank_type=regional&hc=&region=4&choice=rider&year=2017&gender=M&rider_cat=16&resultsperpage=100

regions up to 30 or more (some tables are missing)
- SE: 4
- Yorkshire: 12
- South: 1
- Scotland HQ: 15

categories
- cat 2 - 16
- cat 3 - 28

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

regions = [1, 4, 7, 9, 11, 12, 13, 16, 17, 23, 24, 26, 27, 28, 29, 30]
categories = [16, 28]

In [2]:
region, category, year, sex = 4, 16, 2017, 'M'

url = 'https://www.britishcycling.org.uk/ranking/regional/?rank_type=regional&hc=&region={}&choice=rider&year={}&gender={}&rider_cat={}&resultsperpage=999'.format(region, year, sex, category)

r = requests.get(url)


In [3]:
soup = BeautifulSoup(r.text, "lxml")

In [4]:
options = soup.find_all('option',selected='selected')
options

[<option selected="selected" value="">Please select</option>,
 <option selected="selected" value="4">South East</option>,
 <option selected="selected" value="rider">Regional rider ranking</option>,
 <option selected="selected" value="2017">2017</option>,
 <option selected="selected" value="M">Male</option>,
 <option selected="selected" value="16">2nd</option>]

In [5]:
selected = [o.text for o in options]
info = [selected[s] for s in [1,3,4,5]]
info

['South East', '2017', 'Male', '2nd']

In [6]:
table = soup.find_all('tr', "events--desktop__row")

table


[<tr class="events--desktop__row">
 <th>Rank</th>
 <th>Rider</th>
 <th>Club Name</th>
 <th>Points</th>
 </tr>, <tr class="events--desktop__row">
 <td>1</td>
 <td><a href="/points?person_id=48528&amp;year=2017&amp;type=regional&amp;d=4">James Boyman</a></td>
 <td><a href="/clubpoints/?club_id=7259&amp;year=2017&amp;type=regional&amp;rid=4">Hoops Velo</a></td>
 <td>192</td>
 </tr>, <tr class="events--desktop__row">
 <td>2</td>
 <td><a href="/points?person_id=40167&amp;year=2017&amp;type=regional&amp;d=4">Adam Cotterell</a></td>
 <td><a href="/clubpoints/?club_id=7515&amp;year=2017&amp;type=regional&amp;rid=4">G.S.GREENWICH</a></td>
 <td>182</td>
 </tr>, <tr class="events--desktop__row">
 <td>3</td>
 <td><a href="/points?person_id=138253&amp;year=2017&amp;type=regional&amp;d=4">Tyler Lemmon</a></td>
 <td><a href="/clubpoints/?club_id=6566&amp;year=2017&amp;type=regional&amp;rid=4">TBW Bottecchia Wigmore RT</a></td>
 <td>181</td>
 </tr>, <tr class="events--desktop__row">
 <td>4</td>
 <td><

In [7]:
headers = [t.text for t in table[0].find_all('th')]
headers += ['RiderID', 'ClubID', 'Region','Year', 'Sex', 'Cat']
headers

['Rank',
 'Rider',
 'Club Name',
 'Points',
 'RiderID',
 'ClubID',
 'Region',
 'Year',
 'Sex',
 'Cat']

Need to start in row 1

In [8]:
t = table[1]
t

<tr class="events--desktop__row">
<td>1</td>
<td><a href="/points?person_id=48528&amp;year=2017&amp;type=regional&amp;d=4">James Boyman</a></td>
<td><a href="/clubpoints/?club_id=7259&amp;year=2017&amp;type=regional&amp;rid=4">Hoops Velo</a></td>
<td>192</td>
</tr>

In [9]:
recs = t.find_all('td') 
print([r.text for r in recs])

p = recs[1].a['href']
person_id = p[(1+p.find('=')):p.find('&')]
p = recs[2].a['href']
club_id = p[(1+p.find('=')):p.find('&')]

print(person_id,club_id)


['1', 'James\xa0Boyman', 'Hoops Velo', '192']
48528 7259


In [10]:
ranking = []
for t in table[1:]:
    recs = t.find_all('td') 
    p = recs[1].a['href']
    person_id = p[(1+p.find('=')):p.find('&')]
    p = recs[2].a['href']
    club_id = p[(1+p.find('=')):p.find('&')]
    ranking += [[r.text.replace(u'\xa0', u' ') for r in recs]+[person_id,club_id]+info]



In [11]:
df = pd.DataFrame(ranking,columns=headers)
df

Unnamed: 0,Rank,Rider,Club Name,Points,RiderID,ClubID,Region,Year,Sex,Cat
0,1,James Boyman,Hoops Velo,192,48528,7259,South East,2017,Male,2nd
1,2,Adam Cotterell,G.S.GREENWICH,182,40167,7515,South East,2017,Male,2nd
2,3,Tyler Lemmon,TBW Bottecchia Wigmore RT,181,138253,6566,South East,2017,Male,2nd
3,4,Philip Glowinski,VC Londres,160,34945,1390,South East,2017,Male,2nd
4,5,Steve Calland,Southdowns Bikes – Casco PET,158,50182,202,South East,2017,Male,2nd
5,6,Lewis Winfield,VC Londres,155,453292,1390,South East,2017,Male,2nd
6,7,Gary Brind,Southdowns Bikes – Casco PET,150,106672,202,South East,2017,Male,2nd
7,7,Tristan Grigalis,PMR,150,237720,6040,South East,2017,Male,2nd
8,9,Brindley Taylor,Crawley Wheelers,140,47862,861,South East,2017,Male,2nd
9,10,Rob Sharland,Paceline RT,129,182347,7036,South East,2017,Male,2nd


In [13]:
df.Points = df.Points.astype('int')
df.sort_values('Points', ascending=False)
df

Unnamed: 0,Rank,Rider,Club Name,Points,RiderID,ClubID,Region,Year,Sex,Cat
0,1,James Boyman,Hoops Velo,192,48528,7259,South East,2017,Male,2nd
1,2,Adam Cotterell,G.S.GREENWICH,182,40167,7515,South East,2017,Male,2nd
2,3,Tyler Lemmon,TBW Bottecchia Wigmore RT,181,138253,6566,South East,2017,Male,2nd
3,4,Philip Glowinski,VC Londres,160,34945,1390,South East,2017,Male,2nd
4,5,Steve Calland,Southdowns Bikes – Casco PET,158,50182,202,South East,2017,Male,2nd
5,6,Lewis Winfield,VC Londres,155,453292,1390,South East,2017,Male,2nd
6,7,Gary Brind,Southdowns Bikes – Casco PET,150,106672,202,South East,2017,Male,2nd
7,7,Tristan Grigalis,PMR,150,237720,6040,South East,2017,Male,2nd
8,9,Brindley Taylor,Crawley Wheelers,140,47862,861,South East,2017,Male,2nd
9,10,Rob Sharland,Paceline RT,129,182347,7036,South East,2017,Male,2nd


In [18]:
valid = []

for region in regions:

    url = 'https://www.britishcycling.org.uk/ranking/regional/?rank_type=regional&hc=&region={}&choice=rider&year=2017&gender=M&rider_cat=16&resultsperpage=999'.format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    options = soup.find_all('option',selected='selected')
    selected = [o.text for o in options]
    info = [selected[s] for s in [1,3,4,5]]
    print(i, info)
    table = soup.find_all('tr', "events--desktop__row")
    if len(table) > 0:
        valid += [i]
        headers = [t.text for t in table[0].find_all('th')]
        headers += ['RiderID', 'ClubID', 'Region','Year', 'Sex', 'Cat']
        print(table[:2])
    #    for t in table[1]:
    #        recs = t.find_all('td') 
    #        p = recs[1].a['href']
    #        person_id = p[(1+p.find('=')):p.find('&')]
    #        p = recs[2].a['href']
    #        club_id = p[(1+p.find('=')):p.find('&')]
    #        ranking += [[r.text.replace(u'\xa0', u' ') for r in recs]+[person_id,club_id]+info]
    #    print(ranking)

    


1 ['South', '2017', 'Male', '2nd']
[<tr class="events--desktop__row">
<th>Rank</th>
<th>Rider</th>
<th>Club Name</th>
<th>Points</th>
</tr>, <tr class="events--desktop__row">
<td>1</td>
<td><a href="/points?person_id=245248&amp;year=2017&amp;type=regional&amp;d=4">Michael Ford</a></td>
<td><a href="/clubpoints/?club_id=2933&amp;year=2017&amp;type=regional&amp;rid=1">VC St Raphael</a></td>
<td>157</td>
</tr>]
2 ['Dundee & District (Pre 2015)', '2017', 'Male', '2nd']
3 ['Regional rider ranking', 'Male', '2nd', '100']
4 ['South East', '2017', 'Male', '2nd']
[<tr class="events--desktop__row">
<th>Rank</th>
<th>Rider</th>
<th>Club Name</th>
<th>Points</th>
</tr>, <tr class="events--desktop__row">
<td>1</td>
<td><a href="/points?person_id=2089&amp;year=2017&amp;type=regional&amp;d=4">Joe Hill</a></td>
<td><a href="/clubpoints/?club_id=7021&amp;year=2017&amp;type=regional&amp;rid=4">Sussex Revolution Velo Club</a></td>
<td>193</td>
</tr>]
5 ['West of Scotland (Pre 2015)', '2017', 'Male', '2nd

In [19]:
valid

[1, 4, 7, 9, 11, 12, 13, 16, 17, 23, 24, 26, 27, 28, 29, 30]