In [2]:
from requests import get
from time import strptime
import datetime as dt
import random
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import pickle
import resource
import sys

Below a list of user agents to rotate between, when sending out requests to scrape the data about the TDF starters:

In [3]:
user_agent_list = [
   #Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    #Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

In [4]:
#List with all the riders 
Rider_Names = []

The following function scrapes different features of a rider and returns a dataframe with the data. The features are for example Name, Weight, Height, Birthday, Results, Teams, etc.

In [5]:
def get_rider_info(rider_name,i):
  user_agent = rider_name
  RiderUrl = 'https://www.procyclingstats.com/rider/%s/' % rider_name
  headers = {'User-Agent': user_agent[i]}
  RiderResponse = get(RiderUrl, headers=headers)
  html_soup = BeautifulSoup(RiderResponse.text, 'html.parser')
  #Get rider name
  RiderName = html_soup.find_all("title")[0].contents[0]
  
  #Get rider weight
  RiderWeight = []
  try:
    RiderWeight = float(html_soup.find('b', text='Weight:').next_sibling.strip()[:-3])
  except AttributeError:
    RiderWeight = np.nan
  else:
    RiderWeight = float(html_soup.find('b', text='Weight:').next_sibling.strip()[:-3])
  
  #Get rider height
  RiderHeight = []
  try:
    RiderHeight = float(html_soup.find('b', text='Height:').next_sibling.strip()[:-2])
  except AttributeError:
    RiderHeight = np.nan
  else:
    RiderHeight = float(html_soup.find('b', text='Height:').next_sibling.strip()[:-2])
  
  #Get rider Birthday
  RiderBirthMonthYear = html_soup.find('b', text='Date of birth:').next_sibling.next_sibling.next_sibling.strip()[:-5]
  RiderBirthMonthHelper = RiderBirthMonthYear.split()[0][0].upper() + RiderBirthMonthYear.split()[0][1:3].lower()

  try:
    RiderBirthMonth = strptime(RiderBirthMonthHelper,'%b').tm_mon
  except:
    RiderBirthMonth = 1
  try:
    RiderBirthYear = RiderBirthMonthYear.split()[1]
  except:
    RiderBirthYear = 1900
  try:
    RiderBirthDay = html_soup.find('b', text='Date of birth:').next_sibling.strip()
  except:
    RiderBirthDay = 1
  RiderBirthdate = dt.datetime(year=int(float(RiderBirthYear)), month=int(float(RiderBirthMonth)), day=int(float(RiderBirthDay)))
  
  #Get retirement date
  rt = []
  for ultag in html_soup.find_all('ul', {'class': 'moblist rdr-teams'}):
    for litag in ultag.find_all('li'):
        if "Retired" in litag.text:
            rt.append(litag.text)
  try:
    dt.datetime.strptime(''.join(rt).strip()[-10:], '%Y-%m-%d')
  except ValueError:
    RiderRetirementDate = np.nan
  else:
    RiderRetirementDate = dt.datetime.strptime(''.join(rt).strip()[-10:], '%Y-%m-%d')

  #Get Rider Nationality
  RiderNationality = html_soup.find('a', {'class': 'black'}).text.strip()

  #Get Rider Starts
  keyStartsList = html_soup.find('ul', {'class': 'key-stats'}).find('li')

  RiderGTStarts = int(re.search(r'\d+', keyStartsList.text).group(0))
  RiderClassicsStarts = int(re.search(r'\d+', keyStartsList.next_sibling.text).group(0))

  #Get Rider KeyStats
  RiderKeyStatsUrl = RiderUrl + '/statistics/key-career-stats'

  KeyStatsResponse = get(RiderKeyStatsUrl, headers=headers)
  
  KeyStats_soup = BeautifulSoup(KeyStatsResponse.text, 'html.parser')
  
  OneDayStats = []
  try:
    OneDayStats = float(KeyStats_soup.find('h3', text = "1. One-day-races").next_sibling.text.strip('%'))/100
  except ValueError:
    OneDayStats = float(0)
  else:
    OneDayStats = float(KeyStats_soup.find('h3', text = "1. One-day-races").next_sibling.text.strip('%'))/100

  GCStats = []
  try:
    GCStats = float(KeyStats_soup.find('h3', text = "2. General classifications").next_sibling.text.strip('%'))/100
  except ValueError:
    GCStats = float(0)
  else:
    GCStats = float(KeyStats_soup.find('h3', text = "2. General classifications").next_sibling.text.strip('%'))/100

  StagesStats = []
  try:
    StagesStats = float(KeyStats_soup.find('h3', text = "3. Stages").next_sibling.text.strip('%'))/100
  except ValueError:
    StagesStats = float(0)
  else:
    StagesStats = float(KeyStats_soup.find('h3', text = "3. Stages").next_sibling.text.strip('%'))/100
    
  TTStats = []
  try:
    TTStats = float(KeyStats_soup.find('h3', text = "4. Time trial").next_sibling.text.strip('%'))/100
  except ValueError:
    TTStats = float(0)
  else:
    TTStats = float(KeyStats_soup.find('h3', text = "4. Time trial").next_sibling.text.strip('%'))/100

    
  #Create a dataframe to store all the information
  Data = {'Name':[RiderName], 'Weight':[RiderWeight], 'Height':[RiderHeight], 'Birthdate':[RiderBirthdate],'RetirementDate':[RiderRetirementDate],
         'Nationality':[RiderNationality], 'StartsGT':[RiderGTStarts], 'StartsClassics':[RiderClassicsStarts], 'StatsOneday':[OneDayStats],
         'StatsGC':[GCStats], 'StatsStages':[StagesStats], 'StatsTT':[TTStats], }
  RiderInfo = pd.DataFrame(Data)  
 
  #Get rider team per year
  team = []
  team_year = []
  for ultag in html_soup.find_all('ul', {'class': 'moblist rdr-teams'}):
    for litag in ultag.find_all('li'):
        if re.match('^\d{4}', litag.text):
            team.append(litag.text[4:])
            team_year.append(litag.text[:4])

  RiderInfo['Team'] = [team]
  RiderInfo['Team Year'] = [team_year]

  #Get rider season statistics
  RiderSeasonStatsUrl = RiderUrl + '/statistics/season-statistics'
  
  SeasonStatsResponse = get(RiderSeasonStatsUrl, headers=headers)
  
  SeasonStats_soup = BeautifulSoup(SeasonStatsResponse.text, 'html.parser')
  SeasonStats_table = SeasonStats_soup.select('tbody tr')
  
  l = []
  for tr in SeasonStats_table:
    td = tr.find_all('td')
    row = [tr.text for tr in td if tr.text]
    if row:
        l.append(row)
        
  RiderSeasonStatistics = pd.DataFrame(l, columns=["Season", "Points", "Racedays", "KMs", "Wins", "Top-10s"])
  RiderSeasonStatistics.drop(RiderSeasonStatistics.tail(1).index,inplace=True)

#Create array for RiderInfo dataframe
  RiderInfo['Stats Season'] = [RiderSeasonStatistics['Season'].to_numpy()]
  RiderInfo['Stats Points'] = [RiderSeasonStatistics['Points'].to_numpy()]
  RiderInfo['Stats Racedays'] = [RiderSeasonStatistics['Racedays'].to_numpy()]
  RiderInfo['Stats KMs'] = [RiderSeasonStatistics['KMs'].to_numpy()]
  RiderInfo['Stats Wins'] = [RiderSeasonStatistics['Wins'].to_numpy()]
  RiderInfo['Stats Top-10s'] = [RiderSeasonStatistics['Top-10s'].to_numpy()]
  
  #Create a dataframe to store all the information
  return RiderInfo

In order to obtain the Tour de France starter lists, the years to be scraped are stored in a list:

In [21]:
#Get rider names
#List with years of TdF startlists
Years = list(range(1995, 2020))
RidersTDF = []


Next, all TdF starters for the given years are scrapped from the crawled startlists

In [23]:
for i in range(len(Years)):
    TdfURL = 'https://www.procyclingstats.com/race/tour-de-france/'+str(Years[i])+'/gc/startlist'
    headers = {'User-Agent': user_agent_list[random.randint(1,(len(user_agent_list))-1)]}
    TdfResponse = get(TdfURL, headers=headers)
    Tdf_soup = BeautifulSoup(TdfResponse.text, 'html.parser')
    
    for ahref in Tdf_soup.find_all('a', {'class': 'rider blue'}):
        RidersTDF.append(ahref['href'][6:])

This results in 1358 riders for the years 1995 till 2019:

In [24]:
RidersTDF = list(set(RidersTDF))
len(RidersTDF)

1358

In [19]:
#Scraping the data for the list of riders:

rider_names = RidersTDF
RidersInfo = None

for rider_name in rider_names:
  RiderInfo = get_rider_info(rider_name,0)
  if RidersInfo is None:
    RidersInfo = RiderInfo
  else:
    RidersInfo = RidersInfo.append(RiderInfo, ignore_index = True)

Raw scraped data:

In [26]:
RidersInfo.head(2)

Unnamed: 0,Name,Weight,Height,Birthdate,RetirementDate,Nationality,StartsGT,StartsClassics,StatsOneday,StatsGC,StatsStages,StatsTT,Team,Team Year,Stats Season,Stats Points,Stats Racedays,Stats KMs,Stats Wins,Stats Top-10s
0,[Sebastian Langeveld],67.0,1.78,1985-01-17,,Netherlands,9,39,0.58,0.32,0.1,0.08,"[EF Education First (WT), EF Education First (...","[2020, 2019, 2018, 2017, 2016, 2016, 2015, 201...","[2019, 2018, 2017, 2016, 2015, 2014, 2013, 201...","[263, 85, 253, 152, 93, 332, 289, 244, 416, 28...","[74, 77, 57, 65, 60, 68, 44, 84, 52, 85, 69, 7...","[11620, 11787, 9584, 10465, 9855, 11331, 7217,...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 3, 0, 1, 1, 0, ...","[17, 2, 1, 4, 1, 6, 7, 5, 7, 8, 18, 6, 6, 5, 5..."
1,[Michael Blaudzun],,,1973-04-30,,Denmark,10,11,0.27,0.62,0.11,0.1,"[CSC ProTeam (PT), CSC ProTeam (PT), CSC ProTe...","[2008, 2007, 2006, 2005, 2004, 2003, 2002, 200...","[2008, 2007, 2006, 2005, 2004, 2003, 2002, 200...","[207, 59, 99, 413, 154, 102, 84, 377, 248, 355...","[79, 71, 75, 75, 60, 69, 38, 64, 58, 38, 21, 1...","[11440, 10431, 11510, 11179, 8729, 9840, 5381,...","[0, 0, 0, 2, 1, 1, 0, 2, 0, 2, 0, 0, 2, 0, 1, 0]","[8, 2, 3, 10, 3, 4, 4, 12, 9, 17, 1, 1, 17, 4,..."


Export of the data as pickle to keep the nested arrays in the dataframe:

In [27]:
import sys
sys.setrecursionlimit(100000)
RidersInfo.to_pickle('TDF-1995-2019_3.pkl')