# This notebook explores results from the 2018 road cycling season

Data<sup>1</sup> include categorized races performed by all riders in World Tour teams<sup>2</sup>. This includes 18 teams with a total of 522 riders.
#### The goal is to find features that can predict the general classification top 10 of Tour de France (TdF).  
<sup>1</sup>data is scraped from CQranking  
<sup>2</sup>Pro Continental teams are excluded for now

In [2]:
import pandas as pd
import numpy as np

In [359]:
# Notebook-wide pandas display settings, applied one (option, value) pair at a time.
for option_name, option_value in [
    ('max_colwidth', 500),
    ('display.width', 100),
    ('display.expand_frame_repr', False),
    ("display.max_rows", 100),
]:
    pd.set_option(option_name, option_value)

In [371]:
# Load the 2018 rider list and the per-race result rows; both files are ISO-8859-1 encoded.
df_riders = pd.read_csv(
    r'E:\cycling\cycling\data\riders_2018.csv',
    encoding='iso-8859-1',
)
df_rider_results = pd.read_csv(
    r'E:\cycling\cycling\data\rider_results_2018.csv',
    encoding='iso-8859-1',
)

In [361]:
# Preview the first 20 riders.
df_riders.iloc[:20]

Unnamed: 0,team,Rider,Date of birth,rank_start,point_start,rank_end,point_end,Comments,Country
0,ALM,BAGDONAS Gediminas,26/12/1985,510.0,140.0,317.0,226.0,,LTU
1,ALM,BAKELANTS Jan,14/02/1986,117.0,472.0,565.0,120.0,,BEL
2,ALM,BARBIER Rudy,18/12/1992,104.0,506.0,717.0,90.0,,FRA
3,ALM,BARDET Romain,09/11/1990,30.0,1162.0,14.0,1492.0,,FRA
4,ALM,BIDARD François,19/03/1992,781.0,80.0,523.0,137.0,,FRA
5,ALM,BOUCHARD Geoffrey,01/04/1992,,,1140.0,42.0,Trainee as from 31/07,FRA
6,ALM,CHAMPOUSSIN Clément,29/05/1998,2949.0,2.0,978.0,55.0,Trainee as from 31/07,FRA
7,ALM,CHEREL Mikael,17/03/1986,552.0,127.0,492.0,147.0,,FRA
8,ALM,CHEVRIER Clément,29/06/1992,844.0,70.0,1416.0,25.0,,FRA
9,ALM,COSNEFROY Benoit,17/10/1995,375.0,191.0,311.0,229.0,,FRA


In [372]:
# Attach each rider's metadata to every one of their result rows (riders missing from either file are dropped).
df_results = pd.merge(
    df_rider_results,
    df_riders,
    on='Rider',
    how='inner',
    suffixes=('_l', '_r'),
)

In [373]:
# Show a bounded preview instead of rendering the whole merged results frame.
df_results.head(10)

Unnamed: 0,Date,Race,Rank,Rider,CQ,team,Date of birth,rank_start,point_start,rank_end,point_end,Comments,Country
0,6/10/2018,Tour de Vendée,23.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
1,4/10/2018,Paris - Bourges,70.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
2,26/09/2018,World Championships (Innsbruck) I.T.T.,51.,BAGDONAS Gediminas,5,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
3,23/09/2018,World Championships (Innsbruck) T.T.T.,15.,BAGDONAS Gediminas,7,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
4,12/09/2018,GP de Wallonie,107.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
5,1/09/2018,Brussels Cycling Classic,70.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
6,24/08/2018,Tour du Poitou Charentes,6.,BAGDONAS Gediminas,45,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
7,23/08/2018,"Tour du Poitou Charentes, Stage 4 : Champagné-Saint-Hilaire - Couhé I.T.T.",9.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
8,23/08/2018,"Tour du Poitou Charentes, Stage 3 : Gençay - Couhé",20.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU
9,22/08/2018,"Tour du Poitou Charentes, Stage 2 : Segonzac - Melle",15.,BAGDONAS Gediminas,0,ALM,26/12/1985,510.0,140.0,317.0,226.0,,LTU


### Clean up the data types

Replace the following non-numeric `Rank` codes with numeric sentinels so the column can be cast to float:  
leader (leader of the stage race): 0  
\- (usually means TTT): nan  
DNF (did not finish): -1  
OOT (out of time): -2  
DNS (did not start): -5  
DQ (disqualified): -10

In [374]:
# Map every non-numeric Rank code to its numeric sentinel in one pass, then cast to float.
rank_codes = {
    'leader': 0,
    'DNF': -1,
    'OOT': -2,
    '-': np.nan,
    'DNS': -5,
    'DQ': -10,
}
df_results['Rank'] = df_results['Rank'].replace(rank_codes).astype(float)

# Dates are stored as day/month/year strings.
df_results['Date'] = pd.to_datetime(df_results['Date'], format='%d/%m/%Y')

TdF results

In [375]:
# Rows whose race name is exactly 'Tour de France', keeping only the rider, the result and the season-start ranking columns.
df_tdf_gc = df_results.loc[
    df_results['Race'].eq('Tour de France'),
    ['Rider', 'Rank', 'rank_start', 'point_start'],
]

In [376]:
# Preview the first five Tour de France rows.
df_tdf_gc.head(5)

Unnamed: 0,Rider,Rank,rank_start,point_start
128,BARDET Romain,6.0,30.0,1162.0
518,DILLIER Silvan,83.0,72.0,653.0
819,FRANK Mathias,55.0,156.0,375.0
1288,LATOUR Pierre,13.0,124.0,453.0
1420,NAESEN Oliver,66.0,37.0,1019.0


Riders who participated in TdF

In [377]:
# Names of all riders with a Tour de France row, as a NumPy array.
tdf_riders = df_tdf_gc['Rider'].values

Top ten finishers

In [378]:
# Riders whose Tour de France rank lies in 1..10 inclusive; negative/zero sentinel ranks and NaN are excluded.
top10_riders = df_tdf_gc.loc[
    (df_tdf_gc['Rank'] >= 1) & (df_tdf_gc['Rank'] <= 10),
    'Rider',
].values

Non top ten finishers

In [379]:
# Set difference: Tour de France riders that are not in the top ten (list order is arbitrary).
non_top10_riders = list(set(tdf_riders).difference(set(top10_riders)))

Find races before TdF (2018-07-07)

In [380]:
# Keep only results dated strictly before 2018-07-07.
mask = df_results['Date'] < '2018-07-07'
df_pre_tdf = df_results[mask]

Calculate rider performance in pre-TdF races

In [381]:
# One accumulator per feature; each is filled in the same order as tdf_riders.
wins, podiums, top_tens, points_sum, race_days = [], [], [], [], []
for r in tdf_riders:
    # All pre-Tour result rows for this rider.
    df_r = df_pre_tdf.loc[df_pre_tdf['Rider'] == r]
    rank = df_r['Rank']
    wins.append(df_r.loc[rank == 1, 'Race'].count())
    podiums.append(df_r.loc[rank.between(1, 3), 'Race'].count())
    top_tens.append(df_r.loc[rank.between(1, 10), 'Race'].count())
    points_sum.append(df_r['CQ'].sum())
    # Number of result rows with a race name, stored under 'race_days'.
    race_days.append(df_r['Race'].count())

performance_columns = ['riders', 'win', 'podium', 'top_ten', 'points', 'race_days']
df_performance = pd.DataFrame(
    dict(zip(
        performance_columns,
        [tdf_riders, wins, podiums, top_tens, points_sum, race_days],
    )),
    columns=performance_columns,
)

In [382]:
# Ten best pre-Tour records, ordered by wins, then podiums, then top-ten finishes.
(
    df_performance
    .sort_values(by=['win', 'podium', 'top_ten'], ascending=False)
    .head(10)
)

Unnamed: 0,riders,win,podium,top_ten,points,race_days
56,VALVERDE BELMONTE Alejandro,11,17,26,1600,42
47,IMPEY Daryl,8,16,19,838,49
83,ROGLIC Primoz,7,14,18,982,37
89,KWIATKOWSKI Michal,7,11,17,805,43
31,SAGAN Peter,6,14,26,1246,43
86,BERNAL GOMEZ Egan Arley,6,11,20,866,39
22,VAN AVERMAET Greg,5,10,21,785,47
57,ALAPHILIPPE Julian,5,10,20,932,45
92,THOMAS Geraint,4,11,15,804,36
20,KÜNG Stefan,4,7,11,269,40


In [383]:
# Pre-Tour performance rows of the riders who went on to finish in the top ten.
df_performance.loc[df_performance['riders'].isin(top10_riders)]

Unnamed: 0,riders,win,podium,top_ten,points,race_days
0,BARDET Romain,1,6,17,772,32
53,LANDA MEANA Mikel,1,4,13,500,33
54,QUINTANA ROJAS Nairo Alexander,1,7,14,606,31
78,ZAKARIN Ilnur,0,0,4,190,34
81,KRUIJSWIJK Steven,0,0,9,362,32
83,ROGLIC Primoz,7,14,18,982,37
88,FROOME Chris,3,4,15,837,41
92,THOMAS Geraint,4,11,15,804,36
95,DUMOULIN Tom,1,5,12,646,39
112,MARTIN Daniel,1,3,8,370,37


### We see that:
1. Many riders with good pre-tdf performance are sprinters like Dylan Groenewegen who will never win a TdF general classification.  
1. TdF top 10 riders don't necessarily perform well in the early season as they are building up towards TdF  

Let's look at Critérium du Dauphiné and Tour de Suisse instead which are TdF preparation races and are good indicators of the form of riders leading up to TdF.

In [6]:
def get_race_performance(race_name, abbrev, results=None):
    """Summarise every rider's results in the rows of one race.

    Parameters
    ----------
    race_name : str or bytes
        Text searched for in the ``Race`` column. It is matched literally
        (not as a regular expression), so names containing ``(``, ``)`` or
        ``.`` are safe. UTF-8 encoded bytes are decoded first.
    abbrev : str
        Suffix appended as ``_<abbrev>`` to every output column name.
    results : pandas.DataFrame, optional
        Frame with ``Race``, ``Rider``, ``Rank`` and ``CQ`` columns. When
        omitted, the notebook-level ``df_results`` at call time is used,
        as before.

    Returns
    -------
    pandas.DataFrame
        One row per rider, in order of first appearance, with columns
        ``riders_<abbrev>``, ``win_<abbrev>`` (rows with rank 1),
        ``podium_<abbrev>`` (rank 1-3), ``top_ten_<abbrev>`` (rank 1-10)
        and ``points_<abbrev>`` (sum of ``CQ``).
    """
    if results is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        results = df_results
    if isinstance(race_name, bytes):
        # Only byte strings need decoding; a text string has no .decode on Python 3.
        race_name = race_name.decode('utf-8')

    # na=False treats a missing race name as "no match" instead of breaking the mask.
    in_race = results['Race'].str.contains(race_name, regex=False, na=False)
    df_race = results.loc[in_race]

    riders = []
    points_sum = []
    wins = []
    podiums = []
    top_tens = []
    # sort=False keeps riders in order of first appearance.
    for rider, df_r in df_race.groupby('Rider', sort=False):
        rank = df_r['Rank']
        riders.append(rider)
        points_sum.append(df_r['CQ'].sum())
        wins.append(df_r.loc[rank == 1, 'Race'].count())
        podiums.append(df_r.loc[rank.between(1, 3), 'Race'].count())
        top_tens.append(df_r.loc[rank.between(1, 10), 'Race'].count())

    df_race_performance = pd.DataFrame(
        {'riders': riders,
         'win': wins,
         'podium': podiums,
         'top_ten': top_tens,
         'points': points_sum,
        },columns=['riders','win','podium','top_ten','points'])
    return df_race_performance.add_suffix('_{}'.format(abbrev))

In [386]:
# Per-rider summaries for the two preparation races.
df_dauphine_perform = get_race_performance(race_name='Critérium du Dauphiné', abbrev='dauphine')
df_swiss_perform = get_race_performance(race_name='Tour de Suisse', abbrev='swiss')

In [387]:
# Ten best Dauphiné records, ordered by wins, then podiums, then top-ten finishes.
(
    df_dauphine_perform
    .sort_values(by=['win_dauphine', 'podium_dauphine', 'top_ten_dauphine'], ascending=False)
    .head(10)
)

Unnamed: 0,riders_dauphine,win_dauphine,podium_dauphine,top_ten_dauphine,points_dauphine
102,THOMAS Geraint,2,5,6,372
99,KWIATKOWSKI Michal,2,2,3,99
55,YATES Adam,1,3,6,268
120,MARTIN Daniel,1,3,5,230
52,IMPEY Daryl,1,2,3,81
63,ALAPHILIPPE Julian,1,2,3,92
100,MOSCON Gianni,1,2,3,45
28,ACKERMANN Pascal,1,2,2,63
97,CASTROVIEJO NICOLAS Jonathan,1,1,2,17
98,GEOGHEGAN HART Tao,1,1,2,59


In [388]:
# Ten best Tour de Suisse records, ordered by wins, then podiums, then top-ten finishes.
(
    df_swiss_perform
    .sort_values(by=['win_swiss', 'podium_swiss', 'top_ten_swiss'], ascending=False)
    .head(10)
)

Unnamed: 0,riders_swiss,win_swiss,podium_swiss,top_ten_swiss,points_swiss
24,PORTE Richie,2,3,5,337
23,KÜNG Stefan,2,2,2,105
106,ANDERSEN Søren Kragh,1,3,3,85
34,SAGAN Peter,1,2,5,90
61,QUINTANA ROJAS Nairo Alexander,1,2,4,209
27,VAN GARDEREN Tejay,1,2,2,34
126,ULISSI Diego,1,1,5,126
14,COLBRELLI Sonny,1,1,3,56
26,VAN AVERMAET Greg,1,1,2,19
36,DEMARE Arnaud,1,1,2,50


We're getting closer to singling out the actual GC riders like Geraint Thomas, but there are still many irrelevant riders.  
Maybe we can use these to predict stage wins... but for GC wins we need to further scrutinize.  
Also, some big contenders like Chris Froome are not doing either race.  

We probably need to figure out how to extract more information e.g. **rider specialty, stage type** etc.  
But for now let's just use these features to see what we can get.

Creating the training data set

In [389]:
# (rows, columns) of the pre-Tour performance table; it has one row per entry in tdf_riders.
df_performance.shape

(115, 6)

In [390]:
# Start from the Tour de France rows and left-join the three per-rider feature tables,
# so riders without a Dauphiné or Tour de Suisse entry are kept with NaN in those columns.
df_pre_perform = (
    df_tdf_gc
    .merge(df_performance, left_on='Rider', right_on='riders', how='left', suffixes=('_l', '_r'))
    .merge(df_dauphine_perform, left_on='Rider', right_on='riders_dauphine', how='left')
    .merge(df_swiss_perform, left_on='Rider', right_on='riders_swiss', how='left')
)

# Where a rider has no Dauphiné value, fall back to the Tour de Suisse value.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# df[col] is chained assignment and does not update the frame under pandas
# Copy-on-Write.
for feature in ['riders', 'win', 'podium', 'top_ten', 'points']:
    dauphine_col = '{}_dauphine'.format(feature)
    swiss_col = '{}_swiss'.format(feature)
    df_pre_perform[dauphine_col] = df_pre_perform[dauphine_col].fillna(df_pre_perform[swiss_col])

In [391]:
# Drop the Tour de Suisse columns by name rather than by position, together with
# the duplicate rider-name columns left over from the merges.
swiss_cols = [col for col in df_pre_perform.columns if col.endswith('_swiss')]
df_pre_perform_clean = df_pre_perform.drop(swiss_cols + ['riders', 'riders_dauphine'], axis=1)

# Target label: 1 only for a real finishing position 1..10. A plain `Rank <= 10`
# would also label the sentinel ranks (0 leader, -1 DNF, -2 OOT, -5 DNS, -10 DQ)
# as top-ten finishers.
df_pre_perform_clean['tdf_top10'] = df_pre_perform_clean['Rank'].between(1, 10).astype(int)

# NOTE(review): dropna removes every rider with any missing feature, which
# includes riders who rode neither preparation race - confirm this is intended.
df_pre_perform_clean = df_pre_perform_clean.dropna().set_index('Rider')

In [392]:
# Show a bounded preview instead of rendering the whole training table.
df_pre_perform_clean.head(10)

Unnamed: 0_level_0,Rank,rank_start,point_start,win,podium,top_ten,points,race_days,win_dauphine,podium_dauphine,top_ten_dauphine,points_dauphine,tdf_top10
Rider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BARDET Romain,6.0,30.0,1162.0,1,6,17,772,32,0.0,2.0,6.0,203.0,1
DILLIER Silvan,83.0,72.0,653.0,1,3,6,371,38,0.0,0.0,0.0,5.0,0
FRANK Mathias,55.0,156.0,375.0,0,0,8,262,52,0.0,0.0,1.0,30.0,0
LATOUR Pierre,13.0,124.0,453.0,1,3,18,601,43,0.0,0.0,5.0,106.0,0
NAESEN Oliver,66.0,37.0,1019.0,0,2,7,433,41,0.0,0.0,2.0,15.0,0
CORT NIELSEN Magnus,68.0,112.0,480.0,2,6,17,424,47,0.0,0.0,2.0,13.0,0
FRAILE MATARRANZ Omar,57.0,257.0,263.0,2,4,4,225,37,0.0,0.0,0.0,5.0,0
FUGLSANG Jakob,12.0,67.0,703.0,1,7,16,786,39,0.0,2.0,3.0,224.0,0
HANSEN Jesper,56.0,233.0,279.0,0,0,1,145,38,0.0,0.0,0.0,12.0,0
KANGERT Tanel,16.0,926.0,58.0,1,1,1,123,44,0.0,0.0,0.0,60.0,0


In [393]:
# Features: every column except the first and the last. Target: the last column, kept as a one-column frame.
df_X = df_pre_perform_clean.iloc[:, 1:-1]
df_y = df_pre_perform_clean.iloc[:, -1].to_frame()

# Plain NumPy views of the same data.
X = np.asarray(df_X)
y = np.asarray(df_y)

In [394]:
# Raw strings keep the Windows backslashes literal; in a normal string '\c', '\d',
# '\X' and '\y' are invalid escape sequences (same paths, no deprecation warning).
df_X.to_csv(r'E:\cycling\cycling\data\X_train.csv',encoding='iso-8859-1')
df_y.to_csv(r'E:\cycling\cycling\data\y_train.csv',encoding='iso-8859-1')

Creating the test data set from 2017 results

In [7]:
# Load the 2017 files. NOTE: this rebinds df_riders and df_rider_results, replacing the 2018 frames.
df_riders = pd.read_csv(
    r'E:\cycling\cycling\data\riders_2017.csv',
    encoding='iso-8859-1',
)
df_rider_results = pd.read_csv(
    r'E:\cycling\cycling\data\rider_results_2017.csv',
    encoding='iso-8859-1',
)

In [18]:
# --- Merge 2017 results with rider metadata (rebinds df_results) ---
df_results = df_rider_results.merge(df_riders,on='Rider',how='inner',suffixes=('_l','_r'))

# --- Clean data types: map non-numeric Rank codes to numeric sentinels, parse dates ---
df_results['Rank'] = df_results['Rank'].replace({
    'leader': 0,
    'DNF': -1,
    'OOT': -2,
    '-': np.nan,
    'DNS': -5,
    'DQ': -10,
}).astype(float)
df_results['Date'] = pd.to_datetime(df_results['Date'],format='%d/%m/%Y')

# --- Tour de France rows and the riders that have one ---
df_tdf_gc = df_results.loc[df_results['Race']=='Tour de France',['Rider','Rank','rank_start','point_start']]
tdf_riders = df_tdf_gc.Rider.values

# --- Results dated strictly before 2017-07-01 ---
mask = (df_results['Date'] < '2017-07-01')
df_pre_tdf = df_results.loc[mask]

# --- Per-rider pre-Tour features, in the same order as tdf_riders ---
points_sum = []
race_days = []
wins = []
podiums = []
top_tens = []
for r in tdf_riders:
    df_r = df_pre_tdf[df_pre_tdf['Rider']==r]
    points_sum.append(df_r['CQ'].sum())
    race_days.append(df_r['Race'].count())
    wins.append(df_r.loc[df_r['Rank']==1,'Race'].count())
    podiums.append(df_r.loc[df_r['Rank'].between(1,3),'Race'].count())
    top_tens.append(df_r.loc[df_r['Rank'].between(1,10),'Race'].count())

df_performance = pd.DataFrame(
    {'riders': tdf_riders,
     'win': wins,
     'podium': podiums,
     'top_ten': top_tens,
     'points': points_sum,
     'race_days': race_days,
    },columns=['riders','win','podium','top_ten','points','race_days'])

# --- Preparation-race features (computed from the df_results bound above) ---
df_dauphine_perform = get_race_performance('Critérium du Dauphiné',abbrev='dauphine')
df_swiss_perform = get_race_performance('Tour de Suisse',abbrev='swiss')

# --- Left-join all feature tables onto the Tour de France rows ---
df_pre_perform = df_tdf_gc.merge(df_performance,left_on='Rider',right_on='riders',how='left',suffixes=('_l','_r'))
df_pre_perform = df_pre_perform.merge(df_dauphine_perform,left_on='Rider',right_on='riders_dauphine',how='left')
df_pre_perform = df_pre_perform.merge(df_swiss_perform,left_on='Rider',right_on='riders_swiss',how='left')

# Fall back to the Tour de Suisse value where the Dauphiné value is missing.
# Assigned back to the column: fillna(..., inplace=True) on df[col] is chained
# assignment and does not update the frame under pandas Copy-on-Write.
for feature in ['riders', 'win', 'podium', 'top_ten', 'points']:
    dauphine_col = '{}_dauphine'.format(feature)
    swiss_col = '{}_swiss'.format(feature)
    df_pre_perform[dauphine_col] = df_pre_perform[dauphine_col].fillna(df_pre_perform[swiss_col])

# --- Final table: drop the Swiss and duplicate name columns, add the label, drop incomplete rows ---
swiss_cols = [col for col in df_pre_perform.columns if col.endswith('_swiss')]
df_pre_perform_clean = df_pre_perform.drop(swiss_cols + ['riders','riders_dauphine'],axis=1)
# between(1, 10) keeps the sentinel ranks (0, -1, -2, -5, -10) out of the positive
# class; `Rank <= 10` would label them as top-ten finishers.
df_pre_perform_clean['tdf_top10'] = df_pre_perform_clean['Rank'].between(1,10).astype(int)
df_pre_perform_clean = df_pre_perform_clean.dropna().set_index('Rider')

In [19]:
# Show a bounded preview instead of rendering the whole 2017 table.
df_pre_perform_clean.head(10)

Unnamed: 0_level_0,Rank,rank_start,point_start,win,podium,top_ten,points,race_days,win_dauphine,podium_dauphine,top_ten_dauphine,points_dauphine,tdf_top10
Rider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BAKELANTS Jan,22.0,70.0,636.0,0,1,3,136,33,0.0,0.0,1.0,26.0,0
BARDET Romain,3.0,8.0,1667.0,0,1,13,481,37,0.0,0.0,4.0,120.0,1
DOMONT Axel,68.0,566.0,117.0,0,1,1,69,40,0.0,1.0,1.0,33.0,0
FRANK Mathias,30.0,215.0,297.0,0,0,6,274,42,0.0,0.0,3.0,113.0,0
GAUTIER Cyril,48.0,319.0,222.0,0,1,3,142,39,0.0,0.0,0.0,0.0,0
LATOUR Pierre,29.0,67.0,645.0,1,3,10,339,47,0.0,0.0,1.0,48.0,0
NAESEN Oliver,63.0,68.0,643.0,1,4,17,688,48,0.0,0.0,2.0,8.0,0
VUILLERMOZ Alexis,13.0,124.0,437.0,1,1,1,161,35,0.0,0.0,0.0,18.0,0
ARU Fabio,5.0,50.0,777.0,1,3,12,461,26,0.0,0.0,4.0,135.0,1
GRUZDEV Dmitriy,115.0,421.0,166.0,2,2,3,76,31,0.0,0.0,0.0,5.0,0


In [352]:
# Rows labelled as top-ten finishers.
df_pre_perform_clean.loc[df_pre_perform_clean['tdf_top10'].eq(1)]

Unnamed: 0_level_0,Rank,rank_start,point_start,win,podium,top_ten,points,race_days,win_dauphine,podium_dauphine,top_ten_dauphine,points_dauphine,tdf_top10
Rider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BARDET Romain,3.0,8.0,1667.0,1,4,23,1151,81,0.0,0.0,4.0,120.0,1
ARU Fabio,5.0,50.0,777.0,2,6,25,1268,76,0.0,0.0,4.0,135.0,1
YATES Simon,7.0,72.0,621.0,3,4,14,828,82,0.0,0.0,0.0,48.0,1
MARTIN Daniel,6.0,30.0,993.0,1,11,29,1383,62,0.0,2.0,3.0,193.0,1
FROOME Chris,1.0,2.0,2552.0,4,16,34,2592,105,0.0,1.0,4.0,156.0,1
BARGUIL Warren,10.0,76.0,613.0,2,4,12,710,69,0.0,0.0,0.0,6.0,1
CONTADOR VELASCO Alberto,9.0,7.0,1698.0,1,13,33,1494,87,0.0,0.0,3.0,62.0,1
MEINTJES Louis,8.0,95.0,551.0,0,1,10,616,91,0.0,1.0,2.0,108.0,1


In [355]:
# Features: every column except the first and the last. Target: the last column.
df_X = df_pre_perform_clean.iloc[:,1:-1]
X = np.asarray(df_X)
df_y = pd.DataFrame(df_pre_perform_clean.iloc[:,-1])
y = np.asarray(df_y)

# Raw strings keep the Windows backslashes literal; in a normal string '\c', '\d',
# '\X' and '\y' are invalid escape sequences (same paths, no deprecation warning).
df_X.to_csv(r'E:\cycling\cycling\data\X_test.csv',encoding='iso-8859-1')
df_y.to_csv(r'E:\cycling\cycling\data\y_test.csv',encoding='iso-8859-1')

In [357]:
# Show a bounded preview of the test labels instead of the whole frame.
df_y.head(10)

Unnamed: 0_level_0,tdf_top10
Rider,Unnamed: 1_level_1
BAKELANTS Jan,0
BARDET Romain,1
DOMONT Axel,0
FRANK Mathias,0
GAUTIER Cyril,0
LATOUR Pierre,0
NAESEN Oliver,0
VUILLERMOZ Alexis,0
ARU Fabio,1
GRUZDEV Dmitriy,0
