In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pd.set_option('max_colwidth', 500)
pd.set_option('display.width', 100)
pd.set_option('display.expand_frame_repr', False)
pd.set_option("display.max_rows", 100)

In [97]:
df_stages = pd.read_csv(r'E:\cycling\cycling\data\tdf_stages.csv',encoding='iso-8859-1',na_filter=False)

In [71]:
df_stages[df_stages.Type=='Half Stage']

Unnamed: 0,Stage,Date,Distance,Origin,Destination,Type,Winner,Winner_Country
940,5b,6/29/1976,144.0,Leuven,Verviers,Half Stage,Miguel-Maria Lasa,ESP
953,18a,7/14/1976,86.0,Auch,Langon,Half Stage,Freddy Maertens,BEL
954,18b,7/14/1976,123.0,Langon,Lacanau,Half Stage,Freddy Maertens,BEL
955,18c,7/14/1976,70.0,Lacanau,Bordeaux,Half Stage,Gerben Karstens,NED
960,22b,7/18/1976,91.0,Paris,Paris,Half Stage,Gerben Karstens,NED


### Clean up stage types
Merge types into "flat", "hilly", "mountain", "ITT", "TTT" and "other"

In [77]:
df_stages.Type.unique().tolist()

[u'Flat stage',
 u'Team time trial',
 u'Hilly stage',
 u'Mountain stage',
 u'Mountain',
 u'Flat',
 u'Individual time trial',
 u'Medium mountain stage',
 u'High mountain stage',
 u'Mountain time trial',
 u'Flat cobblestone stage',
 u'Mountain Stage',
 u'Transition stage',
 u'Plain stage',
 u'Intermediate stage',
 u'Stage with mountain(s)',
 u'Plain stage with cobblestones',
 u'Flat Stage',
 u'Half Stage',
 u'Stage with mountain']

In [76]:
stage_types = ['Flat','TTT','Hilly','Mountain','Mountain','Flat','ITT','Hilly','Mountain','ITT','Flat','Mountain','Other','Flat','Hilly','Mountain','Flat','Flat','Other','Mountain']

In [98]:
stage_types_dict = dict(zip(df_stages.Type.unique().tolist(),stage_types))
stage_types_dict

{u'Flat': 'Flat',
 u'Flat Stage': 'Flat',
 u'Flat cobblestone stage': 'Flat',
 u'Flat stage': 'Flat',
 u'Half Stage': 'Other',
 u'High mountain stage': 'Mountain',
 u'Hilly stage': 'Hilly',
 u'Individual time trial': 'ITT',
 u'Intermediate stage': 'Hilly',
 u'Medium mountain stage': 'Hilly',
 u'Mountain': 'Mountain',
 u'Mountain Stage': 'Mountain',
 u'Mountain stage': 'Mountain',
 u'Mountain time trial': 'ITT',
 u'Plain stage': 'Flat',
 u'Plain stage with cobblestones': 'Flat',
 u'Stage with mountain': 'Mountain',
 u'Stage with mountain(s)': 'Mountain',
 u'Team time trial': 'TTT',
 u'Transition stage': 'Other'}

In [99]:
df_stages['Type_clean'] = df_stages['Type'].replace(stage_types_dict)

In [100]:
df_stages = pd.get_dummies(df_stages,columns=['Winner_Country','Type_clean'])

In [101]:
df_stages['Date'] = pd.to_datetime(df_stages['Date'], format='%m/%d/%Y')

In [102]:
df_stages

Unnamed: 0,Stage,Date,Distance,Origin,Destination,Type,Winner,Winner_Country_,Winner_Country_ FRA,Winner_Country_AUS,...,Winner_Country_UKR,Winner_Country_URS,Winner_Country_USA,Winner_Country_UZB,Type_clean_Flat,Type_clean_Hilly,Type_clean_ITT,Type_clean_Mountain,Type_clean_Other,Type_clean_TTT
0,1,2018-07-07,201.0,Noirmoutier-en-l'Île,Fontenay-le-Comte,Flat stage,Fernando Gaviria,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,2018-07-08,182.5,Mouilleron-Saint-Germain,La Roche-sur-Yon,Flat stage,Peter Sagan,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,2018-07-09,35.5,Cholet,Cholet,Team time trial,BMC Racing Team,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,2018-07-10,195.0,La Baule,Sarzeau,Flat stage,Fernando Gaviria,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,2018-07-11,204.5,Lorient,Quimper,Hilly stage,Peter Sagan,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,6,2018-07-12,181.0,Brest,Mûr-de-Bretagne,Hilly stage,Dan Martin,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,7,2018-07-13,231.0,Fougères,Chartres,Flat stage,Dylan Groenewegen,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,8,2018-07-14,181.0,Dreux,Amiens,Flat stage,Dylan Groenewegen,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,9,2018-07-15,156.5,Arras,Roubaix,Hilly stage,John Degenkolb,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,10,2018-07-17,158.5,Annecy,Le Grand-Bornand,Mountain stage,Julian Alaphilippe,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [104]:
df_year = pd.DataFrame(df_stages.groupby(df_stages['Date'].dt.year).sum())
df_year

Unnamed: 0_level_0,Distance,Winner_Country_,Winner_Country_ FRA,Winner_Country_AUS,Winner_Country_AUT,Winner_Country_BEL,Winner_Country_BRA,Winner_Country_CAN,Winner_Country_COL,Winner_Country_CZE,...,Winner_Country_UKR,Winner_Country_URS,Winner_Country_USA,Winner_Country_UZB,Type_clean_Flat,Type_clean_Hilly,Type_clean_ITT,Type_clean_Mountain,Type_clean_Other,Type_clean_TTT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1903,2428.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,1,0,0
1904,2429.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,1,0,0
1905,3021.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,9,0,0,2,0,0
1906,4543.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11,0,0,2,0,0
1907,4488.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,10,0,0,4,0,0
1908,4497.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10,0,0,4,0,0
1909,4497.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,10,0,0,4,0,0
1910,4734.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,9,0,0,6,0,0
1911,5344.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,8,0,0,7,0,0
1912,5319.0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,8,0,0,7,0,0


In [106]:
df_year = pd.DataFrame(df_stages.groupby(df_stages['Date'].dt.year).sum())
df_year['ave_distance'] = df_stages.groupby(df_stages['Date'].dt.year)['Distance'].mean()
df_year['num_stages']  = df_stages.groupby(df_stages['Date'].dt.year)['Distance'].count()
df_year

Unnamed: 0_level_0,Distance,Winner_Country_,Winner_Country_ FRA,Winner_Country_AUS,Winner_Country_AUT,Winner_Country_BEL,Winner_Country_BRA,Winner_Country_CAN,Winner_Country_COL,Winner_Country_CZE,...,Winner_Country_USA,Winner_Country_UZB,Type_clean_Flat,Type_clean_Hilly,Type_clean_ITT,Type_clean_Mountain,Type_clean_Other,Type_clean_TTT,ave_distance,num_stages
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1903,2428.0,0,0,0,0,0,0,0,0,0,...,0,0,5,0,0,1,0,0,404.666667,6
1904,2429.0,0,0,0,0,0,0,0,0,0,...,0,0,5,0,0,1,0,0,404.833333,6
1905,3021.0,0,0,0,0,0,0,0,0,0,...,0,0,9,0,0,2,0,0,274.636364,11
1906,4543.0,0,0,0,0,0,0,0,0,0,...,0,0,11,0,0,2,0,0,349.461538,13
1907,4488.0,0,1,0,0,0,0,0,0,0,...,0,0,10,0,0,4,0,0,320.571429,14
1908,4497.0,0,0,0,0,0,0,0,0,0,...,0,0,10,0,0,4,0,0,321.214286,14
1909,4497.0,0,0,0,0,1,0,0,0,0,...,0,0,10,0,0,4,0,0,321.214286,14
1910,4734.0,0,0,0,0,0,0,0,0,0,...,0,0,9,0,0,6,0,0,315.600000,15
1911,5344.0,0,0,0,0,1,0,0,0,0,...,0,0,8,0,0,7,0,0,356.266667,15
1912,5319.0,0,0,0,0,5,0,0,0,0,...,0,0,8,0,0,7,0,0,354.600000,15


In [109]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

In [110]:
source = ColumnDataSource(data=df_year)

hover = HoverTool(tooltips=[('Year','@Date')])

plot=figure(x_axis_label='Total distance (km)',y_axis_label='Average distance per stage (km)')
plot.circle(x='Distance',y='ave_distance',source=source,size=5)
plot.add_tools(hover)
show(plot)

In [123]:
plot=figure(x_axis_label='Year',y_axis_label='Number of French winners',plot_width=900, plot_height=500)
plot.line(x='Date',y='Winner_Country_FRA',source=source,line_width=3, line_alpha=0.6)
plot.circle(x='Date',y='Winner_Country_FRA',source=source,size=5)
plot.add_tools(hover)
show(plot)