In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pd.set_option('max_colwidth', 500)
pd.set_option('display.width', 100)
pd.set_option('display.expand_frame_repr', False)
pd.set_option("display.max_rows", 100)

In [3]:
df_stages = pd.read_csv(r'E:\cycling\cycling\data\tdf_stages.csv',encoding='iso-8859-1',na_filter=False)

### Clean up stage types
Merge types into "flat", "hilly", "mountain", "ITT", "TTT" and "other"

In [4]:
df_stages.Type.unique().tolist()

[u'Flat stage',
 u'Team time trial',
 u'Hilly stage',
 u'Mountain stage',
 u'Mountain',
 u'Flat',
 u'Individual time trial',
 u'Medium mountain stage',
 u'High mountain stage',
 u'Mountain time trial',
 u'Flat cobblestone stage',
 u'Mountain Stage',
 u'Transition stage',
 u'Plain stage',
 u'Intermediate stage',
 u'Stage with mountain(s)',
 u'Plain stage with cobblestones',
 u'Flat Stage',
 u'Half Stage',
 u'Stage with mountain']

In [5]:
stage_types = ['Flat','TTT','Hilly','Mountain','Mountain','Flat','ITT','Hilly','Mountain','ITT','Flat','Mountain','Other','Flat','Hilly','Mountain','Flat','Flat','Other','Mountain']

In [6]:
stage_types_dict = dict(zip(df_stages.Type.unique().tolist(),stage_types))
stage_types_dict

{u'Flat': 'Flat',
 u'Flat Stage': 'Flat',
 u'Flat cobblestone stage': 'Flat',
 u'Flat stage': 'Flat',
 u'Half Stage': 'Other',
 u'High mountain stage': 'Mountain',
 u'Hilly stage': 'Hilly',
 u'Individual time trial': 'ITT',
 u'Intermediate stage': 'Hilly',
 u'Medium mountain stage': 'Hilly',
 u'Mountain': 'Mountain',
 u'Mountain Stage': 'Mountain',
 u'Mountain stage': 'Mountain',
 u'Mountain time trial': 'ITT',
 u'Plain stage': 'Flat',
 u'Plain stage with cobblestones': 'Flat',
 u'Stage with mountain': 'Mountain',
 u'Stage with mountain(s)': 'Mountain',
 u'Team time trial': 'TTT',
 u'Transition stage': 'Other'}

In [7]:
df_stages['Type_clean'] = df_stages['Type'].replace(stage_types_dict)

In [8]:
df_stages = pd.get_dummies(df_stages,columns=['Winner_Country','Type_clean'])

In [9]:
df_stages['Date'] = pd.to_datetime(df_stages['Date'], format='%m/%d/%Y')

In [70]:
df_year = pd.DataFrame(df_stages.groupby(df_stages['Date'].dt.year).sum())
df_year['ave_distance'] = df_stages.groupby(df_stages['Date'].dt.year)['Distance'].mean()
df_year['num_stages']  = df_stages.groupby(df_stages['Date'].dt.year)['Distance'].count()
df_year.tail(10)

Unnamed: 0_level_0,Distance,Winner_Country_,Winner_Country_ FRA,Winner_Country_AUS,Winner_Country_AUT,Winner_Country_BEL,Winner_Country_BRA,Winner_Country_CAN,Winner_Country_COL,Winner_Country_CZE,...,Winner_Country_USA,Winner_Country_UZB,Type_clean_Flat,Type_clean_Hilly,Type_clean_ITT,Type_clean_Mountain,Type_clean_Other,Type_clean_TTT,ave_distance,num_stages
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009,3459.5,1,0,0,0,0,0,0,0,0,...,0,0,10,1,2,7,0,1,164.738095,21
2010,3641.9,0,0,0,0,0,0,0,0,0,...,0,0,10,3,2,6,0,0,173.42381,21
2011,3430.0,1,0,1,0,2,0,0,0,0,...,1,0,10,3,1,6,0,1,163.333333,21
2012,3496.9,0,0,0,0,0,0,0,0,0,...,0,0,9,4,3,5,0,0,166.519048,21
2013,3403.5,1,0,1,0,1,0,0,1,0,...,0,0,8,3,2,7,0,1,162.071429,21
2014,3660.5,0,0,1,0,0,0,0,0,0,...,0,0,9,5,1,6,0,0,174.309524,21
2015,3360.3,1,0,1,0,1,0,0,0,1,...,0,0,7,5,1,7,0,1,160.014286,21
2016,3529.0,0,0,1,0,2,0,0,1,0,...,0,0,9,3,2,7,0,0,168.047619,21
2017,3540.0,0,0,2,0,0,0,0,1,0,...,0,0,8,6,2,5,0,0,168.571429,21
2018,3351.0,1,0,0,0,0,0,0,3,0,...,0,0,8,5,1,6,0,1,159.571429,21


In [62]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.core.properties import value
output_notebook()

In [49]:
source = ColumnDataSource(data=df_year)

hover = HoverTool(tooltips=[('Year','@Date')])

plot=figure(x_axis_label='Total distance (km)',y_axis_label='Average distance per stage (km)')
plot.circle(x='Distance',y='ave_distance',source=source,size=5)
plot.add_tools(hover)
show(plot)

In [17]:
plot=figure(x_axis_label='Year',y_axis_label='Number of French winners',plot_width=900, plot_height=500,
            title='Number of French winners by year')
plot.line(x='Date',y='Winner_Country_FRA',source=source,line_width=3, line_alpha=0.6)
plot.circle(x='Date',y='Winner_Country_FRA',source=source,size=5)
plot.add_tools(hover)
show(plot)

In [77]:
types = ['Type_clean_Flat','Type_clean_Mountain','Type_clean_Hilly','Type_clean_Other','Type_clean_ITT','Type_clean_TTT']
colors = ["#40acd5", "#e9d5ce", "#f48064","#c8ddde", "#755148", "#cbe896"]
p = figure(x_axis_label='Year',y_axis_label='Stage type',plot_width=900, plot_height=500,
            title='Stage composition by year')

p.vbar_stack(types,x='Date', width=.8, source=source,color=colors,
             legend=[value(x) for x in list(set(stage_types))])
p.legend.location = "top_left"
show(p)

In [65]:
list(set(stage_types))

['Flat', 'Mountain', 'Hilly', 'Other', 'ITT', 'TTT']