In [77]:
import pandas as pd
import numpy as np

In [111]:
# Load the race results table.
# NOTE(review): per the pandas docs, parse_dates=True only asks pandas to try
# parsing the *index* as dates; no index column is configured here, so confirm
# whether this flag is actually needed.
df = pd.read_csv(
    'data/all_results.csv',
    parse_dates=True,
)

def getResultTimeInSeconds(result_time):
    """Convert a finish time such as ``'2:13:40'`` into a number of seconds.

    Parameters
    ----------
    result_time : str, timedelta-like, or missing value
        Anything ``pd.to_timedelta`` accepts. Missing values (``None``,
        ``NaN``) are allowed.

    Returns
    -------
    float
        Total duration in seconds, or ``np.nan`` when the value is missing.

    Raises
    ------
    Whatever ``pd.to_timedelta`` raises for input it cannot parse; this
    function does not catch it.
    """
    delta = pd.to_timedelta(result_time)
    if pd.isnull(delta):
        return np.nan
    # `Timedelta.seconds` is only the seconds *component* (0..86399) and drops
    # whole days, so a duration of 24h or more would be truncated.
    # `total_seconds()` returns the full duration.
    return delta.total_seconds()

# Add a numeric finish-time column (seconds) derived from the text column.
df['resultTimeSec'] = df['resultTime'].map(getResultTimeInSeconds)

# Number of rows per (year, gender); the last expression is the cell output.
# `size()` counts rows per group, so no column pre-selection is needed.
df.groupby(by=['year', 'gender']).size()

year  gender
2013  Female     282
      Male      2084
2014  Female     594
      Male      3665
2015  Female    1004
      Male      4902
2016  Female    1293
      Male      6864
dtype: int64

In [324]:
# Show the rows holding the slowest and the fastest recorded finish time.
result_secs = df['resultTimeSec']
extreme_times = [result_secs.max(), result_secs.min()]
df[result_secs.isin(extreme_times)]

Unnamed: 0,year,gender,status,resultTime,genderPosition,country,city,team,resultTimeSec
13824,2016,Male,Finished,2:13:40,1.0,Россия,Уфа,,8020.0
20426,2016,Male,Finished,6:39:09,6603.0,Россия,Москва,Мисцево,23949.0


In [325]:
# Runner counts per year, one Series (indexed by year) for each gender.
by_year_count_df = {}
for gender in ('Male', 'Female'):
    gender_df = df.loc[df['gender'] == gender]
    by_year_count_df[gender] = gender_df.groupby(by='year').size()

# Years present for male runners; reused below as the chart's x-axis labels.
years_list = by_year_count_df['Male'].index

# Cell output: male runner counts per year.
by_year_count_df['Male']

year
2013    2084
2014    3665
2015    4902
2016    6864
dtype: int64

In [331]:
%matplotlib notebook
import matplotlib.pyplot as plt
import datetime
import matplotlib.gridspec as gridspec

# Common chart settings
font_size_subplots_title = 10
labels_text_alpha = 0.6
# Width of a single bar / box, and how far each gender's item is shifted
# left (male) or right (female) of the year's tick position.
item_width = .25
item_offset = .15
# Color names: http://matplotlib.org/2.0.0b3/examples/color/named_colors.html
data_color_male = 'royalblue'
data_color_female = 'deeppink'

# Layout settings
plt.figure(figsize=(10, 6))

# One x position (1..N) per year. `years_list` is the name defined by the
# counting cell; the previous `years` was never defined in this notebook and
# raised NameError on a fresh kernel.
years_index = np.arange(1, len(years_list) + 1)

# Total runners count: grouped bars (male / female) per year.
ax1 = plt.subplot(2, 1, 1)
plt.title("Total number of runners", fontsize=font_size_subplots_title)

plt.xticks(years_index, years_list.values, alpha=labels_text_alpha)

male_count_bars = plt.bar(years_index - item_offset, height=by_year_count_df['Male'], width=item_width,
                          color=data_color_male, label='Males')
female_count_bars = plt.bar(years_index + item_offset, height=by_year_count_df['Female'], width=item_width,
                            color=data_color_female, label='Females')

# The legend must be built after the bars exist; calling it earlier produces
# an empty legend because there are no artists to describe yet.
ax1.legend(frameon=False)

# Print each bar's value just above it, horizontally centered on the bar.
for bar in male_count_bars + female_count_bars:
    bar_value = bar.get_height()
    bar_text = str(int(bar_value))
    text_x = bar.get_x() + bar.get_width() / 2
    text_y = bar_value
    ax1.text(text_x, text_y, bar_text,
             ha='center', fontsize=10, color='black', alpha=labels_text_alpha)

# Hide all tick marks and the y labels; keep only the year labels.
# Booleans are required: the strings 'on'/'off' are no longer interpreted by
# Matplotlib and a non-empty string such as 'off' is simply truthy.
plt.tick_params(top=False, bottom=False, left=False, right=False, labelleft=False, labelbottom=True)
for spine in ax1.spines.values():
    spine.set_visible(False)

# Time percentiles: one box per (year, gender) showing the finish-time spread.
ax2 = plt.subplot(2, 1, 2, sharex=ax1)
plt.title("Finish time distribution", fontsize=font_size_subplots_title)

# For each gender, collect one Series of finish times (seconds) per year,
# in the same order as `years_list`. Rows without a finish time are skipped.
by_years_speed = {'Male': [], 'Female': []}
for year in years_list:
    for gender in ['Male', 'Female']:
        times_df = df[ ~df['resultTimeSec'].isnull() & (df['year'] == year) & (df['gender'] == gender) ]
        by_years_speed[gender].append(times_df['resultTimeSec'])

# Y axis spans 2h..7h, expressed in seconds.
plt.ylim(2 * 60 * 60, 7 * 60 * 60)
# whis=(0, 100) draws whiskers over the full data range (min..max). It is the
# documented replacement for the removed whis='range' spelling.
plt.boxplot(by_years_speed['Male'], whis=(0, 100), widths=item_width, positions=(years_index - item_offset))
plt.boxplot(by_years_speed['Female'], whis=(0, 100), widths=item_width, positions=(years_index + item_offset))

# Year ticks (boxplot overrides the x ticks with its positions, so reset them)
plt.xticks(years_index, years_list.values, alpha=labels_text_alpha)

# Proper times ticks: one tick every 30 minutes from 2h to 7h inclusive.
time_values = np.arange(2 * 60 * 60, 7 * 60 * 60 + 1, 30 * 60)
time_labels = []
for value in time_values:
    # The second count is treated as a UTC epoch timestamp purely to reuse
    # strftime for HH:MM formatting; valid here because every tick is < 24h.
    dt = datetime.datetime.fromtimestamp(value, tz=datetime.timezone.utc)
    time_labels.append(dt.strftime('%H:%M'))

ax2.set_yticks(time_values)
ax2.set_yticklabels(time_labels)

# Keep only the left tick marks and both label sets. Booleans are required:
# the strings 'on'/'off' are no longer interpreted by Matplotlib and 'off'
# would be truthy.
plt.tick_params(top=False, bottom=False, left=True, right=False, labelleft=True, labelbottom=True)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)


