In [77]:
import pandas as pd
import numpy as np

In [111]:
# Load the race results table that every later cell works from.
# NOTE(review): with no index_col given, parse_dates=True only asks pandas to
# try parsing the index as dates — confirm whether it is actually needed here.
df = pd.read_csv(
    'data/all_results.csv',
    parse_dates=True,
)

def getResultTimeInSeconds(result_time):
    """Convert a finish time (e.g. ``'02:19:36'``) into whole elapsed seconds.

    Parameters
    ----------
    result_time : str or any value accepted by ``pd.to_timedelta``
        The finish time to convert. Missing values (``None`` / NaN / NaT)
        are allowed.

    Returns
    -------
    int or float
        Total elapsed seconds as an ``int``, or ``np.nan`` when the input
        parses to a null timedelta.
    """
    delta = pd.to_timedelta(result_time)
    if pd.isnull(delta):
        return np.nan
    # Timedelta.seconds is only the seconds *component* (0..86399) and drops
    # whole days, so a 25-hour result would come back as 3600. total_seconds()
    # covers the full duration; int() keeps the whole-second return value.
    return int(delta.total_seconds())

# Add a numeric (seconds) version of the finish time next to the raw value.
result_seconds = df['resultTime'].map(getResultTimeInSeconds)
df['resultTimeSec'] = result_seconds

# Number of rows per (year, gender). size() counts rows regardless of which
# columns are selected, so no column sub-selection is needed.
df.groupby(['year', 'gender']).size()

year  gender
2013  Female     282
      Male      2084
2014  Female     594
      Male      3665
2015  Female    1004
      Male      4902
2016  Female    1293
      Male      6864
dtype: int64

In [105]:
# Preview the first 20 rows ordered by the raw 'resultTime' column.
# reset_index() keeps the original row label as an 'index' column.
# NOTE(review): this orders by the raw column value; if it holds strings the
# order is lexicographic, which presumably matches time order only for
# zero-padded HH:MM:SS — confirm.
(
    df
    .sort_values(by='resultTime')
    .reset_index()
    .head(20)
)

Unnamed: 0,index,year,gender,status,resultTime,genderPosition,country,city,team,resultTimeSec
0,5,2013,Male,Finished,02:19:36,1.0,Украина,Киев,,8376.0
1,9,2013,Male,Finished,02:21:12,2.0,Украина,Киев,,8472.0
2,10,2013,Male,Finished,02:21:27,3.0,Украина,Киев,,8487.0
3,11,2013,Male,Finished,02:21:57,4.0,Россия,Москва,,8517.0
4,16,2013,Male,Finished,02:28:55,5.0,Россия,Санкт-Петербург,,8935.0
5,17,2013,Male,Finished,02:31:00,6.0,,,,9060.0
6,18,2013,Male,Finished,02:31:28,7.0,Россия,ХАБАРОВСК,,9088.0
7,19,2013,Male,Finished,02:35:27,8.0,Россия,Москва,,9327.0
8,23,2013,Male,Finished,02:39:54,9.0,Россия,Нижний Новгород,,9594.0
9,24,2013,Male,Finished,02:42:14,10.0,Россия,ЖУКОВСКИЙ,,9734.0


In [202]:
# Runner counts per year, kept separately for each gender label.
# Maps gender label -> Series of row counts indexed by year.
by_year_count_df = dict()
for gender in ('Male', 'Female'):
    # Rows belonging to the current gender only.
    is_current_gender = df['gender'].eq(gender)
    gender_df = df.loc[is_current_gender]
    by_year_count_df[gender] = gender_df.groupby('year').size()

# The years present in the male counts are used as the list of years.
years_list = by_year_count_df['Male'].index

by_year_count_df['Male']

year
2013    2084
2014    3665
2015    4902
2016    6864
dtype: int64

In [224]:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Common chart settings
font_size_subplots_title = 10
labels_text_alpha = 0.6
item_width = .3
count_label_offset = 17            # gap (in data units) between a bar top and its label
max_finish_time_sec = 7 * 60 * 60  # upper y limit of the finish-time chart: 7 hours
# Color names: http://matplotlib.org/2.0.0b3/examples/color/named_colors.html
data_color_male = 'royalblue'
data_color_female = 'deeppink'

# One x position per year, starting at 1 so the bars line up with the default
# boxplot positions (1..N) on the shared x axis.
years_index = np.arange(1, len(years_list) + 1)
print(years_list.values)

plt.figure(figsize=(10, 7))

# Total runners count
ax1 = plt.subplot(2, 1, 1)
ax1.set_title("Number of runners", fontsize=font_size_subplots_title)
# Both series use the same x positions and width, so the female bars are
# drawn on top of the male bars.
male_count_bars = ax1.bar(years_index, height=by_year_count_df['Male'],
                          width=item_width, color=data_color_male)
female_count_bars = ax1.bar(years_index, height=by_year_count_df['Female'],
                            width=item_width, color=data_color_female)
plt.xticks(years_index, years_list.values, alpha=labels_text_alpha)
ax1.legend(['Males', 'Females'])


def draw_count_labels(bars_list):
    """Write each bar's height as an integer label just above the bar on ``ax1``.

    Parameters
    ----------
    bars_list : iterable of bar patches
        The bars to annotate, e.g. the container returned by ``bar()``.
    """
    for bar in bars_list:
        bar_value = bar.get_height()
        # Horizontally centred on the bar, slightly above its top edge.
        text_x = bar.get_x() + bar.get_width() / 2
        text_y = bar_value + count_label_offset
        ax1.text(text_x, text_y, str(int(bar_value)),
                 ha='center', fontsize=10, color='black', alpha=labels_text_alpha)


draw_count_labels(male_count_bars)
draw_count_labels(female_count_bars)


# Time percentiles
ax2 = plt.subplot(2, 1, 2, sharex=ax1)
ax2.set_title("Finish time distribution", fontsize=font_size_subplots_title)

# Finish times (seconds) of every runner with a recorded time, one Series per
# year. Built from the full `df` rather than from a loop variable left over
# from an earlier cell, so the chart does not depend on which gender that
# loop happened to visit last.
finished_times = df.loc[df['resultTimeSec'].notnull(), ['year', 'resultTimeSec']]
by_years_speed = [
    finished_times.loc[finished_times['year'] == year, 'resultTimeSec']
    for year in years_list
]

plt.ylim(0, max_finish_time_sec)
# Whiskers span the full data range (0th..100th percentile); this replaces
# the string form whis='range', which newer matplotlib no longer accepts.
chart_times = plt.boxplot(by_years_speed, whis=[0, 100], widths=item_width)

plt.xticks(years_index, years_list.values, alpha=labels_text_alpha)





[2013 2014 2015 2016]


([<matplotlib.axis.XTick at 0x1539028d0>,
  <matplotlib.axis.XTick at 0x153903550>,
  <matplotlib.axis.XTick at 0x154425cc0>,
  <matplotlib.axis.XTick at 0x1544f8550>],
 <a list of 4 Text xticklabel objects>)