In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statistics
# Activate seaborn's default theme once, up front, before any figure is drawn.
sns.set_theme()

In [None]:
# Read the race results CSV and use its `Rank_Tot` column as the row index.
df = pd.read_csv(
    '../input/boston-marathon-2019/Dataset-Boston-2019.csv',
    index_col='Rank_Tot',
)
df

**Convert the `Result_hr` column to `datetime.time` objects**

In [None]:
# Parse `Result_hr` with pandas, then keep only the time-of-day component
# (`datetime.time` objects) in place of the original values.
# NOTE(review): this overwrites the column, so re-running the cell feeds the
# already-converted values back into pd.to_datetime -- confirm that is acceptable.
df['Result_hr'] = df['Result_hr'].pipe(pd.to_datetime).dt.time

In [None]:
# Columns kept for the analysis, in the order they should appear.
# `reindex` (unlike plain selection) does not raise on a missing name; it
# inserts an all-NaN column instead.
newCols = [
    'Age',
    'Gender',
    'Rank_Gender',
    'Country',
    'Country_code',
    'Result_hr',
    'Result_sec',
]
df = df.reindex(newCols, axis='columns')
df

### Age Distribution

In [None]:
# Mean and standard deviation of all runners' ages, shown in the plot title.
mu = df['Age'].mean()
sigma = np.std(df['Age'])
n_bins = 20

fig, ax = plt.subplots()
ax.hist(df['Age'], bins=n_bins, alpha=0.75)
# The mathtext part lives in a raw string: "\m" and "\s" are not valid Python
# escape sequences and trigger a warning in ordinary string literals. The
# newline stays in a normal literal so it is still interpreted as a line break.
ax.set_title(
    "Age Distribution of All Runners:\n"
    + rf"$\mu = ${round(mu, 2)}    $\sigma = ${round(sigma, 2)}"
)
ax.set_ylabel('Frequency/Count')
ax.set_xlabel('Age')
fig.tight_layout()
plt.show()

***Age Distribution of Male and Female Runners***

In [None]:
def _age_title(group, mean_age, std_age):
    """Return the two-line subplot title for one gender group.

    Line one names the group; line two shows the mean and standard deviation
    (rounded to two decimals) as mathtext. The mathtext lives in a raw string
    because "\\m" and "\\s" are not valid Python escape sequences.
    """
    return (
        f"Age Distribution of {group} Runners:\n"
        + rf"$\mu = ${round(mean_age, 2)}    $\sigma = ${round(std_age, 2)}"
    )


# Split the field by the `Gender` column; both frames are reused by later cells.
males = df[df['Gender'] == 'M']
females = df[df['Gender'] == 'F']

M_mu = males['Age'].mean()
M_sigma = np.std(males['Age'])

F_mu = females['Age'].mean()
F_sigma = np.std(females['Age'])

# Side-by-side histograms with a shared y-axis so the two counts are comparable.
fig, axs = plt.subplots(1, 2, figsize=(8, 4), sharey=True, tight_layout=True)

axs[0].hist(males['Age'], label='Male', bins=20)
axs[0].set_title(_age_title('Male', M_mu, M_sigma))
axs[0].set_xlabel('Age Range in Years')
axs[0].set_ylabel('Frequency')
axs[0].legend(loc='upper right')

axs[1].hist(females['Age'], alpha=.6, color='green', label='Female', bins=20)
axs[1].set_title(_age_title('Female', F_mu, F_sigma))
axs[1].set_xlabel('Age Range in Years')
axs[1].legend(loc='upper right')
plt.show()

### Breakdown of Male and Female Runners

In [None]:
# Share of runners per `Gender` value (fractions summing to 1), largest first.
gender = df['Gender'].value_counts(normalize=True)

# Pick each wedge colour from its label rather than its position, so a colour
# stays tied to the same gender even if the ordering of `gender` changes.
gender_colors = {'M': 'royalblue', 'F': 'forestgreen'}
wedge_colors = [gender_colors.get(label, 'grey') for label in gender.index]
# Leave the largest wedge in place and pull every other wedge slightly outwards;
# sized from the data so it always matches the number of wedges.
wedge_explode = [0.0 if position == 0 else 0.1 for position in range(len(gender))]

fig, ax = plt.subplots()
ax.pie(gender, explode=wedge_explode, labels=gender.index, autopct='%1.2f%%',
       shadow=True, colors=wedge_colors)
ax.axis('equal')
ax.set_title('Percentage of Male/Female Runners')
ax.legend()
plt.show()

### Top 5 Fielded Countries by Gender

In [None]:
def _plot_top_countries_pie(axis, counts, legend_title, title):
    """Draw one pie of runner counts per country code on `axis`.

    Wedges are labelled with the index of `counts` (the country codes); the
    legend lists the corresponding count for each wedge, in the same order.
    """
    axis.pie(counts, labels=counts.index)
    # Pair every wedge with its count explicitly instead of relying on the
    # implicit ordering of a labels-only legend() call.
    handles, _ = axis.get_legend_handles_labels()
    axis.legend(handles, counts.to_list(),
                title=legend_title,
                loc='center left',
                bbox_to_anchor=(1.1, 0, 0.5, 1))
    axis.set_title(title)


# Five most frequent country codes per gender (value_counts sorts descending).
female_field = females['Country_code'].value_counts().head(5)
male_field = males['Country_code'].value_counts().head(5)

fig, ax = plt.subplots(1, 2, figsize=(15, 4))

_plot_top_countries_pie(ax[0], male_field,
                        'Male Runners Fielded',
                        'Top 5 Countries Fielding Male Runners')
_plot_top_countries_pie(ax[1], female_field,
                        'Female Runners Fielded',
                        'Top 5 Countries Fielding Female Runners')

plt.tight_layout()
plt.show()

### Top 20 Finishes: Men's Division

In [None]:
# Country codes of the first 20 rows of `males`, tallied per country.
# NOTE(review): assumes `males` is already ordered by finishing rank -- confirm.
top20 = males.head(20)['Country_code'].value_counts()

fig, ax = plt.subplots(tight_layout=True)
ax.bar(top20.index, top20)
ax.set(
    title='Top 20 Finishes by Country: Mens Division',
    ylabel='Number of Runners Inside top 20',
    xlabel='Countries with top 20 Finishes',
)
plt.show()

### Percentage of Fielded Male Runners by Country Finishing in the Top 20
***50% of Ethiopian Men and 77.78% of Kenyan Men finished in top 20***

In [None]:
# Number of male runners entered per country code (all countries, not just the top 5).
male_field2 = males['Country_code'].value_counts()

# Fraction of each country's male runners that appear in `top20`.
# The division aligns on country code; countries with no top-20 finisher come
# out as NaN and are dropped. The ratio is given an explicit name and sorted as
# a Series so the ordering is by ratio regardless of how pandas names the
# value_counts() result or its index.
x = (
    top20.div(male_field2)
    .dropna()
    .rename('top20_share')
    .sort_values(ascending=False)
    .to_frame()
)
x

### Top 20 Finishes: Women's Division

In [None]:
# Country codes of the first 20 rows of `females`, tallied per country.
# NOTE(review): assumes `females` is already ordered by finishing rank -- confirm.
top20F = females.iloc[:20].Country_code.value_counts()


fig, ax = plt.subplots(tight_layout = True)
# Bar heights must come from the women's counts (`top20F`), matching the x labels.
ax.bar(top20F.index, top20F)
ax.set_title('Top 20 Finishes by Country: Women\'s Divison')
ax.set_ylabel('Number of Runners Inside top 20')
ax.set_xlabel('Countries with top 20 Finishes')
plt.show()

### Percentage of Fielded Female Runners by Country Finishing in the Top 20
***100% of Ethiopian Women and 71.43% of Kenyan Women finished in top 20***

In [None]:
# Number of female runners entered per country code (all countries, not just the top 5).
female_field2 = females['Country_code'].value_counts()

# Fraction of each country's female runners that appear in `top20F`.
# The division aligns on country code; countries with no top-20 finisher come
# out as NaN and are dropped. The ratio is given an explicit name and sorted as
# a Series so the ordering is by ratio regardless of how pandas names the
# value_counts() result or its index.
x = (
    top20F.div(female_field2)
    .dropna()
    .rename('top20_share')
    .sort_values(ascending=False)
    .to_frame()
)
x

### Observing Data of American Men
**It looks like Country_code "USA" also includes runners whose `Country` is not the United States.**

In [None]:
# Male runners whose country code is "USA", then a tally of the `Country`
# values that appear under that code.
usa_men = males[males['Country_code'].eq('USA')]
usa_men['Country'].value_counts()

In [None]:
# Keep only male runners whose country code is "USA" AND whose `Country`
# field is exactly "United States".
usa_men2 = males[
    males['Country_code'].eq('USA') & males['Country'].eq('United States')
]
usa_men2

In [None]:
# Left-closed age brackets: [18, 20), [20, 30), ..., [80, 90). Ages outside
# 18-89 fall in no bracket and get a missing `age_range`.
bins = [18, 20, 30, 40, 50, 60, 70, 80, 90]
ranges = ['U20', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89']

# `assign` returns a new frame, so the column is never written onto a filtered
# slice of `males` (which raises SettingWithCopyWarning), and re-running the
# cell simply recomputes the column.
usa_men2 = usa_men2.assign(
    age_range=pd.cut(usa_men2['Age'], bins=bins, labels=ranges, right=False)
)

In [None]:
# Runner count per age bracket, ordered by bracket rather than by frequency.
ages = usa_men2['age_range'].value_counts().sort_index()

fig, ax = plt.subplots(figsize = (8,4))
ax.bar(ages.index, height = ages.values)
ax.set_title('American Runners by Age Group')
ax.set_ylabel('Number of Runners')
ax.set_xlabel('Age Group')
# No legend here: the bars carry no label, so ax.legend() would have nothing
# to show and would only emit a "no artists with labels" warning.
fig.tight_layout()
plt.show()


### Time Distributions by Age Group

In [None]:
# One histogram-plus-KDE panel of `Result_sec` per age bracket, wrapped at
# four panels per row.
sns.displot(
    data=usa_men2,
    x='Result_sec',
    col='age_range',
    col_wrap=4,
    kde=True,
    color='r',
)
plt.show()

In [None]:
# Overlaid density of `Result_sec` per age bracket. `common_norm=False`
# normalises each bracket on its own, so small groups are not flattened by
# large ones.
fig, ax = plt.subplots()
sns.kdeplot(
    data=usa_men2,
    x='Result_sec',
    hue='age_range',
    fill=True,
    common_norm=False,
    alpha=0.5,
    linewidth=0,
    ax=ax,
)
ax.set_title('Finish Time Density by Age Group')
fig.tight_layout()
# Explicit show() keeps the Axes repr out of the cell output, like the other plots.
plt.show()


In [None]:
# Summary statistics of `Result_sec` per age bracket.
# Selecting the column before describe() avoids summarising every other numeric
# column only to discard it. `observed=False` keeps a row for empty brackets
# and makes the grouping behaviour explicit instead of relying on the
# version-dependent default for categorical keys.
ageStats = usa_men2.groupby('age_range', observed=False)['Result_sec'].describe()
ageStats