In [282]:
import pandas as pd
import numpy as np

### In this notebook, we will be looking into how age & BMI affects the success of athletes in Olympic sports.

$BMI = \frac{Weight}{Height^2}$
Weight in kg, Height in m

In [283]:
df = pd.read_csv(r'C:\Users\Prakansh Mishra\Desktop\archive\athlete_events.csv')
df["Age"].fillna((df["Age"].mean()), inplace = True)
df["Height"].fillna((df["Height"].mean()), inplace = True)
df["Weight"].fillna((df["Weight"].mean()), inplace = True)
df['Medal'].fillna('None', inplace = True)
df.drop(columns=list(set(df.columns)-set(["ID","Sex","Age", "Height", "Weight", "Sport", "Medal"])),inplace=True)
df['isMedal'] = df["Medal"].apply(lambda x: 1 if x != 'None' else 0)
df['Height']/=100.0
df['BMI'] = df['Weight']/(df['Height']**2)
df.drop(columns="Medal",inplace=True)
medals_df = df[df["isMedal"] == 1]
df.shape

(271116, 8)

In [284]:
male = df[df["Sex"] == 'M'].drop(columns="Sex")
female = df[df["Sex"] == 'F'].drop(columns="Sex")

In [285]:
male.shape, female.shape

((196594, 7), (74522, 7))

In [286]:
male_sport_count = male.groupby('Sport').count().drop(columns=['Age','Height','Weight','isMedal','BMI'])
male_sport_count.columns = ['Count']
male_sport_avg = male.groupby('Sport').mean().drop(columns=['ID','isMedal','Height','Weight'])
male_sport_std = male.groupby('Sport').apply(np.std).drop(columns=['ID','isMedal','Height','Weight'])

female_sport_count = female.groupby('Sport').count().drop(columns=['Age','Height','Weight','isMedal','BMI'])
female_sport_count.columns = ['Count']
female_sport_avg = female.groupby('Sport').mean().drop(columns=['ID','isMedal','Height','Weight'])
female_sport_std = female.groupby('Sport').apply(np.std).drop(columns=['ID','isMedal','Height','Weight'])

In [287]:
medals_male = medals_df[medals_df['Sex'] == 'M'].drop(columns="Sex")
medals_female = medals_df[medals_df['Sex'] == 'F'].drop(columns="Sex")

## ANALYSIS FOR MALE ATHLETES

In [288]:
medals_male_sport_avg = medals_male.groupby('Sport').mean().drop(columns=['ID','isMedal','Height','Weight'])
medals_male_sport_std = medals_male.groupby('Sport').apply(np.std).drop(columns=['ID','isMedal','Height','Weight'])

In [289]:
medals_male_sport_avg_std = medals_male_sport_avg.join(medals_male_sport_std, lsuffix = '_sample_mean', rsuffix = '_sample_std')
male_sport_avg.columns = [i+'_population_mean' for i in male_sport_avg.columns]
comb_male_sport = male_sport_avg.join(medals_male_sport_avg_std)
comb_male_sport = comb_male_sport.join(male_sport_count)
comb_male_sport = comb_male_sport[comb_male_sport['Age_sample_mean'] != comb_male_sport['Age_population_mean']]

In [290]:
comb_male_sport.head()

Unnamed: 0_level_0,Age_population_mean,BMI_population_mean,Age_sample_mean,BMI_sample_mean,Age_sample_std,BMI_sample_std,Count
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alpine Skiing,23.784098,24.230461,25.344186,24.931328,3.656488,2.065457,5431
Archery,28.89612,23.846687,32.212835,23.707732,11.765935,2.444968,1319
Art Competitions,43.301788,23.027195,41.970143,23.036149,11.079648,0.281314,3201
Athletics,25.275115,22.74472,24.973945,23.051348,3.941212,3.197419,26958
Badminton,26.315202,23.004234,26.309524,22.751405,3.373483,1.25283,717


### Null Hypothesis (Age, Male)
$H_0$ = Success (getting a medal) of an individual in a sport does not depend on the age.
### Alternate Hypothesis (Age, Male)
$H_1$ = Success (getting a medal) of an individual in a sport depends on the age.

In [291]:
from scipy.stats import norm as n

In [292]:
age_male_sport = comb_male_sport.drop(columns=['BMI_sample_mean', 'BMI_sample_std', 'BMI_population_mean'])
age_male_sport['t_test'] = (age_male_sport['Age_sample_mean']-age_male_sport['Age_population_mean'])*np.sqrt(age_male_sport['Count'])/age_male_sport['Age_sample_std']
age_male_sport['p_value'] = None
for row in age_male_sport.index:
    if age_male_sport.loc[row].Age_population_mean < age_male_sport.loc[row].Age_sample_mean:
        age_male_sport['p_value'][row] = 1-n.cdf(age_male_sport['t_test'][row])
    else:
        age_male_sport['p_value'][row] = n.cdf(age_male_sport['t_test'][row])
age_male_sport

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Age_population_mean,Age_sample_mean,Age_sample_std,Count,t_test,p_value
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alpine Skiing,23.784098,25.344186,3.656488,5431,31.443053,0.0
Archery,28.89612,32.212835,11.765935,1319,10.237738,0.0
Art Competitions,43.301788,41.970143,11.079648,3201,-6.799945,5.23295e-12
Athletics,25.275115,24.973945,3.941212,26958,-12.546582,2.07521e-36
Badminton,26.315202,26.309524,3.373483,717,-0.045072,0.482025
Baseball,26.239997,26.252976,4.40694,894,0.088063,0.464913
Basketball,25.263897,25.209773,3.653753,3280,-0.848385,0.198112
Beach Volleyball,29.895833,29.944444,3.879751,288,0.212632,0.415807
Biathlon,26.983682,27.011628,4.284896,3030,0.359002,0.359797
Bobsleigh,29.08511,29.874636,5.379105,2915,7.924564,1.11022e-15


In [293]:
reject_hypo_age_male = age_male_sport[age_male_sport['p_value'] <= 0.05]

#### We can thus infer that higher aged male athletes perform better in the following sports. An explanation for this could be that experienced athletes might have an edge in these sports.

In [294]:
print(np.array(reject_hypo_age_male[reject_hypo_age_male['Age_sample_mean'] > reject_hypo_age_male['Age_population_mean']].index))

['Alpine Skiing' 'Archery' 'Bobsleigh' 'Canoeing' 'Cross Country Skiing'
 'Curling' 'Equestrianism' 'Fencing' 'Figure Skating' 'Football'
 'Gymnastics' 'Handball' 'Luge' 'Modern Pentathlon' 'Nordic Combined'
 'Rowing' 'Sailing' 'Shooting' 'Ski Jumping' 'Speed Skating' 'Swimming'
 'Tennis' 'Volleyball' 'Water Polo' 'Wrestling']


#### We can also infer that lower aged male athletes perform better in the following sports. An explanation for this could be that younger athletes might have a fitness/agility advantage in these.

In [295]:
print(np.array(reject_hypo_age_male[reject_hypo_age_male['Age_sample_mean'] < reject_hypo_age_male['Age_population_mean']].index))

['Art Competitions' 'Athletics' 'Boxing' 'Cycling' 'Diving' 'Golf' 'Judo'
 'Snowboarding' 'Table Tennis' 'Taekwondo' 'Trampolining' 'Triathlon']


#### We can accept the null hypothesis that success does not depend on age for male athletes in the following sports. 

In [296]:
print(np.array(age_male_sport[age_male_sport['p_value'] > 0.05].index))

['Badminton' 'Baseball' 'Basketball' 'Beach Volleyball' 'Biathlon'
 'Croquet' 'Freestyle Skiing' 'Hockey' 'Ice Hockey' 'Jeu De Paume'
 'Military Ski Patrol' 'Motorboating' 'Polo' 'Racquets' 'Roque'
 'Rugby Sevens' 'Short Track Speed Skating' 'Skeleton' 'Tug-Of-War'
 'Weightlifting']


### Null Hypothesis (BMI, Male)
$H_0$ = Success (getting a medal) of an individual in a sport does not depend on the BMI.
### Alternate Hypothesis (BMI, Male)
$H_1$ = Success (getting a medal) of an individual in a sport depends on the BMI.

In [297]:
bmi_male_sport = comb_male_sport.drop(columns=['Age_sample_mean', 'Age_sample_std', 'Age_population_mean'])
bmi_male_sport['t_test'] = (bmi_male_sport['BMI_sample_mean']-bmi_male_sport['BMI_population_mean'])*np.sqrt(bmi_male_sport['Count'])/bmi_male_sport['BMI_sample_std']
bmi_male_sport['p_value'] = None
for row in bmi_male_sport.index:
    if bmi_male_sport.loc[row].BMI_population_mean < bmi_male_sport.loc[row].BMI_sample_mean:
        bmi_male_sport['p_value'][row] = 1-n.cdf(bmi_male_sport['t_test'][row])
    else:
        bmi_male_sport['p_value'][row] = n.cdf(bmi_male_sport['t_test'][row])
bmi_male_sport

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,BMI_population_mean,BMI_sample_mean,BMI_sample_std,Count,t_test,p_value
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alpine Skiing,24.230461,24.931328,2.065457,5431,25.0069,0.0
Archery,23.846687,23.707732,2.444968,1319,-2.064069,0.0195056
Art Competitions,23.027195,23.036149,0.281314,3201,1.800822,0.0358654
Athletics,22.74472,23.051348,3.197419,26958,15.74548,0.0
Badminton,23.004234,22.751405,1.25283,717,-5.403719,3.26365e-08
Baseball,25.535373,25.718744,2.314637,894,2.368742,0.00892435
Basketball,23.559988,23.942328,1.768276,3280,12.38331,0.0
Beach Volleyball,23.879613,24.038344,1.268315,288,2.123876,0.0168403
Biathlon,22.699528,22.61111,1.154409,3030,-4.21605,1.2431e-05
Bobsleigh,25.990098,26.098548,3.061436,2915,1.912602,0.0278995


In [298]:
reject_hypo_bmi_male = bmi_male_sport[bmi_male_sport['p_value'] <= 0.05]

#### We can thus infer that male athletes with higher BMI perform better in the following sports. An explanation for this could be that athletes with bulky body type may have an advantage in these sports.

In [299]:
print(np.array(reject_hypo_bmi_male[reject_hypo_bmi_male['BMI_sample_mean'] > reject_hypo_bmi_male['BMI_population_mean']].index))

['Alpine Skiing' 'Art Competitions' 'Athletics' 'Baseball' 'Basketball'
 'Beach Volleyball' 'Bobsleigh' 'Boxing' 'Canoeing' 'Cycling' 'Fencing'
 'Figure Skating' 'Football' 'Freestyle Skiing' 'Gymnastics' 'Handball'
 'Hockey' 'Ice Hockey' 'Jeu De Paume' 'Judo' 'Luge' 'Rowing'
 'Rugby Sevens' 'Snowboarding' 'Speed Skating' 'Swimming' 'Triathlon'
 'Volleyball' 'Water Polo' 'Weightlifting' 'Wrestling']


#### We can also infer that lower aged male athletes perform better in the following sports. An explanation for this could be that athletes with skinny body type may have an advantage in these sports.

In [300]:
print(np.array(reject_hypo_bmi_male[reject_hypo_bmi_male['BMI_sample_mean'] < reject_hypo_bmi_male['BMI_population_mean']].index))

['Archery' 'Badminton' 'Biathlon' 'Croquet' 'Cross Country Skiing'
 'Diving' 'Golf' 'Modern Pentathlon' 'Motorboating' 'Nordic Combined'
 'Sailing' 'Shooting' 'Skeleton' 'Ski Jumping' 'Table Tennis' 'Taekwondo']


#### We can accept the null hypothesis that success does not depend on age for male athletes in the following sports. 

In [301]:
print(np.array(bmi_male_sport[bmi_male_sport['p_value'] > 0.05].index))

['Curling' 'Equestrianism' 'Military Ski Patrol' 'Polo' 'Racquets'
 'Short Track Speed Skating' 'Tennis' 'Trampolining' 'Tug-Of-War']


## ANALYSIS FOR FEMALE ATHLETES

In [302]:
medals_female_sport_avg = medals_female.groupby('Sport').mean().drop(columns=['ID','isMedal','Height','Weight'])
medals_female_sport_std = medals_female.groupby('Sport').apply(np.std).drop(columns=['ID','isMedal','Height','Weight'])

In [303]:
medals_female_sport_avg_std = medals_female_sport_avg.join(medals_female_sport_std, lsuffix = '_sample_mean', rsuffix = '_sample_std')
female_sport_avg.columns = [i+'_population_mean' for i in female_sport_avg.columns]
comb_female_sport = female_sport_avg.join(medals_female_sport_avg_std)
comb_female_sport = comb_female_sport.join(female_sport_count)
comb_female_sport = comb_female_sport[comb_female_sport['Age_sample_mean'] != comb_female_sport['Age_population_mean']]

In [304]:
comb_male_sport.head()

Unnamed: 0_level_0,Age_population_mean,BMI_population_mean,Age_sample_mean,BMI_sample_mean,Age_sample_std,BMI_sample_std,Count
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alpine Skiing,23.784098,24.230461,25.344186,24.931328,3.656488,2.065457,5431
Archery,28.89612,23.846687,32.212835,23.707732,11.765935,2.444968,1319
Art Competitions,43.301788,23.027195,41.970143,23.036149,11.079648,0.281314,3201
Athletics,25.275115,22.74472,24.973945,23.051348,3.941212,3.197419,26958
Badminton,26.315202,23.004234,26.309524,22.751405,3.373483,1.25283,717


### Null Hypothesis (Age, Female)
$H_0$ = Success (getting a medal) of an individual in a sport does not depend on the age.
### Alternate Hypothesis (Age, Female)
$H_1$ = Success (getting a medal) of an individual in a sport depends on the age.

In [305]:
age_female_sport = comb_female_sport.drop(columns=['BMI_sample_mean', 'BMI_sample_std', 'BMI_population_mean'])
age_female_sport['t_test'] = (age_female_sport['Age_sample_mean']-age_female_sport['Age_population_mean'])*np.sqrt(age_female_sport['Count'])/age_female_sport['Age_sample_std']
age_female_sport['p_value'] = None
for row in age_female_sport.index:
    if age_female_sport.loc[row].Age_population_mean < age_female_sport.loc[row].Age_sample_mean:
        age_female_sport['p_value'][row] = 1-n.cdf(age_female_sport['t_test'][row])
    else:
        age_female_sport['p_value'][row] = n.cdf(age_female_sport['t_test'][row])
age_female_sport

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Age_population_mean,Age_sample_mean,Age_sample_std,Count,t_test,p_value
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alpine Skiing,22.334609,23.333333,3.546693,3398,16.414737,0.0
Archery,26.499083,26.955016,10.006542,1015,1.45161,0.073305
Art Competitions,40.61094,40.959718,12.596794,377,0.537601,0.295426
Athletics,24.935925,25.129064,4.154003,11666,5.021855,2.55874e-07
Badminton,25.047297,24.452381,3.072167,740,-5.267772,6.90447e-08
Basketball,25.517516,25.575064,4.215576,1256,0.4838,0.314264
Beach Volleyball,28.315217,29.555556,4.374449,276,4.710547,1.23527e-06
Biathlon,25.799249,26.713333,3.468596,1863,11.374682,0.0
Bobsleigh,27.832168,29.083333,3.200477,143,4.674854,1.47082e-06
Boxing,26.638889,26.166667,4.624812,72,-0.8664,0.193135


In [306]:
reject_hypo_age_female = age_female_sport[age_female_sport['p_value'] <= 0.05]

#### We can thus infer that higher aged female athletes perform better in the following sports. An explanation for this could be that experienced athletes might have an edge in these sports.

In [307]:
print(np.array(reject_hypo_age_female[reject_hypo_age_female['Age_sample_mean'] > reject_hypo_age_female['Age_population_mean']].index))

['Alpine Skiing' 'Athletics' 'Beach Volleyball' 'Biathlon' 'Bobsleigh'
 'Canoeing' 'Cross Country Skiing' 'Curling' 'Cycling' 'Figure Skating'
 'Football' 'Freestyle Skiing' 'Gymnastics' 'Hockey' 'Ice Hockey' 'Luge'
 'Modern Pentathlon' 'Rhythmic Gymnastics' 'Rowing' 'Sailing'
 'Ski Jumping' 'Softball' 'Speed Skating' 'Swimming'
 'Synchronized Swimming' 'Tennis' 'Triathlon' 'Volleyball' 'Wrestling']


#### We can also infer that lower aged female athletes perform better in the following sports. An explanation for this could be that younger athletes might have a fitness/agility advantage in these.

In [308]:
print(np.array(reject_hypo_age_female[reject_hypo_age_female['Age_sample_mean'] < reject_hypo_age_female['Age_population_mean']].index))

['Badminton' 'Diving' 'Equestrianism' 'Handball' 'Rugby Sevens' 'Shooting'
 'Short Track Speed Skating' 'Table Tennis' 'Taekwondo' 'Trampolining'
 'Weightlifting']


#### We can accept the null hypothesis that success does not depend on age for female athletes in the following sports. 

In [309]:
print(np.array(age_female_sport[age_female_sport['p_value'] > 0.05].index))

['Archery' 'Art Competitions' 'Basketball' 'Boxing' 'Fencing' 'Golf'
 'Judo' 'Skeleton' 'Snowboarding' 'Water Polo']


### Null Hypothesis (BMI, Female)
$H_0$ = Success (getting a medal) of an individual in a sport does not depend on the BMI.
### Alternate Hypothesis (BMI, Female)
$H_1$ = Success (getting a medal) of an individual in a sport depends on the BMI.

In [310]:
bmi_female_sport = comb_female_sport.drop(columns=['Age_sample_mean', 'Age_sample_std', 'Age_population_mean'])
bmi_female_sport['t_test'] = (bmi_female_sport['BMI_sample_mean']-bmi_female_sport['BMI_population_mean'])*np.sqrt(bmi_female_sport['Count'])/bmi_female_sport['BMI_sample_std']
bmi_female_sport['p_value'] = None
for row in bmi_female_sport.index:
    if bmi_female_sport.loc[row].BMI_population_mean < bmi_female_sport.loc[row].BMI_sample_mean:
        bmi_female_sport['p_value'][row] = 1-n.cdf(bmi_female_sport['t_test'][row])
    else:
        bmi_female_sport['p_value'][row] = n.cdf(bmi_female_sport['t_test'][row])
bmi_female_sport

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,BMI_population_mean,BMI_sample_mean,BMI_sample_std,Count,t_test,p_value
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alpine Skiing,22.541945,22.684517,1.808699,3398,4.594942,2.16435e-06
Archery,22.339717,22.443119,1.996808,1015,1.649778,0.0494942
Art Competitions,23.009576,22.997319,7.105427e-15,377,-33493200000000.0,0.0
Athletics,21.035795,21.330406,3.038567,11666,10.47226,0.0
Badminton,21.868087,21.83347,1.643533,740,-0.5729593,0.283336
Basketball,22.042478,21.936222,1.879417,1256,-2.00365,0.0225538
Beach Volleyball,21.404029,21.337982,1.05895,276,-1.036165,0.150063
Biathlon,20.727202,20.766774,1.182247,1863,1.444706,0.0742702
Bobsleigh,24.291185,24.636003,2.070677,143,1.991342,0.0232217
Boxing,22.041693,21.717206,2.15384,72,-1.278351,0.100563


In [311]:
reject_hypo_bmi_female = bmi_female_sport[bmi_female_sport['p_value'] <= 0.05]

#### We can thus infer that female athletes with higher BMI perform better in the following sports. An explanation for this could be that athletes with bulky body type may have an advantage in these sports.

In [312]:
print(np.array(reject_hypo_bmi_female[reject_hypo_bmi_female['BMI_sample_mean'] > reject_hypo_bmi_female['BMI_population_mean']].index))

['Alpine Skiing' 'Archery' 'Athletics' 'Bobsleigh' 'Cross Country Skiing'
 'Curling' 'Cycling' 'Golf' 'Gymnastics' 'Ice Hockey' 'Judo' 'Rowing'
 'Rugby Sevens' 'Shooting' 'Short Track Speed Skating' 'Softball'
 'Swimming' 'Tennis' 'Volleyball' 'Water Polo']


#### We can also infer that lower aged female athletes perform better in the following sports. An explanation for this could be that athletes with skinny body type may have an advantage in these sports.

In [313]:
print(np.array(reject_hypo_bmi_female[reject_hypo_bmi_female['BMI_sample_mean'] < reject_hypo_bmi_female['BMI_population_mean']].index))

['Art Competitions' 'Basketball' 'Diving' 'Fencing' 'Hockey' 'Luge'
 'Modern Pentathlon' 'Rhythmic Gymnastics' 'Sailing' 'Snowboarding'
 'Synchronized Swimming' 'Table Tennis' 'Triathlon']


#### We can accept the null hypothesis that success does not depend on age for female athletes in the following sports. 

In [314]:
print(np.array(bmi_female_sport[bmi_female_sport['p_value'] > 0.05].index))

['Badminton' 'Beach Volleyball' 'Biathlon' 'Boxing' 'Canoeing'
 'Equestrianism' 'Figure Skating' 'Football' 'Freestyle Skiing' 'Handball'
 'Skeleton' 'Ski Jumping' 'Speed Skating' 'Taekwondo' 'Trampolining'
 'Weightlifting' 'Wrestling']
