In [1]:
# 120 years of Olympics data: analysis using Python. This notebook covers:
# Country Demographics
# Age Demographics of Athletes
# Gender Demographics of Athletes, especially women's participation over the years
# Medal Demographics


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [2]:
# Directory holding the dataset CSVs; edit this one line to run against a
# different copy of the data.
DATA_DIR = '../input/120-years-of-olympic-history-athletes-and-results'

# Load the athlete events table and the NOC regions table.
athletes = pd.read_csv(f'{DATA_DIR}/athlete_events.csv')
regions = pd.read_csv(f'{DATA_DIR}/noc_regions.csv')

In [3]:
# Preview the first rows of the table loaded from noc_regions.csv.
regions.head()

In [4]:
# Preview the first rows of the table loaded from athlete_events.csv.
athletes.head()

In [5]:
# Attach the region lookup columns to every athlete row.
# A left join on the NOC code keeps all athlete rows, including those whose
# NOC has no match in the regions table.
athletes_df = pd.merge(athletes, regions, on='NOC', how='left')
athletes_df.head()

In [6]:
# (rows, columns) of the merged frame.
athletes_df.shape

In [7]:
# Capitalise the two lower-case columns ('region', 'notes') so they follow the
# same naming as the other columns used below (Team, Age, Sex, ...).
# The result is reassigned rather than renamed with inplace=True, so the frame
# displayed by earlier cells is never mutated behind the reader's back.
athletes_df = athletes_df.rename(columns={'region': 'Region', 'notes': 'Notes'})
athletes_df.head()

In [8]:
# Column dtypes and non-null counts for the merged frame.
athletes_df.info()

In [9]:
# Summary statistics for the merged frame.
athletes_df.describe()

In [10]:
# Null values: flag, per column, whether it contains at least one missing entry.
nan_values = athletes_df.isnull()
nan_columns = nan_values.any(axis=0)
nan_columns

In [11]:
# Number of missing entries in each column.
athletes_df.isna().sum(axis=0)

In [12]:
# Collect the names of the columns that have at least one missing entry.
nan_values = athletes_df.isna()
nan_columns = nan_values.any()

# Keep only the True entries of the per-column flag; their index labels are the column names.
columns_with_nan = list(nan_columns[nan_columns].index)
print(columns_with_nan)

In [13]:
# India: first five rows whose Team is exactly "India".
athletes_df.loc[athletes_df['Team'] == 'India'].head(5)

In [14]:
# Japan: first five rows whose Team is exactly "Japan".
athletes_df.loc[athletes_df['Team'] == 'Japan'].head(5)

In [15]:
# Top participating countries: the ten Team values with the most rows.
team_counts = athletes_df['Team'].value_counts()
top_10_countries = team_counts.sort_values(ascending=False).head(10)
top_10_countries

In [16]:
# Visualization: bar chart of the ten teams with the most rows.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Top 10 Participation by Country')
sns.barplot(x=top_10_countries.index, y=top_10_countries, palette='Set2', ax=ax);

In [17]:
# Age demographics: histogram of athlete ages.
# Bin edges run from 10 to 78 in steps of 2, so ages outside that range are not drawn.
age_bins = np.arange(10, 80, 2)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Age Demographics of the Athletes")
ax.set_xlabel('Age')
ax.set_ylabel('Number of Athletes')
ax.hist(athletes_df['Age'], bins=age_bins, color='blue', edgecolor='white');

In [18]:
# Distinct sports that appear in Winter-season rows.
is_winter = athletes_df['Season'] == 'Winter'
winter_sports = athletes_df.loc[is_winter, 'Sport'].unique()
winter_sports

In [19]:
# Distinct sports that appear in Summer-season rows.
is_summer = athletes_df['Season'] == 'Summer'
summer_sports = athletes_df.loc[is_summer, 'Sport'].unique()
summer_sports

In [20]:
# Gender demographics: number of rows per value of the Sex column.
gender_counts = athletes_df['Sex'].value_counts()
gender_counts

In [21]:
# Pie chart of the share of rows per Sex value, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Male vs Female Athletes')
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=150,
);


In [22]:
# Female demographics: number of female Summer-season rows per year.
is_female_summer = (athletes_df['Sex'] == 'F') & (athletes_df['Season'] == 'Summer')

# After the group-by count, the 'Sex' column holds the per-year row count.
female_athletes = (
    athletes_df.loc[is_female_summer, ['Sex', 'Year']]
    .groupby('Year')
    .count()
    .reset_index()
)
female_athletes.tail()

In [23]:
# All columns for female rows of the Summer season; used by the plots below.
summer_women_mask = (athletes_df['Sex'] == 'F') & (athletes_df['Season'] == 'Summer')
female_olympics = athletes_df.loc[summer_women_mask]


In [24]:
# Total medals: number of rows per medal type (rows without a medal are not counted).
athletes_df['Medal'].value_counts()

In [25]:
# Bar per year showing how many female Summer-season rows the data holds.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=female_olympics, x='Year', palette="Spectral", ax=ax)
ax.set_title('Women Participation Demographics')

In [26]:
# Line plot of the yearly number of female Summer-season rows.
# value_counts yields a (Year, Sex) multi-index; taking the 'F' cross-section
# on the second level leaves a series indexed by Year.
part = female_olympics.groupby('Year')['Sex'].value_counts()
fig, ax = plt.subplots(figsize=(20, 10))
part.xs('F', level=1).plot(ax=ax)
ax.set_title('Women participation Over the time')

In [27]:
# Medal demographics: number of rows per medal type.
athletes_df['Medal'].value_counts()

In [28]:
# Gold-medal rows whose Age is a finite number (drops rows with a missing age).
won_gold = athletes_df['Medal'] == 'Gold'
age_known = np.isfinite(athletes_df['Age'])
goldmedals = athletes_df[won_gold & age_known]
goldmedals.head()


In [41]:
# Number of gold-medal rows where the athlete was older than 60.
goldmedals.loc[goldmedals['Age'] > 60, 'ID'].count()

In [50]:
# Sport of every gold-medal row where the athlete was older than 30, and how many there are.
sportevents = goldmedals.loc[goldmedals['Age'] > 30, 'Sport']
sportevents.count()

In [51]:
# Gold medals country demographics: the six regions with the most gold-medal rows.
# rename_axis pins the label column to 'Region'. Without it, reset_index names
# that column 'index' on pandas < 2.0 and 'Region' on pandas >= 2.0, so the
# displayed table would differ between versions.
(
    goldmedals.Region.value_counts()
    .rename_axis('Region')
    .reset_index(name='Medal')
    .head(6)
)


In [70]:
# Bar chart of the 20 regions with the most gold-medal rows.
# The label column is named explicitly via rename_axis: value_counts().reset_index()
# calls it 'index' on pandas < 2.0 but 'Region' on pandas >= 2.0, so plotting
# x="index" fails on newer pandas because that column does not exist.
total_goldmedals = (
    goldmedals.Region.value_counts()
    .rename_axis('Region')
    .reset_index(name='Medal')
    .head(20)
)
g = sns.catplot(x="Region", y="Medal", data=total_goldmedals, height=20, kind="bar", palette="icefire")
g.despine(left=True)
g.set_xlabels("Top 20 countries")
g.set_ylabels("Number of Medals")
plt.title('Gold Medals by Country')