# 120 Years of Olympic History - EDA 
![](https://i.imgur.com/zLF5Gj2.jpg)

Historical data on the modern Olympic Games, from Athens 1896 to Rio 2016. Each row corresponds to an individual athlete competing in an individual event, including the athlete's name, sex, age, height, weight, country, and medal, and the event's name, sport, games, year, and city.

ID : Unique number for each athlete<br>
Name : Athlete's name<br>
Sex : Male (M) or Female (F)<br>
Age : Integer<br>
Height : In centimeters<br>
Weight : In kilograms<br>
Team : Team name<br>
NOC : National Olympic Committee 3-letter code<br>
Games : Year and season<br>
Year : Integer<br>
Season : Summer or Winter<br>
City : Host city<br>
Sport : Sport<br>
Event : Event<br>
Medal : Gold, Silver, Bronze, or NA<br>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
from wordcloud import WordCloud
from datetime import datetime
import datetime
from collections import Counter

## Importing the Dataset

In [None]:
# Athlete-level results table: one row per athlete per event.
athlete_events = pd.read_csv(
    filepath_or_buffer="../input/120-years-of-olympic-history/athlete_events.csv"
)

In [None]:
# Lookup table keyed by NOC code; it is joined onto the athlete rows further down.
regions = pd.read_csv(
    filepath_or_buffer="../input/120-years-of-olympic-history/country_definitions.csv"
)

In [None]:
# Preview the raw athlete/event table (the notebook renders a cell's last expression).
athlete_events

In [None]:
# Preview the NOC lookup table.
regions

In [None]:
# (rows, columns) of the athlete/event table.
athlete_events.shape

In [None]:
# (rows, columns) of the NOC lookup table.
regions.shape

In [None]:
# Left join on NOC: every athlete row is kept, even when its NOC code has no
# matching row in `regions`.
athletes = pd.merge(athlete_events, regions, on="NOC", how="left")

In [None]:
# Capitalise the two columns brought in by the merge so they match the
# capitalised names of the athlete columns. The frame is modified in place.
column_renames = {"region": "Region", "notes": "Notes"}
athletes.rename(columns=column_renames, inplace=True)

In [None]:
# Preview the merged table, now carrying the Region and Notes columns.
athletes

In [None]:
# Print column dtypes, non-null counts and memory usage for the merged table.
athletes.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
athletes.describe()

Let's check for null values

In [None]:
# Number of missing values in each column.
athletes.isna().sum(axis=0)

## Gender Distribution

In [None]:
# Number of rows recorded for each sex.
athletes['Sex'].value_counts()

In [None]:
# Bar chart of row counts per sex, drawn on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=athletes, x='Sex', ax=ax);

## Age distribution

In [None]:
# Age histogram with a KDE overlay on a density scale.
# sns.distplot is deprecated in seaborn, so the equivalent histplot call is
# used instead; histplot skips missing ages on its own. A fixed bin count keeps
# the bars readable for integer-valued ages in a large sample.
plt.figure(figsize=(12, 8))
sns.histplot(data=athletes, x='Age', kde=True, stat='density', bins=50);

## Weight Distribution

In [None]:
# Weight histogram with a KDE overlay on a density scale.
# sns.distplot is deprecated in seaborn, so the equivalent histplot call is
# used instead; histplot skips missing weights on its own. A fixed bin count
# keeps the bars readable for a large sample.
plt.figure(figsize=(12, 8))
sns.histplot(data=athletes, x='Weight', kde=True, stat='density', bins=50);

## Height Distribution

In [None]:
# Height histogram with a KDE overlay on a density scale.
# sns.distplot is deprecated in seaborn, so the equivalent histplot call is
# used instead; histplot skips missing heights on its own. A fixed bin count
# keeps the bars readable for a large sample.
plt.figure(figsize=(12, 8))
sns.histplot(data=athletes, x='Height', kde=True, stat='density', bins=50);

## Height VS Weight

In [None]:
# Scatter of height against weight, with points coloured by sex.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=athletes, x='Height', y='Weight', hue='Sex', ax=ax)
ax.set_title("Height VS Weight");

## Teams Column

In [None]:
# Number of rows recorded for each team name.
athletes['Team'].value_counts()

In [None]:
# Build the word-cloud input: one token per row, with spaces inside a team name
# replaced by underscores so a multi-word name stays a single token.
# The previous `x != {}` guard was always true and never filtered anything, so a
# missing value would have crashed on `.replace`; missing values are now
# dropped explicitly and the replacement is vectorised.
team_tokens = athletes['Team'].dropna().str.replace(" ", "_", regex=False)
# Kept in its original shape (a list of one-element lists) for any later cell
# that still reads `team_list`.
team_list = [[token] for token in team_tokens]
text = ' '.join(team_tokens)

In [None]:
# Word cloud of the team tokens in `text`. collocations=False turns off
# bigram counting in WordCloud.
cloud_options = dict(max_font_size=None, background_color='white', collocations=False,
                     width=1200, height=1000)
word_cld = WordCloud(**cloud_options).generate(text)
plt.figure(figsize=(12, 8))
plt.imshow(word_cld)
plt.axis("off")
plt.title("Teams from Countries")
plt.show();

In [None]:
# value_counts() already returns counts in descending order, so the ten most
# frequent teams are simply its first ten entries. The extra sort_values() was
# redundant and could reorder teams with equal counts.
top_10_teams = athletes.Team.value_counts().head(10)
top_10_teams

In [None]:
# Raw count array behind the top-10 series (team names are in its index).
top_10_teams.values

In [None]:
# Bar chart of the ten most frequent team names and their row counts.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Top 10 Countries Participating")
sns.barplot(x=top_10_teams.index, y=top_10_teams, ax=ax);

## NOC (National Olympic Committee)

In [None]:
# Number of rows recorded for each NOC code.
athletes['NOC'].value_counts()

In [None]:
# Build the word-cloud input: one NOC code per row, joined by spaces. Any space
# inside a code is replaced by an underscore so it stays a single token.
# The previous `x != {}` guard was always true and never filtered anything, so a
# missing value would have crashed on `.replace`; missing values are now
# dropped explicitly and the replacement is vectorised.
noc_tokens = athletes['NOC'].dropna().str.replace(" ", "_", regex=False)
# Kept in its original shape (a list of one-element lists) for any later cell
# that still reads `noc_list`.
noc_list = [[token] for token in noc_tokens]
new_text = ' '.join(noc_tokens)

In [None]:
# Word cloud of the NOC codes in `new_text`. collocations=False turns off
# bigram counting in WordCloud.
plt.figure(figsize=(12, 8))
word_cld = WordCloud(max_font_size=None, background_color='white', collocations=False,
                     width=1200, height=1000).generate(new_text)
plt.imshow(word_cld)
# This cloud shows NOC codes, not team names; the old title was copied from
# the team word cloud.
plt.title("National Olympic Committees (NOC)")
plt.axis("off")
plt.show();

## Games

In [None]:
# Number of rows recorded for each edition of the Games (year + season).
athletes['Games'].value_counts()

In [None]:
# Row counts per edition of the Games; tick labels are rotated so the many
# labels do not overlap.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=athletes, x='Games', ax=ax)
plt.xticks(rotation=90, fontsize=12);

## Year

In [None]:
# Number of rows recorded for each year.
athletes['Year'].value_counts()

In [None]:
# Row counts per year; tick labels are rotated so the many labels do not overlap.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=athletes, x='Year', ax=ax)
plt.xticks(rotation=90, fontsize=12);

## Season

In [None]:
# Number of rows recorded for each season (Summer / Winter).
athletes['Season'].value_counts()

In [None]:
# Bar chart of row counts per season.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=athletes, x='Season', ax=ax);

## City 

In [None]:
# Number of rows recorded for each host city.
athletes['City'].value_counts()

In [None]:
# Row counts per host city; tick labels are rotated so the many labels do not overlap.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=athletes, x='City', ax=ax)
plt.xticks(rotation=90, fontsize=12);

## Sport

In [None]:
# Number of rows recorded for each sport.
athletes['Sport'].value_counts()

In [None]:
# Row counts per sport on a larger canvas; tick labels are rotated so the many
# labels do not overlap.
fig, ax = plt.subplots(figsize=(15, 12))
sns.countplot(data=athletes, x='Sport', ax=ax)
plt.xticks(rotation=90, fontsize=12);

## Event

In [None]:
# Number of rows recorded for each event.
athletes['Event'].value_counts()

## Medal

In [None]:
# Number of rows per medal type; value_counts() leaves out rows with no medal (NaN).
athletes['Medal'].value_counts()

In [None]:
# Row counts per medal type, split into side-by-side bars by sex.
fig, ax = plt.subplots(figsize=(15, 12))
sns.countplot(data=athletes, x='Medal', hue='Sex', ax=ax);

In [None]:
# Five regions with the most gold-medal rows.
gold_medals = athletes[athletes.Medal == "Gold"]
# Name the index explicitly before reset_index(): the column produced from a
# value_counts() index is called 'index' on older pandas but 'Region' on
# pandas >= 2.0, so relying on 'index' breaks on newer versions.
top_countries_gold = (
    gold_medals.Region.value_counts()
    .head(5)
    .rename_axis("Region")
    .reset_index(name="Medal")
)
# Use the axes-level barplot so the chart lands on the figure created here.
# catplot is figure-level: it opened its own figure and left this one empty.
plt.figure(figsize=(15, 12))
sns.barplot(x="Region", y="Medal", data=top_countries_gold)
plt.title("Gold Medals");

In [None]:
# Five regions with the most silver-medal rows.
silver_medals = athletes[athletes.Medal == "Silver"]
# Name the index explicitly before reset_index(): the column produced from a
# value_counts() index is called 'index' on older pandas but 'Region' on
# pandas >= 2.0, so relying on 'index' breaks on newer versions.
top_countries_silver = (
    silver_medals.Region.value_counts()
    .head(5)
    .rename_axis("Region")
    .reset_index(name="Medal")
)
# Use the axes-level barplot so the chart lands on the figure created here.
# catplot is figure-level: it opened its own figure and left this one empty.
plt.figure(figsize=(15, 12))
sns.barplot(x="Region", y="Medal", data=top_countries_silver)
plt.title("Silver Medals");

In [None]:
# Five regions with the most bronze-medal rows.
bronze_medals = athletes[athletes.Medal == "Bronze"]
# Name the index explicitly before reset_index(): the column produced from a
# value_counts() index is called 'index' on older pandas but 'Region' on
# pandas >= 2.0, so relying on 'index' breaks on newer versions.
top_countries_bronze = (
    bronze_medals.Region.value_counts()
    .head(5)
    .rename_axis("Region")
    .reset_index(name="Medal")
)
# Use the axes-level barplot so the chart lands on the figure created here.
# catplot is figure-level: it opened its own figure and left this one empty.
plt.figure(figsize=(15, 12))
sns.barplot(x="Region", y="Medal", data=top_countries_bronze)
plt.title("Bronze Medals");

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# The frame also holds text columns (Name, Sex, Team, ...). DataFrame.corr()
# no longer drops those silently on pandas >= 2.0 and raises instead, so the
# numeric columns are selected explicitly; this works on older pandas as well.
plt.figure(figsize=(18, 12))
numeric_columns = athletes.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, cmap='Blues');