
# Greatest Olympians

In this project, we explore Olympic Games data to find the most decorated countries and athletes since 1896.

    
<font color='blue'>
   
Content:

    
1. [Summer Game Analysis](#1) 
    * [Highest Decorated Athletes](#11)
    * [Breakdown by Medal](#12)
    * [Medals by Country](#13)
    * [Best Male and Female Athletes](#14)
1. [Winter Game Analysis](#2)
    * [Athletes with the Highest Medal Type](#21)
    * [Medals by Country](#22)
    * [Best Male and Female Athletes](#23)
1. [Performance of USA Athletes](#3)
    
    

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of each file found beneath the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id=1></a><br>

# Summer Game Analysis

In [None]:
# Load the three CSV files in order: Summer results, Winter results, and the
# country dictionary that is later joined on its "Code" column.
summer, winter, countries = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/olympic-games/summer.csv",
        "/kaggle/input/olympic-games/winter.csv",
        "/kaggle/input/olympic-games/dictionary.csv",
    )
)

In [None]:
# Preview the first rows of the frame loaded from summer.csv.
summer.head()

In [None]:
# Preview the first rows of the frame loaded from winter.csv.
winter.head()

In [None]:
# Preview the first rows of the frame loaded from dictionary.csv.
countries.head()

In [None]:
# Names are stored as comma-separated parts; reverse the parts, join them
# with a space, and title-case the result.
summer_name_parts = summer["Athlete"].str.split(', ')
summer["Athlete"] = summer_name_parts.str[::-1].str.join(' ').str.title()
summer.head()

In [None]:
# Names are stored as comma-separated parts; reverse the parts, join them
# with a space, and title-case the result.
winter_name_parts = winter["Athlete"].str.split(', ')
winter["Athlete"] = winter_name_parts.str[::-1].str.join(' ').str.title()
winter.head()

In [None]:
# Attach the country dictionary to every medal row. Both frames have a
# "Country" column, so the merge suffixes them: Country_x is the value from
# the results frame (the join key matched against "Code") and Country_y is
# the value from the dictionary.
summer_columns = ['Year', 'City', 'Sport', 'Discipline', 'Athlete',
                  'Country_x', 'Gender', 'Event', 'Medal', 'Country_y']
summer = (
    summer.merge(countries, left_on='Country', right_on='Code', how='left')
    [summer_columns]
    .rename(columns={'Country_x': 'Code', 'Country_y': 'Country'})
)
summer.head()

<a id=11></a><br>

## Highest Decorated Athletes

In [None]:
# Count medal rows per athlete, separately for each gender. value_counts()
# sorts in descending order, so the first entry is the athlete with the most rows.
summer_men_counts = summer.loc[summer["Gender"] == "Men", "Athlete"].value_counts()
summer_women_counts = summer.loc[summer["Gender"] == "Women", "Athlete"].value_counts()

male_most_medals = summer_men_counts.index[0]
male_medals = summer_men_counts.iloc[0]
female_most_medals = summer_women_counts.index[0]
female_medals = summer_women_counts.iloc[0]

print("The Highest Decorated Male Athlete is: ",male_most_medals,"with: ",male_medals," medals")
print("The Highest Decorated Female Athlete is: ",female_most_medals,"with: ",female_medals," medals")

<a id=12></a><br>

## Breakdown by Medal

In [None]:
# Count rows per (athlete, medal type). The count ends up in the "Sport"
# column because that is the column being counted. Largest counts come first.
summer_athlete_counts = summer.groupby(["Athlete", "Medal"])["Sport"].count()
medals = summer_athlete_counts.reset_index()
medals = medals.sort_values(by="Sport", ascending=False)
medals

In [None]:
# `medals` is sorted by count in descending order, so keeping the first row
# for each medal type leaves the athlete with the most medals of that type.
medals = medals.drop_duplicates(subset=["Medal"], keep="first")
# Assign a flat list of labels: a nested list ([[...]]) would create a
# MultiIndex, and plain lookups such as medals["Count"] would no longer
# return a simple column.
medals.columns = ["Athlete", "Medal", "Count"]
medals

<a id=13></a><br>

## Medals by Country

In [None]:
# Count medal rows per (country, medal type); the count lands in "Sport".
medals_country = (
    summer.groupby(['Country', 'Medal'])["Sport"].count()
    .reset_index()
    .sort_values(by='Sport', ascending=False)
)
# Reshape to one row per country and one column per medal type. pivot()
# arguments are passed by keyword because pandas 2.0+ no longer accepts them
# positionally. Countries with no medals of a given type get 0 rather than NaN.
medals_country = medals_country.pivot(index='Country', columns='Medal', values='Sport').fillna(0)
medals_country

In [None]:
# Rank countries by gold-medal count and keep the first eleven rows.
summer_by_gold = medals_country.sort_values(by="Gold", ascending=False)
top = summer_by_gold.head(11)
top

In [None]:
# One colour per medal column, applied in the column order of `top`.
summer_bar_colors = ['#CD7F32', '#FFDF00', '#D3D3D3']
top.plot.barh(width=0.8, color=summer_bar_colors)

# Resize the figure created by the plot call, then add the title and render.
plt.gcf().set_size_inches(10, 10)
plt.title("Medals Distribution of Top Countries for Summer Olympics")
plt.show()

<a id=14></a><br>

## Best Male and Female Athletes

In [None]:
# Side-by-side horizontal bar charts of medal counts per athlete,
# men on the left and women on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 10))

# --- Men ---
men = summer[summer["Gender"] == "Men"]
# Count rows per (athlete, medal type); the count lands in "Country".
men = men.groupby(["Athlete", "Medal"])["Country"].count().reset_index().sort_values(by="Country", ascending=False)
# NOTE: the candidate list is the top 20 athletes of the *whole* table
# (both genders), so the number of men actually shown can be below 20.
men = men[men["Athlete"].isin(summer["Athlete"].value_counts().index[:20])]
# Keyword arguments are required by pandas 2.0+ (positional pivot args were removed).
men = men.pivot(index="Athlete", columns="Medal", values="Country")
men.plot.barh(width=0.8, color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Best Male Athletes')
ax[0].set_ylabel('Athlete')

# --- Women ---
women = summer[summer["Gender"] == "Women"]
women = women.groupby(["Athlete", "Medal"])["Country"].count().reset_index().sort_values(by="Country", ascending=False)
# NOTE: likewise drawn from the overall top 45, then restricted to women.
women = women[women["Athlete"].isin(summer["Athlete"].value_counts().index[:45])]
women = women.pivot(index="Athlete", columns="Medal", values="Country")
women.plot.barh(width=0.8, color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[1])
ax[1].set_title("Best Female Athletes")
ax[1].set_ylabel("")
plt.show()


In [None]:
# Show the distinct values in the "Discipline" column.
summer["Discipline"].unique()

In [None]:
# Collapse every discipline whose name contains "Wrestling" or "Weightlifting"
# into a single label each. na=False makes missing Discipline values count as
# "no match"; without it the mask would contain NaN and .loc would reject it.
summer.loc[summer["Discipline"].str.contains("Wrestling", na=False), "Discipline"] = "Wrestling"
summer.loc[summer["Discipline"].str.contains("Weightlifting", na=False), "Discipline"] = "Weightlifting"

# Keep only rows from the ten countries with the most medal rows.
test = summer[summer["Country"].isin(summer["Country"].value_counts()[:10].index)]
test.head()


In [None]:
# Restrict further to the ten disciplines with the most rows in the full
# summer table (not just within the already-filtered countries).
top_disciplines = summer['Discipline'].value_counts().index[:10]
test = test.loc[test['Discipline'].isin(top_disciplines)]
test.head()

In [None]:
# Count medal rows for each (country, discipline) pair; the count is stored
# in the "Sport" column because that is the column being counted.
pair_counts = test.groupby(["Country", "Discipline"])["Sport"].count()
test = pair_counts.reset_index()
test.head()

## Medal Distribution by Discipline and Country

In [None]:
# Reshape to a discipline x country grid of counts. pivot() arguments are
# passed by keyword because pandas 2.0+ rejects positional ones.
test = test.pivot(index="Discipline", columns="Country", values="Sport")
# Annotated heatmap; fmt='2.0f' prints each count without decimals.
sns.heatmap(test, cmap='RdYlGn', annot=True, fmt='2.0f')
fig = plt.gcf()
fig.set_size_inches(8, 6)
plt.show()


In [None]:
# Count medal rows per country per Games year.
test1 = summer.groupby(["Country", "Year"])["Medal"].count().reset_index()
# Keep the five countries with the most medal rows overall.
test1 = test1[test1["Country"].isin(summer["Country"].value_counts()[:5].index)]
# One row per year and one column per country. Keyword arguments are required
# by pandas 2.0+. Years in which a country has no rows become NaN, which show
# up as gaps in the plotted lines.
test1 = test1.pivot(index="Year", columns="Country", values="Medal")
test1.plot()

fig = plt.gcf()
fig.set_size_inches(18, 8)
plt.title("Medals by Years by Country")
plt.show()

<font color='blue'>
* The gaps in the graph are due to the breakup of the USSR and the division of Germany into East and West after WW2.

<a id=2></a><br>
# Winter Game Analysis

In [None]:
# Count medal rows per athlete, separately for each gender. value_counts()
# sorts in descending order, so the first entry is the athlete with the most rows.
winter_men_counts = winter.loc[winter["Gender"] == "Men", "Athlete"].value_counts()
winter_women_counts = winter.loc[winter["Gender"] == "Women", "Athlete"].value_counts()

print("The Highest Decorated Male Athlete is: ", winter_men_counts.index[0], "with: ", winter_men_counts.iloc[0], "medals")
print("The Highest Decorated Female Athlete is: ", winter_women_counts.index[0], "with: ", winter_women_counts.iloc[0], "medals")

<a id=21></a><br>

## Athletes with the Highest Medal Type

In [None]:
# Attach the country dictionary to the winter results. After the merge,
# Country_x is the value from the results frame (the join key matched against
# "Code") and Country_y is the value from the dictionary.
winter = winter.merge(countries, left_on="Country", right_on="Code", how="left")
winter = winter[['Year', 'City', 'Sport', 'Discipline', 'Athlete', 'Country_x', 'Gender', 'Event', 'Medal', 'Country_y']]
winter.columns = ['Year', 'City', 'Sport', 'Discipline', 'Athlete', 'Code', 'Gender', 'Event', 'Medal', 'Country']

# Count rows per (athlete, medal type), largest first, then keep the first
# row for each medal type, i.e. the athlete with the most medals of that type.
medals = winter.groupby(["Athlete", "Medal"])["Sport"].count().reset_index().sort_values(by="Sport", ascending=False)
medals = medals.drop_duplicates(subset=["Medal"], keep="first")
# Flat list of labels: a nested list ([[...]]) would create a MultiIndex
# instead of plain column names.
medals.columns = ["Athlete", "Medal", "Count"]
medals


<a id=22></a><br>

## Medals by Country

In [None]:
# Total medal rows per country, keeping the country code next to the name.
winter_country_totals = winter.groupby(["Country", "Code"])["Medal"].count()
medals_map = winter_country_totals.reset_index()
medals_map

In [None]:
# Count medal rows per (country, medal type); the count lands in "Gender".
medals_country = winter.groupby(["Country", "Medal"])["Gender"].count().reset_index().sort_values(by="Gender", ascending=False)
# One row per country and one column per medal type. pivot() arguments are
# passed by keyword because pandas 2.0+ rejects positional ones.
medals_country = medals_country.pivot(index="Country", columns="Medal", values="Gender").fillna(0)

# Keep exactly ten countries so the chart matches its "Top 10" title
# (the previous slice of 11 rows showed one country too many).
top = medals_country.sort_values(by="Gold", ascending=False)[:10]
top.plot.barh(width=0.8, color=['#CD7F32', '#FFDF00', '#D3D3D3'])

fig = plt.gcf()
fig.set_size_inches(8, 8)
plt.title("Medals Distribution of Top 10 Countries for Winter Olympics")
plt.show()


<a id=23></a><br>

## Best Male and Female Athletes

In [None]:
# Side-by-side horizontal bar charts of medal counts per athlete,
# men on the left and women on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 10))

# --- Men ---
men = winter[winter["Gender"] == "Men"]
# Count rows per (athlete, medal type); the count lands in "Code".
men = men.groupby(["Athlete", "Medal"])["Code"].count().reset_index().sort_values(by="Code", ascending=False)
# NOTE: the candidate list is the top 15 athletes of the *whole* winter table
# (both genders), so fewer than 15 men may be shown.
men = men[men["Athlete"].isin(winter["Athlete"].value_counts().index[:15])]
# Keyword arguments are required by pandas 2.0+ (positional pivot args were removed).
men = men.pivot(index="Athlete", columns="Medal", values="Code")

men.plot.barh(width=0.8, color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title("Best Male Athletes")
ax[0].set_ylabel("Athlete")

# --- Women ---
women = winter[winter["Gender"] == "Women"]
women = women.groupby(["Athlete", "Medal"])["Code"].count().reset_index().sort_values(by="Code", ascending=False)
# NOTE: likewise drawn from the overall top 10, then restricted to women.
women = women[women["Athlete"].isin(winter["Athlete"].value_counts().index[:10])]
women = women.pivot(index="Athlete", columns="Medal", values="Code")

women.plot.barh(width=0.8, color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[1])
ax[1].set_title("Best Female Athletes")
ax[1].set_ylabel("")
# Render explicitly, as the other plotting cells in this notebook do.
plt.show()

<a id=3></a><br>

# Performance of USA Athletes

In [None]:
# Summer medal rows for the United States, split by gender.
USA_medal_male = summer[(summer["Country"] == "United States") & (summer["Gender"] == "Men")]
USA_medal_female = summer[(summer["Country"] == "United States") & (summer["Gender"] == "Women")]

# Two stacked line charts: men on top, women below.
fig, ax = plt.subplots(2, figsize=(15, 8))

# Count rows per (medal type, year), then reshape to one row per year and one
# column per medal type. Keyword arguments are required by pandas 2.0+.
male = USA_medal_male.groupby(["Medal", "Year"])["Event"].count().reset_index()
male = male.pivot(index="Year", columns="Medal", values="Event")
male.plot(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_xlabel("")
ax[0].set_title("Performance of USA Men")


female = USA_medal_female.groupby(["Medal", "Year"])["Event"].count().reset_index()
female = female.pivot(index="Year", columns="Medal", values="Event")
female.plot(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[1])
ax[1].set_title("Performance of USA Women")
plt.show()
