# Interactive Visualizations and EDA : Olympic Sports and Medals, 1896-2014


## Importing necessary modules :

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Reading the data :
### Countries :

In [None]:
# Load the country dictionary shipped with the dataset and preview its first rows.
countries = pd.read_csv(filepath_or_buffer='../input/olympic-games/dictionary.csv')
countries.head(n=5)

In [None]:
# Summary statistics for the numeric columns of the country dictionary.
countries.describe()

In [None]:
# Column dtypes and non-null counts of the country dictionary.
countries.info()

In [None]:
# Make both identifier columns plain strings, then rename them so the code
# column is called 'Country' (the key used to merge with the summer and winter
# CSVs) and the full name lives in 'Country_name'.
for column in ('Country', 'Code'):
    countries[column] = countries[column].apply(str)
countries = countries.rename(columns={'Country': 'Country_name', 'Code': 'Country'})
countries.info()

### Summer :

In [None]:
# Load the summer games records and preview the first rows.
summer = pd.read_csv(filepath_or_buffer='../input/olympic-games/summer.csv')
summer.head(n=5)

In [None]:
# Column dtypes and non-null counts of the summer records.
summer.info()

In [None]:
# Describe only the object-dtype columns (count / unique / top / freq).
summer.describe(include=['O'])

### Winter:

In [None]:
# Load the winter games records and preview the first rows.
winter = pd.read_csv(filepath_or_buffer='../input/olympic-games/winter.csv')
winter.head(n=5)

In [None]:
# Column dtypes and non-null counts of the winter records.
winter.info()

In [None]:
# Describe only the object-dtype columns (count / unique / top / freq).
winter.describe(include=['O'])

# Summer Game Olympics Analysis:

## Merging and Data cleaning :

In [None]:
# Left-join the summer records onto the country dictionary via the shared
# 'Country' code column; every dictionary row is kept, so countries without
# any summer record show up as rows of NaN.
summer_games = countries.merge(summer, how='left', on='Country')
summer_games.info()

In [None]:
# Remember the row count before cleaning and show the NaN count per column.
nrows = summer_games.shape[0]
print("Number of rows : ", nrows)
summer_games.isnull().sum()

**Selecting all rows with only na values in Year column:**

In [None]:
# Rows whose 'Year' is missing, i.e. dictionary rows with no summer record.
missing_year_mask = summer_games['Year'].isna()
year_na = summer_games.loc[missing_year_mask]
year_na.head()

In [None]:
# How many rows lack a year, and how many NaNs each column has within them.
print("Total number of rows with NaN Year column : ", year_na.shape[0])

year_na.isnull().sum()

**Since the remaining fields of these rows are also NaN, it is safe to remove them :**

In [None]:
# Drop the rows without a year and report how many rows were removed.
summer_games = summer_games.dropna(axis=0, subset=['Year'])
rows_dropped = nrows - len(summer_games)
print("Number of rows dropped = ", rows_dropped)
summer_games.isna().sum()

In [None]:
# The merge introduced NaNs, which forced 'Year' to float; now that those rows
# are gone the column can be cast back to integers.
summer_games = summer_games.astype({'Year': int})
summer_games.head(15)

**Let's normalise the Athlete names (first letter upper case, the rest lower case) and remove the comma between the last name and first name :**

In [None]:
# Normalise athlete names: first character upper-cased, the rest lower-cased,
# and the comma separating last and first name removed.
# The vectorised .str methods leave a missing name as NaN instead of raising
# AttributeError the way a per-element lambda would, and bracket assignment
# avoids the pitfalls of attribute-style column assignment.
summer_games['Athlete'] = (
    summer_games['Athlete']
    .str.capitalize()
    .str.replace(',', '', regex=False)  # literal comma, not a regex
)
summer_games['Athlete'].head(10)

In [None]:
### Check the different characters present in Athlete names.
# import re
# extra_chars=set()
# for name in summer_games.Athlete.values:
#     extra_chars.update(list(re.sub("[A-Za-z0-9]","",name)))
# print(extra_chars)

## Visualizing summer game dataset :

### Athlete Gender distribution :

In [None]:
# Summer Game Gender distribution :
# Look the two counts up by label rather than unpacking value_counts(), which
# only works while there are exactly two categories and 'Men' happens to be
# the more frequent one (value_counts sorts by frequency).
gender_tally = summer_games.Gender.value_counts()
M = int(gender_tally.get('Men', 0))
F = int(gender_tally.get('Women', 0))
print("Gender distribution : \nMale --------------- : ", M,"\nFemale ------------- :  ", F, "\n\nTotal Athltete Count : ", M + F)

# Pie chart of the share of rows per gender.
fig = px.pie(pd.DataFrame({'Gender': ["Male", "Female"], 'Counts': [M, F]}),
             values='Counts',
             names='Gender',
             width=600,
             height=600,
             color='Gender',
             color_discrete_map={
                 "Male": "#67001F",
                 "Female": "#B2182B"
             },
             title='Athlete Gender distribution - Summer Games')
fig.show()

Gender distribution in Summer games :

In [None]:
# Same gender counts as a bar chart; M and F come from the previous cell.
print("Total Athltete Count : ", M + F)

gender_bar_data = pd.DataFrame({'Gender': ["Male", "Female"], 'Counts': [M, F]})
bar_options = dict(
    x='Gender',
    y='Counts',
    text='Counts',
    hover_name="Counts",
    # literal colour values, used as-is thanks to the "identity" map
    color=["#67001F", "#B2182B"],
    color_discrete_map="identity",
    width=600,
    height=600,
    title="Athlete Gender distribution - Summer Games",
)
fig = px.bar(gender_bar_data, **bar_options)
fig.show()

## Top Countries with highest Medals won in Summer Olympics: 

### Top 20 Countries Medals wins sorted by total overall medal wins :

In [None]:
# Number of different country names present in the cleaned summer data.
distinct_countries = summer_games['Country_name'].unique()
print("Total number of distict countries : ", len(distinct_countries))

In [None]:
# Medal table per country: one row per country, one column per medal type,
# plus a 'Total' column, sorted by total medals (descending).
medal_order = ['Gold', 'Silver', 'Bronze']

# Count rows per (country, medal type) ...
country_wise_medals = summer_games.groupby(['Country_name', 'Medal'])['Gender'].count().reset_index()
# ... and spread the medal types into columns. The aggregation is passed by
# name because recent pandas versions deprecate NumPy callables such as np.sum here.
country_wise_medals = country_wise_medals.pivot_table(index='Country_name',
                                                      columns='Medal',
                                                      values='Gender',
                                                      aggfunc='sum',
                                                      fill_value=0)
# Fix the column order to Gold, Silver, Bronze (a missing medal type becomes 0).
country_wise_medals = country_wise_medals.reindex(columns=medal_order, fill_value=0)
# Name the total column explicitly instead of going through a positional 0 column.
country_wise_medals['Total'] = country_wise_medals.sum(axis=1)
country_wise_medals = country_wise_medals.sort_values(by='Total', ascending=False)

print("\nTop 20 Countries with highest overall Medals won in Summer Olympics: ")
country_wise_medals.iloc[:20]

In [None]:
# Bar chart of Gold / Silver / Bronze for the 20 countries with the most
# medals overall (the table is already sorted by 'Total').
medal_palette = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
axis_labels = {"Country_name": "Countries", "value": "Medals"}

top_20 = country_wise_medals.drop(columns='Total').iloc[:20]
fig = px.bar(top_20,
             title='Top 20 Countries with highest overall Medals won in Summer Olympics:',
             labels=axis_labels,
             color_discrete_map=medal_palette,
             width=1200,
             height=600)

# Centre the title, switch the font and show one hover box per country.
fig.update_layout(title_x=.5, font_family="roboto", hovermode='x')
fig.show()
# Alternative kept for reference (vertical plot):
# top_20 = country_wise_medals[:20].sort_values(by=country_wise_medals.columns.tolist(),ascending=True)
# fig = px.bar(top_20,orientation='v',title='Top 20 Countries with highest Medals won in Summer Olympics:')
# fig.show()

**Plot with total medal wins per country :**

In [None]:
# Same top-20 chart, this time including the 'Total' column.
top_20 = country_wise_medals[:20]
fig = px.bar(
    top_20,
    color_discrete_map={
        "Gold": "#FFCC00",
        "Silver": "#CCCCCC",
        "Bronze": "#CC6633",
        "Total": "#fded81"
    },
    # px.bar stacks the series by default; stacking 'Total' on top of
    # Gold + Silver + Bronze would make every bar twice the real medal count,
    # so draw the four series side by side instead.
    barmode='group',
    width=1200,
    height=600,
    labels={"Country_name": "Top 20 Countries", "value": "Medals Earned"},
    title='Top 20 Countries with highest Medals won in Summer Olympics:')
fig.update_layout(hovermode='x')
fig.show()

In [None]:
# Names of the first 20 countries in the current medal table order;
# later cells use this list to filter the data.
top20_country_names = country_wise_medals.index[:20].tolist()
top20_country_names

### Top Countries Medals wins sorted by highest Gold Medal wins :

In [None]:
# Medal table per country (one column per medal type), this time ordered by
# the number of Gold medals instead of the overall total.
medal_order = ['Gold', 'Silver', 'Bronze']

country_wise_medals = summer_games.groupby(['Country_name', 'Medal'])['Gender'].count().reset_index()
# Aggregation passed by name: recent pandas versions deprecate NumPy
# callables such as np.sum as aggfunc.
country_wise_medals = country_wise_medals.pivot_table(index='Country_name',
                                                      columns='Medal',
                                                      values='Gender',
                                                      aggfunc='sum',
                                                      fill_value=0)
# Column order Gold, Silver, Bronze (a missing medal type becomes 0) ...
country_wise_medals = country_wise_medals.reindex(columns=medal_order, fill_value=0)
# ... and rows sorted by Gold medals, highest first.
country_wise_medals = country_wise_medals.sort_values(by=['Gold'], ascending=False)

print("\nTop 20 Countries with sorted by Gold Medal wins in Summer Olympics: ")
country_wise_medals.iloc[:20]

In [None]:
# Bar chart for the first 10 countries of the Gold-sorted medal table.
medal_palette = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
axis_labels = {"Country_name": "Top 10 Countries", "value": "Medal Wins"}

top_10 = country_wise_medals.iloc[:10]
fig = px.bar(top_10,
             title='Top 10 Countries sorted by highest Gold Medal wins in Summer Olympics:',
             labels=axis_labels,
             color_discrete_map=medal_palette,
             orientation='v',
             width=1000,
             height=600)
fig.show()

## Country-wise medal distribution over the years for the top 20 countries :

In [None]:
# Medal rows per (year, country) as a flat table ...
medal_year_country = (
    summer_games.groupby(['Year', 'Country_name'])['Medal']
    .count()
    .to_frame()
    .reset_index()
)
# ... restricted to the top-20 countries and reshaped to one row per year and
# one column per country (NaN where a country has no row for that year).
in_top20 = medal_year_country['Country_name'].isin(top20_country_names)
top20_medal_year_country = medal_year_country[in_top20].pivot_table(index='Year',
                                                                    columns='Country_name',
                                                                    values='Medal')
top20_medal_year_country.head()

In [None]:
# One line per top-20 country showing its medal count per games year.
line_title = 'Top 20 Country wise Medal distribution over the years for Summer Olympics'
fig = px.line(top20_medal_year_country, title=line_title, template="plotly_dark")
# Centre the title and use a slightly lighter plot background.
fig.update_layout(title_x=0.5, plot_bgcolor="#1f1c1c")

fig.show()

**Top 10 Countries Medal distribution over the years:**

In [None]:
# Same line chart restricted to the first 10 of the top-20 countries.
in_top10 = medal_year_country['Country_name'].isin(top20_country_names[:10])
top10_medal_year_country = medal_year_country[in_top10].reset_index(drop=True)
top10_medal_year_country = top10_medal_year_country.pivot_table(index='Year',
                                                                columns='Country_name',
                                                                values='Medal')

line_title = 'Top 10 Country wise Medal distribution over the years for Summer Olympics'
fig = px.line(top10_medal_year_country, title=line_title, template="plotly_dark")

fig.show()

## Overall Top 20 Athlete Medal achievement distribution across the years :

In [None]:
# Medal rows per athlete name over all summer games, highest first.
print("Overall Summer top 20 Athelete's with their Medal achievement : ")
athlete_medal_totals = summer_games.groupby('Athlete')['Medal'].count()
best_athlete_summer = athlete_medal_totals.sort_values(ascending=False).to_frame()
best_athlete_summer.head(20)

In [None]:
# Bar chart of the 20 athletes with the most medal rows, one colour per athlete.
top20_athletes = best_athlete_summer.head(20).reset_index()
chart_title = "Overall Top 20 Athlete Medal achievement distribution across all the years (Summer Olympics)"
fig = px.bar(top20_athletes,
             x='Athlete',
             y='Medal',
             color='Athlete',
             hover_name="Athlete",
             title=chart_title,
             template='plotly_white',
             width=800,
             height=600)
fig.show()

## Top athlete achievement gender wise :

### MEN : 

In [None]:
# Best male athlete of the summer games and his medal breakdown.
men_summer = summer_games[summer_games['Gender'] == 'Men']
# Rows per (athlete, medal type); reused by later cells, so its shape is kept.
player_wise_medal_men = men_summer[['Athlete', 'Medal', 'Gender']].groupby(['Athlete', 'Medal']).count()
# Medal rows per athlete, highest first, as a flat (Athlete, Medal) table.
men_summer = pd.DataFrame(men_summer.groupby('Athlete')['Medal'].count().sort_values(ascending=False)).reset_index()
best_men = men_summer.iloc[0]['Athlete']
# Gold/Silver/Bronze counts of the best athlete. reindex() supplies 0 for a
# medal type he never won (a plain .loc lookup would raise KeyError), and
# scalar indexing avoids calling int() on a one-element Series, which recent
# pandas versions deprecate.
best_men_medals = player_wise_medal_men['Gender'].loc[best_men].reindex(
    ['Gold', 'Silver', 'Bronze'], fill_value=0)
print("Top Men Athlete with highest medal achievement : \n\nAthlete Name     : ",best_men, "\nTotal Medals Won : ", men_summer.iloc[0]['Medal'])
print("\nMedal Distribution - \nGold   : ", int(best_men_medals['Gold']))
print("Silver :  ", int(best_men_medals['Silver']))
print("Bronze :  ", int(best_men_medals['Bronze']))

### WOMEN :

In [None]:
# Top three female athletes of the summer games with their medal breakdown.
women_summer = summer_games[summer_games['Gender'] == 'Women']
# Rows per (athlete, medal type); reused by later cells, so its shape is kept.
player_wise_medal_women = women_summer[['Athlete', 'Medal', 'Gender']].groupby(['Athlete', 'Medal']).count()
# Medal rows per athlete, highest first, as a flat (Athlete, Medal) table.
women_summer = pd.DataFrame(women_summer.groupby('Athlete')['Medal'].count().sort_values(ascending=False)).reset_index()

print("Top Women Athlete with highest medal achievement : \n")
# One loop instead of three copy-pasted stanzas; head(3) also copes with
# fewer than three athletes.
for rank, (_, podium_row) in enumerate(women_summer.head(3).iterrows(), start=1):
    best_women = podium_row['Athlete']
    # reindex() supplies 0 for a medal type the athlete never won (a plain
    # .loc lookup would raise KeyError); scalar indexing avoids the deprecated
    # int() on a one-element Series.
    best_women_medals = player_wise_medal_women['Gender'].loc[best_women].reindex(
        ['Gold', 'Silver', 'Bronze'], fill_value=0)
    leading_gap = "" if rank == 1 else "\n"  # blank line between entries
    print(f"{leading_gap}{rank}.\nAthlete Name     : ", best_women, "\nTotal Medals Won : ", podium_row['Medal'])
    print("Medal Distribution : \nGold   : ", int(best_women_medals['Gold']))
    print("Silver : ", int(best_women_medals['Silver']))
    print("Bronze : ", int(best_women_medals['Bronze']))

### Top 20 Men and Women achievers :

In [None]:
# Bar chart of the 20 male athletes with the most medal rows.
top20_men = men_summer.head(20)
fig = px.bar(top20_men,
             x='Athlete',
             y='Medal',
             color='Athlete',
             title="Top 20 Men Athlete Medal achievement across the years (Summer Olympics)",
             template='seaborn')
fig.show()

In [None]:
# Bar chart of the 20 female athletes with the most medal rows;
# the hover box is headed by the medal count.
top20_women = women_summer.head(20)
fig = px.bar(top20_women,
             x='Athlete',
             y='Medal',
             color='Athlete',
             hover_name='Medal',
             title="Women Athlete achievement across the years (Summer Olympics)",
             template='seaborn')
fig.show()

### Top 10 Men and Women Medal achievement comparison : 
#### Comparison by overall Medals earned :

In [None]:
# Side-by-side bar charts (shared y axis) of the 10 best men and women.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True)

# Hover text is read from the per-bar customdata (athlete name, medal count).
hover_lines = "<br>".join([
    "Athlete: %{customdata[0]}",
    "Medal : %{customdata[1]}",
])
# (ranking table, trace name, bar colour, subplot column)
panels = [
    (men_summer, "Men", "#0B3C49", 1),
    (women_summer, "Women", "#731963", 2),
]
for ranking, trace_name, bar_colour, subplot_col in panels:
    leaders = ranking.iloc[:10]
    athlete_names = list(leaders['Athlete'].values)
    medal_totals = list(leaders['Medal'].values)
    custom_data = np.transpose([athlete_names, medal_totals])
    fig.add_trace(go.Bar(x=athlete_names,
                         y=medal_totals,
                         text=medal_totals,
                         textposition='auto',
                         name=trace_name,
                         marker_color=bar_colour,
                         customdata=custom_data,
                         hovertemplate=hover_lines),
                  row=1,
                  col=subplot_col)

# Common bar styling, y ticks every 2 medals, and a horizontal legend placed
# just above the plot area.
fig.update_traces(marker_line_color="#000000", marker_line_width=1, opacity=0.7)
fig.update_yaxes(tickvals=list(range(0, 25, 2)), gridcolor='#A1BDCE')
legend_style = dict(orientation="h",
                    yanchor="bottom",
                    y=1.01,
                    xanchor="left",
                    x=0,
                    bgcolor="#fcf7ff",
                    borderwidth=0.5)
fig.update_layout(height=600,
                  width=1000,
                  xaxis_tickangle=30,
                  xaxis_tickfont_size=12,
                  title_text="Top 10 Men and Women medal achievers (Summer Olympics)",
                  uniformtext=dict(mode="hide", minsize=10),
                  legend=legend_style)
for axis_key, axis_title in (('xaxis1', 'MEN ATHLETE'), ('xaxis2', 'WOMEN ATHLETE')):
    fig['layout'][axis_key].update(title=axis_title)
fig.show()

#### Comparison by Medals [Gold, Silver, Bronze] earned :

In [None]:
# Names of the 10 best men and women (the tables are already sorted).
top_10_men = men_summer['Athlete'].iloc[:10].tolist()
top_10_women = women_summer['Athlete'].iloc[:10].tolist()

# Flatten the (Athlete, Medal) index into ordinary columns for the pivot below.
# NOTE: running this cell twice adds a spurious 'index' column.
player_wise_medal_men = player_wise_medal_men.reset_index()
player_wise_medal_women = player_wise_medal_women.reset_index()

In [None]:
# Gold/Silver/Bronze table for the 10 best men and the 10 best women:
# one row per athlete, sorted by Gold, columns ordered Gold, Silver, Bronze.
medal_columns = ['Gold', 'Silver', 'Bronze']

men_rows = player_wise_medal_men[player_wise_medal_men.Athlete.isin(top_10_men)]
top_10_medal_distribution_men = (
    men_rows.pivot_table(index="Athlete", columns='Medal', values='Gender', fill_value=0)
    .sort_values(by=['Gold'], ascending=False)
    .reindex(columns=medal_columns)
)

women_rows = player_wise_medal_women[player_wise_medal_women.Athlete.isin(top_10_women)]
top_10_medal_distribution_women = (
    women_rows.pivot_table(index="Athlete", columns='Medal', values='Gender', fill_value=0)
    .sort_values(by=['Gold'], ascending=False)
    .reindex(columns=medal_columns)
)
top_10_medal_distribution_women

In [None]:
# One (gold, silver, bronze) tuple per athlete, in table order.
medal_columns = ('Gold', 'Silver', 'Bronze')
men_Y = list(zip(*(top_10_medal_distribution_men[medal].tolist() for medal in medal_columns)))
women_Y = list(zip(*(top_10_medal_distribution_women[medal].tolist() for medal in medal_columns)))

In [None]:
# Stacked Gold/Silver/Bronze bars for the 10 best men (top panel) and women
# (bottom panel).
fig = make_subplots(rows=2, cols=1, shared_yaxes=True)

# Medal type with its bar colour, in stacking order.
medal_styles = [("Gold", "#FFCC00"), ("Silver", "#CCCCCC"), ("Bronze", "#CC6633")]
# (legend prefix, per-athlete table, (gold, silver, bronze) tuples, subplot row)
panels = [
    ("Men", top_10_medal_distribution_men, men_Y, 1),
    ("Women", top_10_medal_distribution_women, women_Y, 2),
]
for group_label, distribution, medal_triples, subplot_row in panels:
    for position, (medal_label, medal_colour) in enumerate(medal_styles):
        counts = [triple[position] for triple in medal_triples]
        fig.add_trace(go.Bar(
            x=list(distribution.index.values),
            y=counts,
            name=f"{group_label} - {medal_label}",
            marker_color=medal_colour,
            text=counts,
            textposition='auto',
        ),
                      row=subplot_row,
                      col=1)

fig.update_layout(
    template='ggplot2',
    title_text="Top 10 Men and Women medal achievers (Summer Olympics)",
    barmode='stack',
    legend=dict(bgcolor="#fcf7ff", borderwidth=0.5),
    height=1000,
    width=800,
)
for axis_key, axis_title in (('xaxis1', 'MEN ATHLETES'), ('xaxis2', 'WOMEN ATHLETES')):
    fig['layout'][axis_key].update(title=axis_title)
fig.show()

## Medals achieved by countries distributed across Sport Events : 

In [None]:
# Medal rows per (country, sport) for the first 10 of the top-20 countries,
# with the countries kept in ranking order rather than alphabetical order.
top10_names = top20_country_names[:10]
top10_countries = summer_games[summer_games.Country_name.isin(top10_names)]
top10_countries = top10_countries.groupby(['Country_name', 'Sport'])['Medal'].count().to_frame()
top10_countries = top10_countries.reindex(labels=top10_names, axis='index', level=0).reset_index()
top10_countries[:10]

In [None]:
# One bar per country, split into one coloured segment per sport.
axis_labels = {"Country_name": "Top 10 Countries", "Medal": "Medals earned"}
chart_title = "Medals achieved by countries distributed across Sport Events (Summer Olympics)"
fig = px.bar(top10_countries,
             x="Country_name",
             y="Medal",
             color="Sport",
             hover_name="Medal",
             labels=axis_labels,
             title=chart_title,
             template='ggplot2')
fig.show()

OR

In [None]:
# The same numbers as a heatmap: countries on the y axis, sports on the x axis.
country_sport_matrix = top10_countries.pivot_table(index='Country_name',
                                                   columns='Sport',
                                                   values='Medal')
fig = px.imshow(
    country_sport_matrix,
    title="Medals achieved by countries distributed across Sport Events (Summer Olympics)",
    labels=dict(y="Top 10 Countries", x="Olympics sports events"))
fig.show()

# Winter Game Analysis :
## Merging and Data cleaning : 

In [None]:
# Left-join the winter records onto the country dictionary via the shared
# 'Country' code column; every dictionary row is kept, so countries without
# any winter record show up as rows of NaN.
winter_games = countries.merge(winter, how='left', on='Country')
winter_games.info()

In [None]:
# Remember the row count before cleaning and show the NaN count per column.
nrows = winter_games.shape[0]
print("Number of rows : ", nrows)
winter_games.isnull().sum()

**Selecting all rows with only na values in Year column:**

In [None]:
# Rows whose 'Year' is missing, i.e. dictionary rows with no winter record.
missing_year_mask = winter_games['Year'].isna()
year_na = winter_games.loc[missing_year_mask]
year_na.head()

In [None]:
# How many rows lack a year, and how many NaNs each column has within them.
print("Total number of rows with NaN Year column : ", year_na.shape[0])

year_na.isnull().sum()

**Since the remaining fields of these rows are also NaN, it is safe to remove them :**

In [None]:
# Drop the rows without a year and report how many rows were removed.
winter_games = winter_games.dropna(axis=0, subset=['Year'])
rows_dropped = nrows - len(winter_games)
print("Number of rows dropped = ", rows_dropped)
winter_games.isna().sum()

In [None]:
# The merge introduced NaNs, which forced 'Year' to float; now that those rows
# are gone the column can be cast back to integers.
winter_games = winter_games.astype({'Year': int})
winter_games.head(5)

**Let's normalise the Athlete names (first letter upper case, the rest lower case) and remove the comma between the last name and first name :**

In [None]:
# Normalise athlete names: first character upper-cased, the rest lower-cased,
# and the comma separating last and first name removed.
# The vectorised .str methods leave a missing name as NaN instead of raising
# AttributeError the way a per-element lambda would, and bracket assignment
# avoids the pitfalls of attribute-style column assignment.
winter_games['Athlete'] = (
    winter_games['Athlete']
    .str.capitalize()
    .str.replace(',', '', regex=False)  # literal comma, not a regex
)
winter_games['Athlete'].head(10)

## Visualizing winter game dataset :

### Athlete Gender distribution :

In [None]:
# Winter Game Gender distribution :
# Look the two counts up by label rather than unpacking value_counts(), which
# only works while there are exactly two categories and 'Men' happens to be
# the more frequent one (value_counts sorts by frequency).
gender_tally = winter_games.Gender.value_counts()
M = int(gender_tally.get('Men', 0))
F = int(gender_tally.get('Women', 0))
print("Gender distribution : \nMale --------------- : ", M,"\nFemale ------------- : ", F, "\n\nTotal Athltete Count : ", M + F)

In [None]:
# Best female athlete of the winter games and her medal breakdown.
women_winter = winter_games[winter_games['Gender'] == 'Women']
# Rows per (athlete, medal type).
player_wise_medal_women = women_winter[['Athlete', 'Medal', 'Gender']].groupby(['Athlete', 'Medal']).count()
# Medal rows per athlete, highest first, as a flat (Athlete, Medal) table.
women_winter = pd.DataFrame(women_winter.groupby('Athlete')['Medal'].count().sort_values(ascending=False)).reset_index()
best_women = women_winter.iloc[0]['Athlete']
# reindex() supplies 0 for a medal type the athlete never won (a plain .loc
# lookup would raise KeyError); scalar indexing avoids the deprecated int()
# on a one-element Series.
best_women_medals = player_wise_medal_women['Gender'].loc[best_women].reindex(
    ['Gold', 'Silver', 'Bronze'], fill_value=0)
print("Top Women Athlete with highest medal achievement : \n")
print("Athlete Name     : ", best_women, "\nTotal Medals Won : ",women_winter.iloc[0]['Medal'])
print("Medal Distribution : \nGold   : ", int(best_women_medals['Gold']))
print("Silver : ", int(best_women_medals['Silver']))
print("Bronze : ", int(best_women_medals['Bronze']))

In [None]:
# Winter Game Gender distribution :
# Counts are looked up by label; unpacking value_counts() directly would
# depend on its frequency ordering and on there being exactly two categories.
gender_tally = winter_games.Gender.value_counts()
M = int(gender_tally.get('Men', 0))
F = int(gender_tally.get('Women', 0))
print("Gender distribution : \nMale --------------- : ", M,"\nFemale ------------- : ", F, "\n\nTotal Athltete Count : ", M + F)

# Left: absolute counts as bars. Right: shares as a pie chart.
fig = make_subplots(rows=1,
                    cols=2,
                    shared_xaxes=False,
                    shared_yaxes=False,
                    subplot_titles=["Athlete Counts", "Athlete Percentage"],
                    specs=[[{"type": "bar"}, {"type": "pie"}]])

gender_counts = pd.DataFrame({'Gender': ["Male", "Female"], 'Counts': [M, F]})
fig.add_trace(go.Bar(
    x=list(gender_counts['Gender'].values),
    y=list(gender_counts['Counts'].values),
    text=list(gender_counts['Counts'].values),
    textposition='auto',
    name="Bar chart",
    marker_color=px.colors.sequential.RdBu,
),
              row=1,
              col=1)

fig.add_trace(go.Pie(
    labels=list(gender_counts['Gender'].values),
    values=list(gender_counts['Counts'].values),
    textposition='auto',
    textinfo='label+percent',
    marker_colors=px.colors.sequential.RdBu,
    name="Pie chart",
),
              row=1,
              col=2)

fig.update_traces(marker_line_color="#000000",
                  marker_line_width=1,
                  opacity=0.8)
fig.update_yaxes(tickvals=list(range(0, 3300, 400)), gridcolor='#A1BDCE')
fig.update_layout(height=600,
                  width=1000,
                  xaxis_tickangle=30,
                  xaxis_tickfont_size=12,
                  # The previous title ("Top 10 Men and Women medal achievers")
                  # was copied from another figure and did not describe this
                  # gender distribution chart.
                  title_text="Athlete Gender distribution - Winter Games",
                  template="ggplot2",
                  uniformtext=dict(mode="hide", minsize=10))
fig['layout']['xaxis1'].update(title='Gender')
fig.show()

## Top Countries with highest Medals won in Winter Olympics: 

### Top Countries Medals wins sorted by total overall medal wins :

In [None]:
# Number of different country names present in the cleaned winter data.
distinct_countries = winter_games['Country_name'].unique()
print("Total number of distict countries : ", len(distinct_countries))

In [None]:
# Medal table per country: one row per country, one column per medal type,
# plus a 'Total' column, sorted by total medals (descending).
medal_order = ['Gold', 'Silver', 'Bronze']

# Count rows per (country, medal type) ...
country_wise_medals = winter_games.groupby(['Country_name', 'Medal'])['Gender'].count().reset_index()
# ... and spread the medal types into columns. The aggregation is passed by
# name because recent pandas versions deprecate NumPy callables such as np.sum here.
country_wise_medals = country_wise_medals.pivot_table(index='Country_name',
                                                      columns='Medal',
                                                      values='Gender',
                                                      aggfunc='sum',
                                                      fill_value=0)
# Fix the column order to Gold, Silver, Bronze (a missing medal type becomes 0).
country_wise_medals = country_wise_medals.reindex(columns=medal_order, fill_value=0)
# Name the total column explicitly instead of going through a positional 0 column.
country_wise_medals['Total'] = country_wise_medals.sum(axis=1)
country_wise_medals = country_wise_medals.sort_values(by='Total', ascending=False)

print("\nTop 20 Countries with highest Medals won in Winter Olympics: ")
country_wise_medals.iloc[:20]

In [None]:
# Bar chart of Gold / Silver / Bronze for the 20 countries with the most
# winter medals overall (the table is already sorted by 'Total').
medal_palette = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
axis_labels = {"Country_name": "Top 20 Countries", "value": "Medals Earned"}

top_20 = country_wise_medals.drop(columns='Total').iloc[:20]
fig = px.bar(top_20,
             title='Top 20 Countries with highest Medals won in Winter Olympics:',
             labels=axis_labels,
             color_discrete_map=medal_palette,
             template='ggplot2',
             width=1200,
             height=600)
fig.show()

**Plot with total medal wins per country :**

In [None]:
# Same top-20 chart, this time including the 'Total' column.
top_20 = country_wise_medals[:20]
fig = px.bar(
    top_20,
    color_discrete_map={
        "Gold": "#FFCC00",
        "Silver": "#CCCCCC",
        "Bronze": "#CC6633",
        "Total": "#fded81"
    },
    # px.bar stacks the series by default; stacking 'Total' on top of
    # Gold + Silver + Bronze would make every bar twice the real medal count,
    # so draw the four series side by side instead.
    barmode='group',
    width=1200,
    height=600,
    labels={
        "Country_name": "Top 20 Countries",
        "value": "Medals"
    },
    title='Top 20 Countries with highest Medals won in Winter Olympics:')
fig.show()

In [None]:
# Names of the first 20 countries in the current winter medal table order;
# later cells use this list to filter the data.
top20_country_names = country_wise_medals.index[:20].tolist()
top20_country_names

### Top Countries Medals wins sorted by highest Gold Medal wins :

In [None]:
# Medal table per country (one column per medal type), this time ordered by
# the number of Gold medals instead of the overall total.
medal_order = ['Gold', 'Silver', 'Bronze']

country_wise_medals = winter_games.groupby(['Country_name', 'Medal'])['Gender'].count().reset_index()
# Aggregation passed by name: recent pandas versions deprecate NumPy
# callables such as np.sum as aggfunc.
country_wise_medals = country_wise_medals.pivot_table(index='Country_name',
                                                      columns='Medal',
                                                      values='Gender',
                                                      aggfunc='sum',
                                                      fill_value=0)
# Column order Gold, Silver, Bronze (a missing medal type becomes 0) ...
country_wise_medals = country_wise_medals.reindex(columns=medal_order, fill_value=0)
# ... and rows sorted by Gold medals, highest first.
country_wise_medals = country_wise_medals.sort_values(by=['Gold'], ascending=False)

# The message now states the actual sort key (Gold) instead of repeating the
# "highest Medals" wording of the total-sorted table.
print("\nTop 20 Countries sorted by Gold Medal wins in Winter Olympics: ")
country_wise_medals.iloc[:20]

In [None]:
# Bar chart for the first 10 countries of the Gold-sorted winter medal table.
# The slice holds 10 rows, so it is named top_10, and the title now matches
# what is plotted (it used to claim "Top 20 ... highest Medals").
top_10 = country_wise_medals[:10]
fig = px.bar(
    top_10,
    orientation='v',
    color_discrete_map={
        "Gold": "#FFCC00",
        "Silver": "#CCCCCC",
        "Bronze": "#CC6633"
    },
    width=1200,
    height=600,
    labels={
        "Country_name": "Top 10 Countries",
        "value": "Medal Wins"
    },
    title='Top 10 Countries sorted by highest Gold Medal wins in Winter Olympics:')
fig.show()

## Medal distribution over the years for the top 20 countries :

In [None]:
# Medal rows per (year, country) as a flat table ...
medal_year_country = (
    winter_games.groupby(['Year', 'Country_name'])['Medal']
    .count()
    .to_frame()
    .reset_index()
)
# ... restricted to the top-20 countries and reshaped to one row per year and
# one column per country (NaN where a country has no row for that year).
in_top20 = medal_year_country['Country_name'].isin(top20_country_names)
top20_medal_year_country = medal_year_country[in_top20].reset_index(drop=True)
top20_medal_year_country = top20_medal_year_country.pivot_table(index='Year',
                                                                columns='Country_name',
                                                                values='Medal')
top20_medal_year_country.head()

In [None]:
# One line per top-20 country showing its medal count per winter games year.
line_title = 'Top 20 Country wise Medal distribution over the years for Winter Olympics'
fig = px.line(top20_medal_year_country, title=line_title, template="plotly_dark")
fig.show()

**Top 5 Country-wise Medal distribution over the years for Winter Olympics :**

In [None]:
# Same line chart restricted to the first 5 of the top-20 countries.
# (The variable keeps its historical 'top10' name although it holds 5 countries.)
in_top5 = medal_year_country['Country_name'].isin(top20_country_names[:5])
top10_medal_year_country = medal_year_country[in_top5].reset_index(drop=True)
top10_medal_year_country = top10_medal_year_country.pivot_table(index='Year',
                                                                columns='Country_name',
                                                                values='Medal')

line_title = 'Top 5 Country-wise Medal distribution over the years for Winter Olympics'
fig = px.line(top10_medal_year_country, title=line_title, template="plotly_dark")
fig.show()

## Overall Winter top 20 Athlete Medal achievement  :

In [None]:
# Medal rows per athlete name over all winter games, highest first.
print("Overall Winter top 20 Athelete's with their Medal achievement : ")
athlete_medal_totals = winter_games.groupby('Athlete')['Medal'].count()
best_athlete_winter = athlete_medal_totals.sort_values(ascending=False).to_frame()
best_athlete_winter.head(20)

In [None]:
# Bar chart of the 20 athletes with the most medal rows, one colour per athlete.
top20_athletes = best_athlete_winter.head(20).reset_index()
chart_title = "Winter Overall Top 20 Athlete Medal achievement distribution (Winter Olympics)"
fig = px.bar(top20_athletes,
             x='Athlete',
             y='Medal',
             color='Athlete',
             hover_name="Athlete",
             title=chart_title,
             template='plotly_white',
             width=800,
             height=600)
fig.show()

## Top athlete achievement gender wise :

### MEN : 

In [None]:
# Best male athlete of the winter games and his medal breakdown.
men_winter = winter_games[winter_games['Gender'] == 'Men']
# Rows per (athlete, medal type).
player_wise_medal_men = men_winter[['Athlete', 'Medal', 'Gender']].groupby(['Athlete', 'Medal']).count()
# Medal rows per athlete, highest first, as a flat (Athlete, Medal) table.
men_winter = pd.DataFrame(men_winter.groupby('Athlete')['Medal'].count().sort_values(ascending=False)).reset_index()
best_men = men_winter.iloc[0]['Athlete']
# reindex() supplies 0 for a medal type the athlete never won (a plain .loc
# lookup would raise KeyError); scalar indexing avoids the deprecated int()
# on a one-element Series.
best_men_medals = player_wise_medal_men['Gender'].loc[best_men].reindex(
    ['Gold', 'Silver', 'Bronze'], fill_value=0)
print("Top Men Athlete with highest medal achievement : \n\nAthlete Name     : ",best_men, "\nTotal Medals Won : ", men_winter.iloc[0]['Medal'])
print("\nMedal Distribution - \nGold    : ", int(best_men_medals['Gold']))
print("Silver :  ", int(best_men_medals['Silver']))
print("Bronze :  ", int(best_men_medals['Bronze']))

### WOMEN :

In [None]:
# Best female athlete of the winter games and her medal breakdown.
women_winter = winter_games[winter_games['Gender'] == 'Women']
# Rows per (athlete, medal type).
player_wise_medal_women = women_winter[['Athlete', 'Medal', 'Gender']].groupby(['Athlete', 'Medal']).count()
# Medal rows per athlete, highest first, as a flat (Athlete, Medal) table.
women_winter = pd.DataFrame(women_winter.groupby('Athlete')['Medal'].count().sort_values(ascending=False)).reset_index()
best_women = women_winter.iloc[0]['Athlete']
# reindex() supplies 0 for a medal type the athlete never won (a plain .loc
# lookup would raise KeyError); scalar indexing avoids the deprecated int()
# on a one-element Series.
best_women_medals = player_wise_medal_women['Gender'].loc[best_women].reindex(
    ['Gold', 'Silver', 'Bronze'], fill_value=0)
print("Top Women Athlete with highest medal achievement : \n\nAthlete Name     : ",best_women, "\nTotal Medals Won : ", women_winter.iloc[0]['Medal'])
print("\nMedal Distribution - \nGold   : ", int(best_women_medals['Gold']))
print("Silver : ", int(best_women_medals['Silver']))
print("Bronze : ", int(best_women_medals['Bronze']))

### Top 10 Men and Women Medal achievement comparison : 
#### Comparison by overall Medals earned :

In [None]:
# Side-by-side bar charts (shared y axis) of the first 10 rows of the men's
# and women's medal rankings.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True)

# Hover text is built from customdata: column 0 = athlete, column 1 = medals.
hover_template = "<br>".join([
    "Athlete: %{customdata[0]}",
    "Medal : %{customdata[1]}",
])

# (ranking frame, legend name, bar colour, subplot column)
panels = [
    (men_winter, "Men", "#413620", 1),
    (women_winter, "Women", "#9c6615", 2),
]
for ranking, legend_name, bar_colour, subplot_col in panels:
    top_rows = ranking.iloc[:10]
    athletes = list(top_rows['Athlete'].values)
    medals = list(top_rows['Medal'].values)
    custom_data = np.transpose([athletes, medals])
    fig.add_trace(go.Bar(x=athletes,
                         y=medals,
                         text=medals,
                         textposition='auto',
                         name=legend_name,
                         marker_color=bar_colour,
                         customdata=custom_data,
                         hovertemplate=hover_template),
                  row=1,
                  col=subplot_col)

fig.update_traces(marker_line_color="#000000",
                  marker_line_width=1,
                  opacity=0.8)
fig.update_yaxes(tickvals=list(range(0, 25, 2)), gridcolor='#A1BDCE')
# Style the tick labels on BOTH x axes. Setting `xaxis_tickangle` /
# `xaxis_tickfont_size` through update_layout only reaches the first subplot.
fig.update_xaxes(tickangle=30, tickfont=dict(size=12))
fig.update_layout(height=600,
                  width=1000,
                  title_text="Top 10 Men and Women medal achievers (Winter Olympics)",
                  uniformtext=dict(mode="hide", minsize=10),
                  legend=dict(orientation="h",
                              yanchor="bottom",
                              y=1.01,
                              xanchor="left",
                              x=0,
                              bgcolor="#fcf7ff",
                              borderwidth=0.5))
fig.update_xaxes(title_text='MEN ATHLETE', row=1, col=1)
fig.update_xaxes(title_text='WOMEN ATHLETE', row=1, col=2)
fig.show()

#### Comparison by Medals [Gold, Silver, Bronze] earned :

In [None]:
# Names in the first 10 rows of each ranking.
top_10_men = men_winter['Athlete'].iloc[:10].tolist()
top_10_women = women_winter['Athlete'].iloc[:10].tolist()

# Turn the (Athlete, Medal) index levels back into ordinary columns.
# NOTE: this rebinds the same names, so running the cell twice resets the
# index a second time.
player_wise_medal_men, player_wise_medal_women = (
    counts.reset_index()
    for counts in (player_wise_medal_men, player_wise_medal_women)
)

In [None]:
# Column order used for both tables below.
medal_order = ['Gold', 'Silver', 'Bronze']

# Men: one row per top-10 athlete, one column per medal; missing
# combinations are filled with 0. Rows are sorted by Gold (descending) and
# the columns are then arranged as Gold, Silver, Bronze.
men_top10_rows = player_wise_medal_men[player_wise_medal_men['Athlete'].isin(top_10_men)]
top_10_medal_distribution_men = (
    men_top10_rows
    .pivot_table(index="Athlete", columns='Medal', values='Gender', fill_value=0)
    .sort_values(by=['Gold'], ascending=False)
    .reindex(columns=medal_order)
)

# Women: same construction.
women_top10_rows = player_wise_medal_women[player_wise_medal_women['Athlete'].isin(top_10_women)]
top_10_medal_distribution_women = (
    women_top10_rows
    .pivot_table(index="Athlete", columns='Medal', values='Gender', fill_value=0)
    .sort_values(by=['Gold'], ascending=False)
    .reindex(columns=medal_order)
)
top_10_medal_distribution_women

In [None]:
# One (gold, silver, bronze) tuple per athlete, in table row order.
medal_columns = ('Gold', 'Silver', 'Bronze')
men_Y = list(zip(*(top_10_medal_distribution_men[medal].tolist()
                   for medal in medal_columns)))
women_Y = list(zip(*(top_10_medal_distribution_women[medal].tolist()
                     for medal in medal_columns)))

In [None]:
# Two stacked-bar panels (men on row 1, women on row 2): one trace per medal
# colour, added in Gold, Silver, Bronze order for each panel.
fig = make_subplots(rows=2, cols=1, shared_yaxes=True)

# (medal label, bar colour); the position in this list is also the position
# of that medal inside each tuple of men_Y / women_Y.
medal_styles = [("Gold", "#FFCC00"), ("Silver", "#CCCCCC"), ("Bronze", "#CC6633")]
# (legend prefix, pivot table supplying the x labels, per-athlete tuples, subplot row)
panels = [
    ("Men", top_10_medal_distribution_men, men_Y, 1),
    ("Women", top_10_medal_distribution_women, women_Y, 2),
]

for group, table, triples, subplot_row in panels:
    for position, (medal, colour) in enumerate(medal_styles):
        counts = [triple[position] for triple in triples]
        fig.add_trace(go.Bar(x=list(table.index.values),
                             y=counts,
                             name=f"{group} - {medal}",
                             marker_color=colour,
                             text=counts,
                             textposition='auto'),
                      row=subplot_row,
                      col=1)

fig.update_layout(title_text="Top 10 Men and Women medal achievers (Winter Olympics)",
                  barmode='stack',
                  template='ggplot2',
                  height=1000,
                  width=800,
                  legend=dict(bgcolor="#fcf7ff", borderwidth=0.5))
fig['layout']['xaxis1'].update(title='MEN ATHLETES')
fig['layout']['xaxis2'].update(title='WOMEN ATHLETES')
fig.show()

## Medals achieved by countries distributed across Sport Events : 

In [None]:
# Medal rows per (country, sport) for the first 10 entries of
# top20_country_names, with the countries kept in that list's order.
leading_countries = top20_country_names[:10]
in_leading = winter_games['Country_name'].isin(leading_countries)
medals_by_country_sport = (
    winter_games[in_leading]
    .groupby(['Country_name', 'Sport'])['Medal']
    .count()
    .to_frame()
)
top10_countries = (
    medals_by_country_sport
    .reindex(labels=leading_countries, level=0, axis='index')
    .reset_index()
)
top10_countries.head()

In [None]:
# Bar chart of medal counts per country, coloured by sport.
axis_labels = {
    "Country_name": "Top 10 Countries",
    "Medal": "Medals earned",
}
chart_title = "Medals achieved by countries distributed across Sport Events (Winter)"
fig = px.bar(top10_countries,
             x="Country_name",
             y="Medal",
             color="Sport",
             hover_name="Medal",
             labels=axis_labels,
             template='ggplot2',
             title=chart_title)
fig.show()

Alternatively, the same distribution shown as a heatmap:

In [None]:
# Heatmap of the same counts: countries on the rows, sports on the columns.
country_sport_matrix = top10_countries.pivot_table(index='Country_name',
                                                   columns='Sport',
                                                   values='Medal')
fig = px.imshow(country_sport_matrix,
                title="Medals achieved by countries distributed across Sport Events",
                labels=dict(y="Top 10 Countries", x="Olympics sports events"))
fig.show()

# Summer and Winter Olympics comparisons :
## Which Sport has the most Medals ?

In [None]:
# Summer: medal rows per (sport, medal), pivoted to one row per sport with
# Gold / Silver / Bronze columns, sorted by Gold descending.
summer_sport_counts = summer_games.groupby(['Sport', 'Medal'])['Athlete'].count().reset_index()
sport_medals_summer = (
    summer_sport_counts
    .pivot_table(index='Sport', columns='Medal', values='Athlete')
    .reindex(columns=['Gold', 'Silver', 'Bronze'])
    .sort_values('Gold', ascending=False)
)
sport_medals_summer.head()

In [None]:
# Grouped bars for the first 20 sports of the Gold-sorted summer table.
medal_colours = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
axis_labels = {"Country_name": "Top 10 Countries", "value": "Medal Wins"}
top20_sports = sport_medals_summer[:20]
fig = px.bar(top20_sports,
             title='Summer Olympics : Top 20 Sports Medal distribution sorted by Gold Medals',
             labels=axis_labels,
             color_discrete_map=medal_colours,
             barmode='group',
             orientation='v',
             template='ggplot2',
             width=1000,
             height=600)
fig.show()

In [None]:
# Winter: medal rows per (sport, medal), pivoted to one row per sport with
# Gold / Silver / Bronze columns, sorted by Gold descending.
winter_sport_counts = winter_games.groupby(['Sport', 'Medal'])['Athlete'].count().reset_index()
sport_medals_winter = (
    winter_sport_counts
    .pivot_table(index='Sport', columns='Medal', values='Athlete')
    .reindex(columns=['Gold', 'Silver', 'Bronze'])
    .sort_values('Gold', ascending=False)
)
sport_medals_winter.head()

In [None]:
# Grouped bars for every sport in the Gold-sorted winter table.
medal_colours = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
axis_labels = {"Country_name": "Top 10 Countries", "value": "Medal Wins"}
fig = px.bar(sport_medals_winter,
             title='Winter Olympics : Sports Medal distribution sorted by Gold Medals',
             labels=axis_labels,
             color_discrete_map=medal_colours,
             barmode='group',
             orientation='v',
             template='ggplot2',
             width=1000,
             height=600)
fig.show()

## Trend of Medals achieved per year by athletes across the years :

### Summer Trend :

In [None]:
# Summer: medal rows per (year, medal), pivoted to one row per year with one
# column per medal, then drawn as one line per medal.
summer_yearly_counts = summer_games.groupby(['Year', 'Medal'])['Country'].count().reset_index()
medals_trend = summer_yearly_counts.pivot_table(index='Year',
                                                columns='Medal',
                                                values='Country')

medal_colours = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
fig = px.line(medals_trend,
              title="Trend of Medals achieved per year by athletes in Summer Olympics",
              hover_name='value',
              color_discrete_map=medal_colours,
              template='ggplot2')
# x ticks every 5 years starting at 1890.
year_ticks = list(range(1890, 2014, 5))
fig.update_xaxes(tickvals=year_ticks, gridcolor='#A1BDCE')
fig.update_yaxes(title_text='Medals count')

fig.show()

### Winter Trend :

In [None]:
# Winter: medal rows per (year, medal), pivoted to one row per year with one
# column per medal, then drawn as one line per medal.
winter_yearly_counts = winter_games.groupby(['Year', 'Medal'])['Country'].count().reset_index()
medals_trend = winter_yearly_counts.pivot_table(index='Year',
                                                columns='Medal',
                                                values='Country')

medal_colours = {"Gold": "#FFCC00", "Silver": "#CCCCCC", "Bronze": "#CC6633"}
fig = px.line(medals_trend,
              title="Trend of Medals achieved per year by athletes in Winter Olympics",
              hover_name='value',
              color_discrete_map=medal_colours,
              template='ggplot2')
# x ticks every 5 years starting at 1920.
year_ticks = list(range(1920, 2016, 5))
fig.update_xaxes(tickvals=year_ticks, gridcolor='#A1BDCE')
fig.update_yaxes(title_text='Medals count')
fig.show()

## Overall Medal country wise distribution for Summer and Winter Olympic Games over all the years :

### Summer :

#### Country wise Medal Distribution Per Year (slider) :

In [None]:
# Summer: medal rows per (year, country), drawn as a world map animated over Year.
# NOTE(review): no `locationmode` is passed, so plotly's default applies.
# Confirm the codes in `Country` are in the format that mode expects.
medals_by_year_country = summer_games.groupby(['Year', 'Country'])['Medal'].count()
country_medal_overall = medals_by_year_country.reset_index()
fig = px.choropleth(country_medal_overall,
                    locations="Country",
                    color='Medal',
                    hover_name='Medal',
                    animation_frame='Year',
                    title="Country wise Medal Distribution Per Year (Summer Olympic Games)",
                    color_continuous_scale=px.colors.sequential.Sunsetdark,
                    template='plotly_white')

fig.show()

#### Overall Medal country wise distribution for Summer Olympic Games over all the years :

In [None]:
# Summer: medal rows per country over all years, drawn as a world map.
# NOTE(review): no `locationmode` is passed, so plotly's default applies.
# Confirm the codes in `Country` are in the format that mode expects.
medals_by_country = summer_games.groupby('Country')['Medal'].count()
country_medal_overall = medals_by_country.reset_index()
fig = px.choropleth(country_medal_overall,
                    locations="Country",
                    color='Medal',
                    hover_name='Medal',
                    title="Overall Medal country wise distribution for Summer Olympic Games over all the years",
                    color_continuous_scale=px.colors.sequential.Sunsetdark,
                    template='plotly_white')

fig.show()

### Winter :

#### Country wise Medal Distribution Per Year(slider) :

In [None]:
# Winter: medal rows per (year, country), drawn as a world map animated over Year.
# NOTE(review): no `locationmode` is passed, so plotly's default applies.
# Confirm the codes in `Country` are in the format that mode expects.
country_medal_overall = pd.DataFrame(winter_games.groupby(['Year','Country'])['Medal'].count()).reset_index()
fig = px.choropleth(
    country_medal_overall,
    locations="Country",
    color='Medal',
    # Same hover heading as the Summer per-year map and both overall maps.
    hover_name='Medal',
    animation_frame='Year',
    template='plotly_white',
    title="Country wise Medal Distribution Per Year (Winter Olympic Games)",
    color_continuous_scale=px.colors.sequential.Sunsetdark)

fig.show()

#### Overall Medal country wise distribution for Winter Olympic Games over all the years :

In [None]:
# Winter: medal rows per country over all years, drawn as a world map.
# NOTE(review): no `locationmode` is passed, so plotly's default applies.
# Confirm the codes in `Country` are in the format that mode expects.
medals_by_country = winter_games.groupby('Country')['Medal'].count()
country_medal_overall = medals_by_country.reset_index()
fig = px.choropleth(country_medal_overall,
                    locations="Country",
                    color='Medal',
                    hover_name='Medal',
                    title="Overall Medal country wise distribution for Winter Olympic Games over all the years",
                    color_continuous_scale=px.colors.sequential.Sunsetdark,
                    template='plotly_white')
fig.show()

*You have reached the end of this Notebook!*
<hr>