# Example 6 - Olympic Efficiency Dashboard

In [20]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from six import with_metaclass

In [21]:
# Directory that holds the CSV. Set the OLYMPICS_DATA_DIR environment variable
# to run this notebook on another machine; when it is unset, the original local
# path is used so existing setups keep working unchanged.
DEFAULT_DATA_DIR = 'C:/Users/viole/dev/analytics/kaggle/olympics-data-analysis/data'
data_dir = Path(os.environ.get('OLYMPICS_DATA_DIR', DEFAULT_DATA_DIR))

file_path = data_dir / 'olympic_countries_efficiency.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,NOC,ISO3,Year,population,gdp_per_capita,income_group,host_country,athletes_sent,sports_participated,events_participated,female_athlete_percentage,prev_total_medals,prev_medals_per_athlete,Gold,Silver,Bronze,total_medals,medals_per_athlete
0,AFG,AFG,2004,23560654.0,221.763654,Low income,0,5,4,5,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AFG,AFG,2008,26482622.0,381.733238,Low income,0,4,2,4,25.0,0.0,0.0,0.0,0.0,1.0,1.0,0.25
2,AFG,AFG,2012,30560034.0,651.417134,Low income,0,6,4,6,16.666667,1.0,0.25,0.0,0.0,1.0,1.0,0.166667
3,AFG,AFG,2016,34700612.0,522.082216,Low income,0,3,2,3,33.333333,1.0,0.166667,0.0,0.0,0.0,0.0,0.0
4,ALB,ALB,1992,3247039.0,200.85222,Low income,0,7,4,8,22.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Show the column labels of the loaded frame
df.columns

Index(['NOC', 'ISO3', 'Year', 'population', 'gdp_per_capita', 'income_group',
       'host_country', 'athletes_sent', 'sports_participated',
       'events_participated', 'female_athlete_percentage', 'prev_total_medals',
       'prev_medals_per_athlete', 'Gold', 'Silver', 'Bronze', 'total_medals',
       'medals_per_athlete'],
      dtype='str')

In [23]:
# Narrow the frame to the columns this analysis uses and take a copy of that selection
columns_needed = [
    'NOC', 'Year', 'population', 'gdp_per_capita',
    'athletes_sent', 'total_medals', 'medals_per_athlete',
]
df_small = df.loc[:, columns_needed].copy()

df_small.head()

Unnamed: 0,NOC,Year,population,gdp_per_capita,athletes_sent,total_medals,medals_per_athlete
0,AFG,2004,23560654.0,221.763654,5,0.0,0.0
1,AFG,2008,26482622.0,381.733238,4,1.0,0.25
2,AFG,2012,30560034.0,651.417134,6,1.0,0.166667
3,AFG,2016,34700612.0,522.082216,3,0.0,0.0
4,ALB,1992,3247039.0,200.85222,7,0.0,0.0


In [24]:
# Year to plot; the charts in this notebook are drawn from this single-year slice
year_selected = 2016

is_selected_year = df_small['Year'].eq(year_selected)
df_year = df_small.loc[is_selected_year].copy()

df_year.head()

Unnamed: 0,NOC,Year,population,gdp_per_capita,athletes_sent,total_medals,medals_per_athlete
3,AFG,2016,34700612.0,522.082216,3,0.0,0.0
10,ALB,2016,2689469.0,4457.634122,6,0.0,0.0
21,AND,2016,72181.0,40129.838581,4,0.0,0.0
35,ARG,2016,43900313.0,12699.962314,215,22.0,0.102326
41,ARM,2016,2992300.0,3524.424769,31,4.0,0.129032


In [25]:
# Add the derived metrics for the selected year in a single step.
# NOTE: the key 'log_gp_per_capita' (sic: "gp", not "gdp") is kept exactly as spelled.
df_year = df_year.assign(
    # natural log of population
    log_population=np.log(df_year['population']),
    # natural log of GDP per capita
    log_gp_per_capita=np.log(df_year['gdp_per_capita']),
    # total medals per one million of population
    medals_per_population_millions=df_year['total_medals'] / (df_year['population'] / 1_000_000),
    # total medals divided by GDP per capita
    medals_per_gdp_per_capita=df_year['total_medals'] / df_year['gdp_per_capita'],
)

df_year.head()

Unnamed: 0,NOC,Year,population,gdp_per_capita,athletes_sent,total_medals,medals_per_athlete,log_population,log_gp_per_capita,medals_per_population_millions,medals_per_gdp_per_capita
3,AFG,2016,34700612.0,522.082216,3,0.0,0.0,17.362268,6.257825,0.0,0.0
10,ALB,2016,2689469.0,4457.634122,6,0.0,0.0,14.804854,8.402373,0.0,0.0
21,AND,2016,72181.0,40129.838581,4,0.0,0.0,11.186932,10.599875,0.0,0.0
35,ARG,2016,43900313.0,12699.962314,215,22.0,0.102326,17.597432,9.449354,0.501135,0.001732
41,ARM,2016,2992300.0,3524.424769,31,4.0,0.129032,14.911553,8.167473,1.336764,0.001135


In [26]:
# Inspect the row(s) whose NOC code is 'CAN' for the selected year
can_mask = df_year['NOC'].eq('CAN')
df_year.loc[can_mask]

Unnamed: 0,NOC,Year,population,gdp_per_capita,athletes_sent,total_medals,medals_per_athlete,log_population,log_gp_per_capita,medals_per_population_millions,medals_per_gdp_per_capita
183,CAN,2016,36110803.0,42314.061582,310,69.0,0.222581,17.402103,10.652875,1.910786,0.001631


## Medals vs Population (log scale)

In [27]:
# Bubble chart: total medals against log population, marker size driven by GDP per capita
population_title = f"Medals vs Log Population ({year_selected})"

fig = px.scatter(
    data_frame=df_year,
    x='log_population',
    y='total_medals',
    size='gdp_per_capita',
    hover_name='NOC',
    title=population_title,
)
fig.show()

In [28]:
# Bubble chart: total medals against log GDP per capita, marker size driven by population.
# x uses the precomputed log column (its key is spelled 'log_gp_per_capita' where it is
# created) so the axis actually matches the "Log GDP" title, the same way the population
# chart plots its precomputed log column. The labels mapping only changes the displayed
# axis/hover text, not the data.
fig = px.scatter(
    df_year,
    x='log_gp_per_capita',
    y='total_medals',
    size='population',
    hover_name='NOC',
    labels={'log_gp_per_capita': 'log(GDP per capita)'},
    title=f"Medals vs Log GDP ({year_selected})",
)

fig.show()

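Countries with larger populations tend to win more medals, as do countries with higher GDP per capita. In the "Medals vs Log GDP" graph, dot size encodes population, and the dots on the right side are very small: these are wealthy countries with small populations that still win medals, which suggests that GDP per capita may be more important than population.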