In [None]:
#Import the appropriate libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from scipy.stats import zscore
import seaborn as sns

# Installs scipy into the kernel's environment; this can be removed after the first run.
# NOTE(review): this line runs AFTER `from scipy.stats import zscore` in the same cell, so on an
# environment without scipy the import fails before the install is reached. Consider moving the
# install into its own cell ahead of the imports (and pinning a version).
%pip install scipy


Objective
The goal is to investigate whether or not there is a "home field advantage" for a nation that hosts the Olympic Games. This analysis will study the Chinese Olympic team, and try to determine if there was an unusually high improvement in performance during the 2008 Summer Olympic Games in Beijing.

Metrics to Produce
We will look at the following metrics in order to check for a host advantage, considering medal count as the measure for success in an Olympic Games:

Medal count trends:
Did the total medal count increase during the hosted Olympics?
How many standard deviations (z-score) from the average medal count across all Olympic Games was the medal count for the hosted Games?
Where did the host country place on the total medal rankings in their hosted games vs. other games?
How did the host country's performance metrics compare to the rest of the world's metrics? Does that tell us anything?

In [None]:
# Load the athlete-level results CSV into a dataframe.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
olympics_path = "resources/athlete_events.csv"
olympics_df = pd.read_csv(filepath_or_buffer=olympics_path, low_memory=False)

# Preview the first 5 rows
olympics_df.head(n=5)

In [None]:
# Keep only the rows whose Season is 'Summer'; the rest of the analysis uses this subset
summer_df = olympics_df[olympics_df['Season'].eq('Summer')]


In [None]:
# Create variables for the specific host country.
# Every value below is a placeholder and must be replaced before the later cells are run.
host_country = 'INSERT HOST COUNTRY HERE'
host_NOC = 'INSERT HOST NOC HERE'
host_year = 'INSERT HOST YEAR HERE'
host_city = 'INSERT HOST CITY HERE'

# Create a dataframe for only the host country, where the host value appears anywhere in the
# team name (case-insensitive).
#   - na=False: a missing Team value counts as "no match" instead of producing a NaN in the
#     boolean mask, which cannot be used for indexing.
#   - .copy(): host_df becomes an independent frame, so the Team clean-up below writes to
#     host_df itself rather than to a slice of summer_df (avoids SettingWithCopyWarning).
host_df = summer_df[summer_df['Team'].str.contains(host_country, case=False, na=False)].copy()

# Print the unique values of various host columns to see if they are all related to the host
# country we are looking for
print(host_df['Team'].unique())
print(host_df['NOC'].unique())
print(host_df['City'].unique())

# Clean up the Team column to ensure a single value for the host
host_df.loc[:, 'Team'] = host_country
print(host_df['Team'].unique())

# It looks like NOC is a more accurate way to describe the country's team,
# and doesn't pose as many issues as the Team column does (e.g. "China-1", "China-2", etc.)
# Let's use NOC instead of Team for the rest of the analysis.

In [None]:
# Create a dataframe of every Summer Olympics row whose NOC is not the host country's NOC.
# .copy() makes this an independent frame, so the Team clean-up below modifies
# rest_of_world_df itself rather than a slice of summer_df (avoids SettingWithCopyWarning).
rest_of_world_df = summer_df[summer_df['NOC'] != host_NOC].copy()

# Clean up the team names to have only the country name:
# if "-" appears in the team name, keep only the part of the string that comes before the first "-"
rest_of_world_df.loc[:, 'Team'] = rest_of_world_df['Team'].str.split('-').str[0]

In [None]:
# Create a dataframe from rest_of_world_df (referred to now as rw) with one row per (Games, NOC),
# holding the medal counts for each country in each Games.
#
# All counts are produced by a single groupby so that every count stays attached to its own
# (Games, NOC) pair. Counting each medal colour in a separate groupby and assigning the results
# by position would misalign them, because only the countries that won that colour appear in
# the per-colour result.
#
# Columns produced:
#   Medal          - number of non-null Medal rows
#   gold_medals    - number of rows where Medal == 'Gold'
#   silver_medals  - number of rows where Medal == 'Silver'
#   bronze_medals  - number of rows where Medal == 'Bronze'
#   total_medals   - gold_medals + silver_medals + bronze_medals
games_df_rw = (
    rest_of_world_df
    .assign(
        # 1 for a row that won the given medal, 0 otherwise (rows with no medal compare False)
        gold_medals=lambda df: df['Medal'].eq('Gold').astype(int),
        silver_medals=lambda df: df['Medal'].eq('Silver').astype(int),
        bronze_medals=lambda df: df['Medal'].eq('Bronze').astype(int),
    )
    .groupby(['Games', 'NOC'])
    .agg(
        Medal=('Medal', 'count'),
        gold_medals=('gold_medals', 'sum'),
        silver_medals=('silver_medals', 'sum'),
        bronze_medals=('bronze_medals', 'sum'),
    )
    .reset_index()
)

# Create a clean column for total medals.
# 'Medal' is deliberately not added here: it already counts every medal row, so including it
# would count each medal twice.
games_df_rw['total_medals'] = games_df_rw[['gold_medals', 'silver_medals', 'bronze_medals']].sum(axis=1)

# Format the count columns as integers (countries with no medals already hold 0, not NaN)
games_df_rw = games_df_rw.astype({
    'Medal': 'int',
    'gold_medals': 'int',
    'silver_medals': 'int',
    'bronze_medals': 'int',
    'total_medals': 'int',
})

# View the dataframe
games_df_rw

In [None]:
# Per-Games medal tallies for the host country: one row per Games, indexed by Games.
# 'Medal' is the number of non-null Medal rows in that Games.
games_df_host = host_df.groupby('Games').agg({'Medal': 'count'})

# One column per medal colour. Each count is indexed by Games, so the assignment aligns on the
# Games index; a Games with no medal of that colour is left as NaN until the fill below.
for medal_name, column_name in (
    ('Gold', 'gold_medals'),
    ('Silver', 'silver_medals'),
    ('Bronze', 'bronze_medals'),
):
    won_this_medal = host_df['Medal'] == medal_name
    games_df_host[column_name] = host_df[won_this_medal].groupby('Games')['Medal'].count()

# Missing counts mean "none won": replace NaN with 0, then store every column as an integer
games_df_host = games_df_host.fillna(0).astype(int)

# View the dataframe
games_df_host

In [None]:
# Create a cleaner "total_medals" column: the sum of the three per-colour counts
games_df_host['total_medals'] = games_df_host[['gold_medals', 'silver_medals', 'bronze_medals']].sum(axis=1)

# Add a column for non-medal participations: the number of rows per Games where Medal is NaN.
# The counts are reindexed onto games_df_host's index with fill_value=0 so that a Games in which
# every row has a medal is recorded as 0 rather than NaN (which would also turn the column
# into floats).
games_df_host['non_medal_participations'] = (
    host_df[host_df['Medal'].isnull()]
    .groupby('Games')
    .size()
    .reindex(games_df_host.index, fill_value=0)
    .astype(int)
)

# Add a column for total participations: the number of rows per Games,
# whether or not a medal was won
games_df_host['total_participations'] = host_df.groupby('Games').size()
games_df_host

In [None]:
# medal_rate = total_medals divided by total_participations, computed per Games
games_df_host['medal_rate'] = games_df_host['total_medals'].div(games_df_host['total_participations'])
games_df_host

In [None]:
# Share of each medal colour among the medals won in a Games.
# The values are plain ratios (no multiplication by 100); a Games with total_medals == 0
# yields NaN for the three medal shares.
medals_won = games_df_host['total_medals']
games_df_host['gold_percentage'] = games_df_host['gold_medals'].div(medals_won)
games_df_host['silver_percentage'] = games_df_host['silver_medals'].div(medals_won)
games_df_host['bronze_percentage'] = games_df_host['bronze_medals'].div(medals_won)

# Share of participations that did not result in a medal
games_df_host['non_medal_percentage'] = games_df_host['non_medal_participations'].div(
    games_df_host['total_participations']
)

games_df_host

In [None]:
# Plot the gold_medals, silver_medals and bronze_medals for the host country in the Summer Olympics
# as a stacked bar chart: one bar per Games, whose total height is that Games' medal count.
# (Non-medal participations are not part of this chart.)
# Colours: #ffd700 for gold, #c0c0c0 for silver and #cd7f32 for bronze.
# The title is built from host_country so it follows whatever host was configured earlier.
games_df_host[['gold_medals', 'silver_medals', 'bronze_medals']].plot(
    kind='bar',
    stacked=True,
    figsize=(20, 10),
    color=['#ffd700', '#c0c0c0', '#cd7f32'],
    title=f'Medals Won by {host_country} in the Summer Olympics',
)

In [None]:
# Create a different visualization for the same data: one line per medal colour across Games
sns.set_theme(style="whitegrid")

# use #ffd700 for gold, #c0c0c0 for silver and #cd7f32 for bronze
plt.figure(figsize=(20, 10))
data = games_df_host[['gold_medals', 'silver_medals', 'bronze_medals']]
palette = {'gold_medals': '#ffd700', 'silver_medals': '#c0c0c0', 'bronze_medals': '#cd7f32'}
sns.lineplot(data=data, palette=palette, linewidth=4.5, dashes=False)

# Rotate the tick labels after plotting, once the Games labels exist on the axis
plt.xticks(rotation='vertical')

# The title is built from host_country so it follows whatever host was configured earlier
plt.title(f'Medals Won by {host_country} in the Summer Olympics')

# Save image for use in slides
plt.savefig('resources/medals_won_host_linegraph.png')

In [None]:
# Use a line chart to plot the medal_rate for the host country in the Summer Olympics.
# The title is built from host_country so it follows whatever host was configured earlier.
games_df_host['medal_rate'].plot(
    kind='line',
    figsize=(20, 10),
    title=f'Medal Rate for {host_country} in the Summer Olympics',
)

In [None]:
# Use a line chart to plot the medal count for the host country in the Summer Olympics.
# A title is set so the figure can be read on its own, matching the other charts.
games_df_host['total_medals'].plot(
    kind='line',
    figsize=(20, 10),
    title=f'Total Medals Won by {host_country} in the Summer Olympics',
)

In [None]:
# Mean total medal count per Games for the host country, over every Summer Games in games_df_host
average_medal_count_host = games_df_host['total_medals'].mean()
print(average_medal_count_host)

# Same mean, restricted to the Games in which at least one medal was won
average_medal_count_host_won = games_df_host.loc[
    games_df_host['total_medals'].gt(0), 'total_medals'
].mean()
print(average_medal_count_host_won)

In [None]:
# Display the games_df_host row for the host-year Summer Games.
# The index label has the form '<year> Summer'.
games_df_host.loc[str(host_year) + ' Summer']



In [None]:
# Add a 'z_scores' column: the z-score of each Games' total_medals, computed over all Games
# in games_df_host (host Games included).
games_df_host['z_scores'] = zscore(a=games_df_host['total_medals'])
games_df_host

In [None]:
# Histogram (10 bins, with a KDE curve) of the host country's total medal count per Summer Games
sns.histplot(data=games_df_host, x='total_medals', bins=10, kde=True)

In [None]:
# Baseline: every Games in games_df_host except the one the host country hosted.
# Mean total medal count over that baseline:
mean_total_medals_host = games_df_host[games_df_host.index != f'{host_year} Summer']['total_medals'].mean()
print(mean_total_medals_host)

# z-score of each baseline Games relative to the other baseline Games (host Games excluded)
z_score_total_medals_host = zscore(games_df_host[games_df_host.index != f'{host_year} Summer']['total_medals'])
print(z_score_total_medals_host)

# z-score of the host Games' medal count measured against the baseline above, i.e. how many
# baseline standard deviations the hosted Games sit from the baseline mean.
# ddof=0 (population standard deviation) is used so the scale matches zscore()'s default.
# Left as NaN when the host Games are not in the index or the baseline has no spread.
host_year_z_score = float('nan')
if f'{host_year} Summer' in games_df_host.index:
    std_total_medals_baseline = games_df_host[games_df_host.index != f'{host_year} Summer']['total_medals'].std(ddof=0)
    if std_total_medals_baseline > 0:
        host_year_z_score = (
            games_df_host.loc[f'{host_year} Summer', 'total_medals'] - mean_total_medals_host
        ) / std_total_medals_baseline
print(host_year_z_score)

In [13]:
# Rest-of-world rows for the host-year Summer Games, ordered from most to fewest medals
summer_host_yr_rw = (
    games_df_rw[games_df_rw['Games'] == f'{host_year} Summer']
    .sort_values(by='Medal', ascending=False)
)

# Narrow view holding only the Games, NOC and Medal columns
summer_host_yr_rw_short = summer_host_yr_rw.loc[:, ['Games', 'NOC', 'Medal']]
print('Medal Count for the Rest of the World:')
summer_host_yr_rw_short.head()

[INSERT COUNTRY] {host_year} Medal Count for the Rest of the World:


Unnamed: 0,Games,NOC,Medal


In [None]:
# Mean of the 'Medal' column over the rest-of-world rows for the host-year Summer Games
# (one row per NOC, so this is the average medal count per country)
average_medal_count_rw_host_yr = summer_host_yr_rw.loc[:, 'Medal'].mean()
average_medal_count_rw_host_yr