### Comparing medal counts between the Rio and Tokyo Olympics

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re        
from scipy.stats import linregress
import matplotlib.pyplot as plt

Data sources:

https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results

https://www.kaggle.com/jcsantiago/covid19-by-country-with-government-response

https://www.kaggle.com/stefanzivanov/olympic-games-2021-medals

https://www.kaggle.com/tanuprabhu/population-by-country-2020


### medal 2021

In [None]:
# Tokyo medal table: one row per team/NOC.
dm = pd.read_csv(
    filepath_or_buffer='/kaggle/input/olympic-games-2021-medals/Tokyo 2021 dataset.csv'
)

### Top 10 teams in Tokyo

In [None]:
# First ten rows of the Tokyo medal table.
dm.iloc[:10]

### Rio medal

In [None]:
# Build the Rio 2016 medal table (one row per NOC) from athlete-level results.
rio = pd.read_csv('../input/120-years-of-olympic-history-athletes-and-results/athlete_events.csv')

# .copy() gives an independent frame, so the indicator columns added below are
# not written to a slice of the full history (avoids SettingWithCopyWarning).
rio = rio[rio['Year'] == 2016].copy()

# 0/1 indicator per medal colour for every athlete row.
medal_cols = ['gold_rio', 'silver_rio', 'bronze_rio']
for col, medal in zip(medal_cols, ['Gold', 'Silver', 'Bronze']):
    rio[col] = np.where(rio['Medal'] == medal, 1, 0)

# Collapse to at most one medal of each colour per (NOC, Event) before summing,
# so an event with several medal rows for the same NOC is counted once.
rio = rio.groupby(['NOC', 'Event'])[medal_cols].max().reset_index()

# Medal totals per NOC; the gold sort is kept so that ties in the final
# ordering resolve exactly as they did before.
rio = rio.groupby('NOC')[medal_cols].sum().reset_index().sort_values('gold_rio', ascending=False)

# Use the same key column name as the Tokyo table and map the 'RUS' code to
# 'ROC' so the two tables can be merged on NOCCode.
rio = rio.rename(columns={'NOC': 'NOCCode'})
rio.loc[rio['NOCCode'] == 'RUS', 'NOCCode'] = 'ROC'

# Keep medal-winning teams only, ordered gold > silver > bronze.
rio['total_rio'] = rio['gold_rio'] + rio['silver_rio'] + rio['bronze_rio']
rio = rio[rio['total_rio'] > 0].reset_index(drop=True)
rio = rio.sort_values(medal_cols, ascending=False).reset_index(drop=True)

### Top 10 teams in Rio

In [None]:
# First ten rows of the Rio medal table (already sorted gold > silver > bronze).
rio.iloc[:10]

### population data

In [None]:
# Population-dataset country name -> team name used in the Tokyo medal table.
dict_name = {
    'China': "People's Republic of China",
    'United States': 'United States of America',
    'Russia': 'ROC',
    'United Kingdom': 'Great Britain',
    'South Korea': 'Republic of Korea',
    'North Korea': "People's Republic of Korea",
    'Czech Republic (Czechia)': 'Czech Republic',
    'Hong Kong': 'Hong Kong, China',
    'Taiwan': 'Chinese Taipei',
    'Iran': 'Islamic Republic of Iran',
    'Moldova': 'Republic of Moldova',
}

dp = pd.read_csv("/kaggle/input/population-by-country-2020/population_by_country_2020.csv")
dp = dp.rename(columns={'Country (or dependency)': 'Country'})

# Team/NOC is the merge key: the mapped team name where one exists,
# otherwise the country name unchanged.
dp['Team/NOC'] = dp['Country'].replace(dict_name)

# Hand-entered rows for two teams that are not matched by the mapping above.
# NOTE(review): the Syrian Arab Republic figures (1_750_000 people, 118 km²)
# look like a dropped digit and a population density rather than a land
# area - verify against the cited Wikipedia page before relying on the
# per-area / per-capita rankings derived from them.
manual_rows = pd.DataFrame({
    'Team/NOC': ['Kosovo', 'Syrian Arab Republic'],
    'Population (2020)': [1_800_000, 1_750_000],
    'Land Area (Km²)': [10887, 118],
})

# ignore_index=True keeps the row labels unique after appending.
dp = pd.concat([dp, manual_rows], ignore_index=True)

In [None]:
#source:
#https://en.wikipedia.org/wiki/Syria
#https://en.wikipedia.org/wiki/Kosovo

### Covid-19 data (as of 27 Jul 2021)

In [None]:
dc = pd.read_csv('../input/covid19-by-country-with-government-response/covid19_by_country.csv')

# Keep only the last row of each country: 'ilast' counts rows from the end of
# each country's group, so 0 marks the final one.
# NOTE(review): this assumes the file lists each country's rows in
# chronological order - confirm against the dataset.
dc['ilast'] = dc.groupby('Country').cumcount(ascending=False)

# .copy() so the Team/NOC column below is written to an independent frame
# rather than to a slice of the full table (avoids SettingWithCopyWarning).
dc = dc[dc['ilast'] == 0].copy()

# Covid-dataset country name -> team name used in the Tokyo medal table.
dict_name = {
    'China': "People's Republic of China",
    'US': 'United States of America',
    'Russia': 'ROC',
    'United Kingdom': 'Great Britain',
    'Korea, South': 'Republic of Korea',
    'North Korea': "People's Republic of Korea",
    'Czechia': 'Czech Republic',
    'Hong Kong': 'Hong Kong, China',
    'Taiwan': 'Chinese Taipei',
    'Iran': 'Islamic Republic of Iran',
    'Moldova': 'Republic of Moldova',
}

# Team/NOC is the merge key: the mapped team name where one exists,
# otherwise the country name unchanged.
dc['Team/NOC'] = dc['Country'].replace(dict_name)

dc = dc.sort_values('confirmed_PopPct', ascending=False).reset_index(drop=True)

### Medal difference

#### -- includes every team that won 3 or more medals in total across the two Olympics combined

#### -- medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100

In [None]:
# Tokyo and Rio medal tables side by side; the outer join keeps teams that won
# medals at only one of the two Games.
df_diff = pd.merge(dm, rio, on='NOCCode', how='outer')

# Later cells treat df_diff.head(10) as the ten best Tokyo teams. The row order
# of an outer merge is not guaranteed to follow `dm` (pandas documents it as
# lexicographic by key), so order by Tokyo rank explicitly. The stable sort
# keeps tied ranks in merge order; teams without a Tokyo rank go last.
df_diff = df_diff.sort_values('Rank', kind='mergesort', na_position='last')

# PRK has no Tokyo row to take a team name from, so set it by hand.
df_diff.loc[df_diff['NOCCode'] == 'PRK', 'Team/NOC'] = "People's Republic of Korea"

# A missing medal count means "no medals at that Games".
medal_cols = ['Gold Medal', 'Silver Medal', 'Bronze Medal', 'Total',
              'gold_rio', 'silver_rio', 'bronze_rio', 'total_rio']
df_diff[medal_cols] = df_diff[medal_cols].fillna(0)

# Keep teams with at least 3 medals over Rio + Tokyo combined. .copy() makes
# the result an independent frame, so the new columns below are not written to
# a slice (avoids SettingWithCopyWarning).
df_diff = df_diff[df_diff['total_rio'] + df_diff['Total'] >= 3].copy()

df_diff['total_diff'] = df_diff['Total'] - df_diff['total_rio']
df_diff['gold_diff'] = df_diff['Gold Medal'] - df_diff['gold_rio']

# medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100
# The +5 keeps the denominator non-zero for teams with no Rio medals.
df_diff['total_diff_pct'] = (df_diff['total_diff'] / (df_diff['total_rio'] + 5) * 100).round(1)
df_diff['gold_diff_pct'] = (df_diff['gold_diff'] / (df_diff['gold_rio'] + 5) * 100).round(1)

### Top 10 teams

In [None]:
# First ten rows of the comparison table, shown in Tokyo rank order.
df_diff.iloc[:10].sort_values(by='Rank')

In [None]:
# Tokyo total vs Rio total; points above the dashed diagonal improved.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Tokyo Total vs Rio Total')
ax.scatter(df_diff['total_rio'], df_diff['Total'])

diagonal = np.arange(120)
ax.plot(diagonal, diagonal, linestyle='--')

# Name the first ten rows, nudging the text up and to the left of each point.
dlabel = df_diff.head(10)
label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['total_rio'].values,
                   dlabel['Total'].values)
for team, rio_total, tokyo_total in label_points:
    ax.annotate(team, (rio_total - 5, tokyo_total + 1))

ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (30, 0))

In [None]:
# Tokyo gold vs Rio gold; points above the dashed diagonal improved.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Tokyo Gold vs Rio Gold')
ax.scatter(df_diff['gold_rio'], df_diff['Gold Medal'])

diagonal = np.arange(40)
ax.plot(diagonal, diagonal, linestyle='--')

# Labels for the first ten rows. The Netherlands label is shifted up by one
# (on the copy only) to move its text position.
dlabel = df_diff.head(10).copy()
dlabel.loc[df_diff['Team/NOC'] == 'Netherlands', 'Gold Medal'] += 1

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['gold_rio'].values,
                   dlabel['Gold Medal'].values)
for team, rio_gold, tokyo_gold in label_points:
    ax.annotate(team, (rio_gold - 5, tokyo_gold + 1))

ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (10, 0))

### Most improved teams in Gold

In [None]:
# Ten largest gold-medal change percentages.
df_diff.sort_values(by='gold_diff_pct', ascending=False).iloc[:10]

### Most improved teams in Total

In [None]:
# Ten largest total-medal change percentages.
df_diff.sort_values(by='total_diff_pct', ascending=False).iloc[:10]

### Covid-19 effects

In [None]:
# Attach population and Covid-19 figures to every team in the comparison table.
# Left joins keep all rows of df_diff; unmatched teams get NaN.
covid_cols = ['Team/NOC', 'confirmed_PopPct', 'confirmed']
df = (
    df_diff
    .merge(dp, on='Team/NOC', how='left')
    .merge(dc[covid_cols], on='Team/NOC', how='left')
)

In [None]:
#handle NA data

# Confirmed-case counts entered by hand (as of 10-aug-2021) for teams filled in
# manually here.
# Hong Kong: https://news.google.com/covid19/map?hl=en-US&mid=%2Fm%2F03h64&gl=US&ceid=US%3Aen
# Côte d'Ivoire: https://www.worldometers.info/coronavirus/country/cote-d-ivoire/
manual_confirmed = {'Hong Kong, China': 12013, "Côte d'Ivoire": 51399}
for team, cases in manual_confirmed.items():
    df.loc[df['Team/NOC'] == team, 'confirmed'] = cases

# Derive the missing share of population from cases / population. The column
# is on a percent scale (the plots place text at x=6 on this axis), so the
# ratio is multiplied by 100; without that the back-filled teams would be
# understated by a factor of 100.
missing_pct = df['confirmed_PopPct'].isnull()
df.loc[missing_pct, 'confirmed_PopPct'] = df['confirmed'] / df['Population (2020)'] * 100

# Drop teams that still have no value (no data for north korea). .copy() makes
# the result an independent frame so the rounding below is not written to a
# slice (avoids SettingWithCopyWarning).
df = df[df['confirmed_PopPct'].notnull()].copy()

df['confirmed_PopPct'] = df['confirmed_PopPct'].round(5)

### Teams most impacted by Covid-19

In [None]:
# Ten teams with the highest confirmed-case share of population.
cols = [
    'Rank', 'Team/NOC',
    'Gold Medal', 'gold_rio', 'gold_diff',
    'Total', 'total_rio', 'total_diff',
    'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct',
]
most_impacted = df.sort_values(by='confirmed_PopPct', ascending=False)
most_impacted[cols].head(10)

### Teams least impacted by Covid-19

In [None]:
# Ten teams with the lowest confirmed-case share of population.
least_impacted = df.sort_values(by='confirmed_PopPct')
least_impacted[[
    'Rank', 'Team/NOC',
    'Gold Medal', 'gold_rio', 'gold_diff',
    'Total', 'total_rio', 'total_diff',
    'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct',
]].head(10)

In [None]:
#label best 10 , highest confirmed and lowest confirmed
# Teams to name on the scatter plots: the first ten rows of df plus the ten
# lowest and ten highest confirmed-case shares.
by_covid_share = df.sort_values('confirmed_PopPct')
best_tokyo = df.head(10).copy()
lowest_confirmed = by_covid_share.head(10).copy()
highest_confirmed = df.sort_values('confirmed_PopPct', ascending=False).head(10).copy()

cols = [
    'Rank', 'Team/NOC', 'Gold Medal', 'Total', 'gold_rio', 'total_rio',
    'total_diff', 'gold_diff', 'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct', 'confirmed',
]

picked = pd.concat([best_tokyo, lowest_confirmed, highest_confirmed])
labels = picked['Team/NOC'].unique()
dlabel = df[df['Team/NOC'].isin(labels)].copy()

# Shift Japan's label down by one (on the copy only) to move its text position.
is_japan = df['Team/NOC'] == 'Japan'
dlabel.loc[is_japan, 'total_diff'] -= 1

In [None]:
# Absolute change in total medals against confirmed-case share of population.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Olympic Medal change vs confirmed covid-19 population %')
ax.scatter(df['confirmed_PopPct'], df['total_diff'])

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['confirmed_PopPct'].values,
                   dlabel['total_diff'].values)
for team, x, y in label_points:
    ax.annotate(team, (x, y))

ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (6, -12))

In [None]:
# Percentage change in total medals against confirmed-case share (linear x axis).
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Medal change vs confirmed covid-19 population %')
ax.scatter(df['confirmed_PopPct'], df['total_diff_pct'])

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['confirmed_PopPct'].values,
                   dlabel['total_diff_pct'].values)
for team, x, y in label_points:
    ax.annotate(team, (x, y))

ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (6, -50))

In [None]:
# Percentage change in total medals against confirmed-case share (log x axis).
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Olympic Medal change vs confirmed covid-19 population %(log x scale)')
ax.scatter(df['confirmed_PopPct'], df['total_diff_pct'])

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['confirmed_PopPct'].values,
                   dlabel['total_diff_pct'].values)
for team, x, y in label_points:
    ax.annotate(team, (x, y))

ax.set_xscale('log')
ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (0.005, -50))

In [None]:
# Percentage change in gold medals against confirmed-case share (log x axis).
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Olympic Gold change vs confirmed covid-19 population %(log x scale)')
ax.scatter(df['confirmed_PopPct'], df['gold_diff_pct'])

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['confirmed_PopPct'].values,
                   dlabel['gold_diff_pct'].values)
for team, x, y in label_points:
    ax.annotate(team, (x, y))

ax.set_xscale('log')
ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (0.005, -40))

In [None]:
# Restrict to teams with at least 10 medals over Rio + Tokyo combined.
combined_medals = df['Total'] + df['total_rio']
d5 = df[combined_medals >= 10].copy()

# Teams to name on the plot: the first ten rows, the ten lowest and ten highest
# confirmed-case shares, and the ten largest absolute medal gains.
best_tokyo = d5.head(10).copy()
lowest_confirmed = d5.sort_values('confirmed_PopPct').head(10).copy()
highest_confirmed = d5.sort_values('confirmed_PopPct', ascending=False).head(10).copy()
highest_inc = d5.sort_values('total_diff', ascending=False).head(10).copy()

cols = [
    'Rank', 'Team/NOC', 'Gold Medal', 'Total', 'gold_rio', 'total_rio',
    'total_diff', 'gold_diff', 'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct', 'confirmed',
]

picked = pd.concat([best_tokyo, lowest_confirmed, highest_confirmed, highest_inc])
labels = picked['Team/NOC'].unique()
dlabel = d5[d5['Team/NOC'].isin(labels)].copy()
dlabel.loc[d5['Team/NOC'] == 'Japan', 'total_diff'] -= 1

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Olympic Medal change[>=10 in 2 olympics] vs confirmed covid-19 population %(log x scale)')
ax.scatter(d5['confirmed_PopPct'], d5['total_diff_pct'])

label_points = zip(dlabel['Team/NOC'].values,
                   dlabel['confirmed_PopPct'].values,
                   dlabel['total_diff_pct'].values)
for team, x, y in label_points:
    ax.annotate(team, (x, y))

ax.set_xscale('log')
ax.annotate('medal change % = (medal_tokyo - medal_rio)/(medal_rio + 5) * 100', (0.005, -50))

### Most improved teams [>= 10 medals across both Olympics]

In [None]:
# Twenty largest absolute medal gains among the >=10-medal teams.
report_cols = [
    'Rank', 'Team/NOC',
    'Gold Medal', 'gold_rio', 'gold_diff',
    'Total', 'total_rio', 'total_diff',
    'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct',
]
top_gainers = d5.sort_values(by='total_diff', ascending=False).head(20)
top_gainers[report_cols]

In [None]:
#covid cannot explain why those teams improved
# Median confirmed-case share for the 20 largest and 20 smallest medal changes.
report_cols = [
    'Rank', 'Team/NOC',
    'Gold Medal', 'gold_rio', 'gold_diff',
    'Total', 'total_rio', 'total_diff',
    'total_diff_pct', 'gold_diff_pct',
    'confirmed_PopPct',
]

top_20 = d5.sort_values('total_diff', ascending=False).head(20)[report_cols]
median_top_20 = top_20['confirmed_PopPct'].median()

bottom_20 = d5.sort_values('total_diff', ascending=True).head(20)[report_cols]
median_bottom_20 = bottom_20['confirmed_PopPct'].median()

print('covid population median in most improved teams:', median_top_20)

print('covid population median in least improved teams:', median_bottom_20)

### Highest medal increase

In [None]:
# Ten largest total-medal change percentages, all teams.
cols = ['Rank', 'Team/NOC', 'Total', 'total_rio', 'confirmed_PopPct', 'total_diff_pct']
ranked_by_change = df.sort_values(by='total_diff_pct', ascending=False)
ranked_by_change[cols].head(10)

### Highest medal increase with more than 10 medals

In [None]:
# Ten largest change percentages among teams with more than 10 medals combined.
cols = ['Rank', 'Team/NOC', 'Total', 'total_rio', 'confirmed_PopPct', 'total_diff_pct']
over_ten = df[df['Total'] + df['total_rio'] > 10]
over_ten.sort_values(by='total_diff_pct', ascending=False)[cols].head(10)

### Highest medal decrease with more than 10 medals

In [None]:
# Ten smallest change percentages among teams with more than 10 medals combined.
cols = ['Rank', 'Team/NOC', 'Total', 'total_rio', 'confirmed_PopPct', 'total_diff_pct']
over_ten = df[df['Total'] + df['total_rio'] > 10]
over_ten.sort_values(by='total_diff_pct', ascending=True)[cols].head(10)

In [None]:
# Median absolute medal change for the ten highest and ten lowest
# confirmed-case shares.
highest_10 = df.sort_values('confirmed_PopPct', ascending=False).head(10)
lowest_10 = df.sort_values('confirmed_PopPct', ascending=True).head(10)

median_highest = highest_10['total_diff'].median()
median_lowest = lowest_10['total_diff'].median()

print('median of medal change in HIGHEST 10 covid-19 population teams:',
      f"{median_highest:+5.2f}")

print('median of medal change in LOWEST 10 covid-19 population teams:',
      f"{median_lowest:+5.2f}")

In [None]:
# Mean absolute medal change for the ten highest and ten lowest
# confirmed-case shares.
highest_10 = df.sort_values('confirmed_PopPct', ascending=False).head(10)
lowest_10 = df.sort_values('confirmed_PopPct', ascending=True).head(10)

mean_highest = highest_10['total_diff'].mean()
mean_lowest = lowest_10['total_diff'].mean()

print('mean of medal change in HIGHEST 10 covid-19 population teams:',
      f"{mean_highest:+5.2f}")

print('mean of medal change in LOWEST 10 covid-19 population teams:',
      f"{mean_lowest:+5.2f}")

### p-values

In [None]:
# Linear regression of absolute medal change on confirmed-case share.
linregress(x=df['confirmed_PopPct'], y=df['total_diff'])

In [None]:
# Linear regression of percentage medal change on confirmed-case share.
linregress(x=df['confirmed_PopPct'], y=df['total_diff_pct'])

In [None]:
# medal change vs log(confirmed_PopPct) has p-value < 0.02
# mostly because of outliers

log_covid_share = np.log(df['confirmed_PopPct'])
linregress(x=log_covid_share, y=df['total_diff_pct'])

In [None]:
# Same regression on the log of the confirmed-case share, for gold medals.
log_covid_share = np.log(df['confirmed_PopPct'])
linregress(x=log_covid_share, y=df['gold_diff_pct'])

### Olympic Medal Density

In [None]:
#Since I am from Hong Kong, a little shoutout for our athletes

In [None]:
# Tokyo medal table joined with population / land area. The outer join keeps
# rows that appear in only one of the two tables.
df = dm.merge(dp, on='Team/NOC', how='outer')

# Medals per million inhabitants.
df['medal_per_mil_people'] = df['Total'].div(df['Population (2020)']).mul(1e6)

# Medals (all colours, then gold only) per 1000 km² of land area.
land_area = df['Land Area (Km²)']
df['medal_per_1000_km_sq'] = df['Total'].div(land_area).mul(1e3)
df['gold_per_1000_km_sq'] = df['Gold Medal'].div(land_area).mul(1e3)

#include all teams with at least 1 medal
d2 = df.loc[df['Total'] >= 1].copy()

### Medals per million population

In [None]:
# Forty medal-winning teams with the most medals per million inhabitants.
per_capita_cols = ['Team/NOC', 'Total', 'Gold Medal', 'Population (2020)', 'medal_per_mil_people']
per_capita_top = d2.sort_values(by='medal_per_mil_people', ascending=False).head(40)
per_capita_top[per_capita_cols].reset_index(drop=True)

### Medal per 1000 Km²

In [None]:
# Ten rows with the most medals per 1000 km² of land area.
cols = [
    'Team/NOC',
    'Gold Medal', 'Silver Medal', 'Bronze Medal', 'Total',
    'Land Area (Km²)', 'medal_per_1000_km_sq',
]
by_area_density = df.sort_values(by='medal_per_1000_km_sq', ascending=False)
dsq = by_area_density.head(10)[cols]

In [None]:
# Medals-per-area table (at most ten rows).
dsq.iloc[:10]

In [None]:
# Horizontal bars of medals per 1000 km². The x axis is capped at 22, and the
# values for three teams are written on the chart as text at fixed positions.
fig, ax = plt.subplots(figsize=(12, 3))
ax.barh(dsq['Team/NOC'], dsq['medal_per_1000_km_sq'])
ax.set_xlim(0, 22)
ax.set_title('Tokyo Olympic Medals per 1000 Km²')

fixed_labels = [
    ('Bermuda:20.0', (18, 2)),
    ('San Marino:50.0', (20, 1)),
]
for text, position in fixed_labels:
    ax.annotate(text, position)
ax.annotate('Hong Kong, China:5.71', (5.5, 4))

In [None]:
# Full medals-per-area table.
dsq

In [None]:
# Ten medal-winning teams with the most gold medals per 1000 km² of land area.
cols = [
    'Team/NOC',
    'Gold Medal', 'Silver Medal', 'Bronze Medal', 'Total',
    'Land Area (Km²)', 'gold_per_1000_km_sq',
]
by_gold_density = d2.sort_values(by='gold_per_1000_km_sq', ascending=False)
dgold = by_gold_density[cols].head(10)

In [None]:
# Horizontal bars of gold medals per 1000 km². The x axis is capped at 1.2, and
# the values for two teams are written on the chart as text at fixed positions.
fig, ax = plt.subplots(figsize=(12, 3))
ax.barh(dgold['Team/NOC'], dgold['gold_per_1000_km_sq'])
ax.set_xlim(0, 1.2)
ax.set_title('Tokyo Olympic Gold Medals per 1000 Km²')
ax.annotate('Bermuda:20.0', (1.05, 1))
ax.annotate('Hong Kong, China:0.952', (0.8, 2))

In [None]:
# Full gold-per-area table.
dgold

In [None]:
# English column name -> Traditional Chinese column name.
col_ch = {
    'Team/NOC': '團隊/國家奧委會',
    'Gold Medal': '金牌',
    'Silver Medal': '銀牌',
    'Bronze Medal': '銅牌',
    'Total': '總獎牌',
    'Land Area (Km²)': '土地面積（平方公里）',
    'medal_per_1000_km_sq': '每千平方公里總獎牌',
}

# English team name -> Traditional Chinese team name.
dict_ch = {
    'San Marino': '聖馬力諾',
    'Bermuda': '百慕大',
    'Syrian Arab Republic': '阿拉伯敘利亞共和國',
    'Hong Kong, China': '中國香港',
    'Grenada': '格林納達',
    'Bahrain': '巴林',
    'Netherlands': '荷蘭',
    'Jamaica': '牙買加',
    'Chinese Taipei': '中國台北',
    'Switzerland': '瑞士',
}

dsq_ch = dsq.copy()

# replace()/rename() leave any team or column missing from the dictionaries
# under its English name instead of turning it into NaN, so the table and the
# bar chart built from it stay usable if the top ten ever changes.
dsq_ch['Team/NOC'] = dsq_ch['Team/NOC'].replace(dict_ch)
dsq_ch = dsq_ch.rename(columns=col_ch)

### 東奧每千平方公里總獎牌

In [None]:
# Medals-per-area table with Chinese labels.
dsq_ch

In [None]:
from matplotlib import font_manager as fm

#https://www.kaggle.com/gpreda/chinese-text-exploration
font_path = './SourceHanSerifTW-Regular.otf'
font_prop = fm.FontProperties(fname=font_path)
!wget 'https://github.com/adobe-fonts/source-han-serif/raw/release/SubsetOTF/TW/SourceHanSerifTW-Regular.otf'

In [None]:
# Chinese-language version of the medals-per-1000-km² bar chart. Every text
# element is given font_prop so the CJK glyphs render.
fig, ax = plt.subplots(figsize=(12, 6))

team_names = dsq_ch['團隊/國家奧委會']
ax.barh(team_names, dsq_ch['每千平方公里總獎牌'])
ax.set_yticklabels(team_names, fontproperties=font_prop, fontsize=15)
ax.set_xlim(0, 22)
ax.set_title('東奧每千平方公里總獎牌', fontproperties=font_prop, fontsize=20)

# Values written on the chart as text at fixed positions.
fixed_labels = [
    ('百慕大:20.0', (18, 2)),
    ('聖馬力諾:50.0', (20, 1)),
]
for text, position in fixed_labels:
    ax.annotate(text, position, fontproperties=font_prop, fontsize=15)
ax.annotate('中國香港:5.71', (5.5, 4), fontproperties=font_prop, fontsize=15)