**Which countries won the most Olympic medals relative to their population at the 2021 Tokyo Olympics?**

This small calculation will cheer up small countries! ;)
<br>Ranked by population per medal, the medal table favors them and looks completely different!

In [None]:
from io import StringIO

!pip install wikipedia
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wikipedia as wp

In [None]:
# Read the Tokyo 2021 medal table from the dataset's CSV file
medals_csv_path = "../input/olympic-games-2021-medals/Tokyo 2021 dataset.csv"
df_olympics = pd.read_csv(medals_csv_path)

# Summary statistics as a first sanity check of the loaded data
df_olympics.describe()

In [None]:
# Fetch the rendered HTML of the two Wikipedia pages that hold the lookup tables.
# auto_suggest=False makes the library request exactly the given title; with the
# default (True) the title is first replaced by Wikipedia's search suggestion,
# which can silently resolve to a different page than the one named here.

# IOC country codes table from https://en.wikipedia.org/wiki/List_of_IOC_country_codes
html_noc_codes = wp.page('List_of_IOC_country_codes', auto_suggest=False).html()

# Population table from https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)
html_population = wp.page(
    'List_of_countries_by_population_(United_Nations)', auto_suggest=False
).html()

In [None]:
# Parse the IOC country-code page into a dataframe (first table on the page).
# pd.read_html reports "no table found" with ValueError (it never returns an empty
# list), so ValueError is what has to be caught. Re-raise instead of printing so the
# cell stops here rather than failing below with a NameError or silently reusing a
# df_noc_codes left over from an earlier run.
try:
    # Wrapped in StringIO because passing a literal HTML string is deprecated in newer pandas
    noc_tables = pd.read_html(StringIO(html_noc_codes))
except ValueError as err:
    raise ValueError('Tables not found') from err

# Clean unnecessary data: drop the row labelled 0 and the columns at positions 2 and 3,
# then name the two remaining columns the way the later merges expect them.
df_noc_codes = noc_tables[0].drop(index=[0])
df_noc_codes = df_noc_codes.drop(columns=df_noc_codes.columns[[2, 3]])
df_noc_codes.columns = ['NOCCode', 'Country']

df_noc_codes.tail()

In [None]:
# Parse the population page into a dataframe (first table on the page).
# pd.read_html reports "no table found" with ValueError, so that is the exception to
# catch; re-raise so the cell stops here instead of continuing without df_population.
try:
    # Wrapped in StringIO because passing a literal HTML string is deprecated in newer pandas
    population_tables = pd.read_html(StringIO(html_population))
except ValueError as err:
    raise ValueError('Tables not found') from err

# Clean unnecessary data: keep only the columns at positions 0 and 4
population_raw = population_tables[0]
df_population = population_raw.drop(columns=population_raw.columns[[1, 2, 3, 5]])
df_population.columns = ['Country', 'Population']

# Remove references after the country name from strings like "China[a]".
# The result is assigned back explicitly: an inplace replace on an iloc selection may
# act on a temporary and never reach the frame. The raw-string pattern with a lazy
# `.*?` also strips markers longer than one character (e.g. "[ab]").
df_population['Country'] = df_population['Country'].replace(r'\[.*?\]', '', regex=True)

df_population.tail()

In [None]:
# Attach the country name to each medal row through its NOC code.
# A left join keeps every row of the medal table even when no code matches.
df_olympics = pd.merge(df_olympics, df_noc_codes, on='NOCCode', how='left')

df_olympics.columns

In [None]:
# Attach the population to each medal row through the country name.
# Again a left join, so rows without a matching country stay in the table with NaN.
df_olympics = pd.merge(df_olympics, df_population, on='Country', how='left')

df_olympics.columns

In [None]:
# Row labels of the entries for which the population lookup produced no value
idx = df_olympics.index[df_olympics['Population'].isna()].tolist()

In [None]:
# Hand-entered populations for the rows whose population was not found (NaN),
# keyed by NOC code (NOCCode -> Population)
dict_pop_replace = {
    'GBR': 67_530_172,
    'ROC': 145_872_256,
    'TPE': 23_568_378,
    'KOS': 1_935_259,
    'HKG': 7_436_154,
    'BER': 62_506,
    'PUR': 2_933_408,
}

# The same values as a plain list, in the insertion order of the dict
pop_replace_list = [*dict_pop_replace.values()]

In [None]:
# Fill the missing populations by looking each row's NOC code up in dict_pop_replace.
# Matching on the code (instead of pairing the n-th missing row with the n-th dict value
# and writing to a hard-coded column position) keeps the assignment correct when the row
# order changes, and cannot run past the end of the value list. Rows whose code has no
# manual entry simply stay NaN. Re-running the cell is harmless.
still_missing = df_olympics['Population'].isnull()
df_olympics.loc[still_missing, 'Population'] = (
    df_olympics.loc[still_missing, 'NOCCode'].map(dict_pop_replace)
)

# Show the rows that were flagged as missing, now with their filled-in values
df_olympics.iloc[idx, :]

In [None]:
# NOTE. The inverse of medal density is used because it is easier to read:
# inhabitants per medal rather than medals per inhabitant.
# New column: population of the country divided by its total number of medals
df_olympics['Pop_per_medal'] = df_olympics['Population'].div(df_olympics['Total'])

In [None]:
# New column: population of the country divided by its number of GOLD medals
df_olympics['Pop_per_gold_medal'] = df_olympics['Population'].div(df_olympics['Gold Medal'])

In [None]:
# Top of the ranking: the five rows with the smallest population per medal
df_olympics.sort_values(by='Pop_per_medal', ascending=True).head(5)

In [None]:
# Bottom of the ranking: the last ten rows when sorted by population per medal
df_olympics.sort_values(by='Pop_per_medal', ascending=True).tail(10)

In [None]:
# Top of the gold ranking: the ten rows with the smallest population per gold medal
df_olympics.sort_values(by='Pop_per_gold_medal', ascending=True).head(10)

In [None]:
# Filtering inf values for countries without gold medals.
# Only the gold-medal ratio decides whether a row is kept: a blanket dropna() would also
# discard teams that merely lack a value in an unrelated column (for example a 'Country'
# that was not matched during the merge) even though they did win gold.
gold_ratio = df_olympics['Pop_per_gold_medal'].replace(np.inf, np.nan)
df_olympics[gold_ratio.notna()].sort_values('Pop_per_gold_medal').tail(10)

In [None]:
# Bar chart of population per medal, rows ordered by ascending 'Pop_per_medal'.
# Columns are picked by position: 1 supplies the x-axis labels and -2 is expected to be
# 'Pop_per_medal' (second to last once both ratio columns have been added).
pop_by_medal = df_olympics.sort_values(by='Pop_per_medal').iloc[:, [1, -2]]

fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(pop_by_medal.iloc[:, 0], pop_by_medal.iloc[:, 1], log=True)  # logarithmic y-axis
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_xlabel('Country')
ax.set_ylabel('Population Per Medal')
ax.set_title('Population Per Medal by Countries. 2021 Tokyo Olympics')
plt.show()

In [None]:
# Bar chart of population per gold medal, rows ordered by ascending 'Pop_per_gold_medal'.
# Columns are picked by position: 1 supplies the x-axis labels and -1 is expected to be
# 'Pop_per_gold_medal' (the last column added).
pop_by_gold_medal = df_olympics.sort_values(by='Pop_per_gold_medal').iloc[:, [1, -1]]

# Rows with an infinite ratio (no gold medals) cannot be drawn and are left out
is_infinite = pop_by_gold_medal.iloc[:, 1].isin([np.inf, -np.inf])
pop_by_gold_medal = pop_by_gold_medal[~is_infinite]

fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(pop_by_gold_medal.iloc[:, 0], pop_by_gold_medal.iloc[:, 1], log=True)  # logarithmic y-axis
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_xlabel('Country')
ax.set_ylabel('Population Per Gold Medal')
ax.set_title('Population Per Gold Medal by Countries. 2021 Tokyo Olympics')
plt.show()

And the winners with the maximum Olympic spirit per capita, by gold medals and by total medals respectively, are:
<br>
**<big>Bermuda</big>**
<br>and
<br>**<big>San Marino</big>**!
<br><big>Congratulations!</big>