In [None]:
pip install openpyxl

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the athlete roster; the .xlsx workbook is parsed with the openpyxl engine.
ATHLETES_XLSX = '../input/2021-olympics-in-tokyo/Athletes.xlsx'
athletes = pd.read_excel(io = ATHLETES_XLSX, engine = 'openpyxl')
athletes.head(5)

### **How many athletes took part in the Olympic Games in Tokyo 2020?**

In [None]:
# Number of distinct values in the 'Name' column.
len(athletes['Name'].unique())

### **How many countries participated in the Games?**

In [None]:
# Number of distinct values in the 'NOC' column.
len(athletes['NOC'].unique())

### **How many different sports disciplines were there?**

In [None]:
# Number of distinct values in the 'Discipline' column.
len(athletes['Discipline'].unique())

### **How many athletes did each national team consist of?**

In [None]:
# Rows per NOC: count() tallies the non-null 'Name' entries of each group, so an
# athlete listed on several rows contributes once per row.
num_of_athletes = athletes.groupby('NOC')['Name'].count()

# Sort once and keep the 80 largest delegations for the chart.
largest_teams = num_of_athletes.sort_values(ascending = False).head(80)

plt.figure(figsize = (15, 15), dpi = 200)
sns.barplot(x = largest_teams, y = largest_teams.index, orient = "h")
plt.title('Number of athletes')
plt.grid(color = 'b', linestyle = '--')

In [None]:
# Turn the per-NOC counts into a one-column frame (column label 0, as before).
# `to_frame` replaces `apply(pd.Series)`, which builds a temporary Series for
# every single row just to obtain the same frame.
a = num_of_athletes.to_frame(name = 0)

# Rank 1 = largest delegation; ties share the average rank (pandas default).
a['rank'] = a[0].rank(ascending = False)

# Single .loc lookup instead of chained indexing.
a.loc[a.index == 'Poland', 'rank']

### `Poland took 16th place out of all 206 in terms of number of athletes.`

### **What share of all athletes did each national team account for?**

In [None]:
# One wedge per NOC, largest delegation first; sort once and reuse for values and labels.
teams_by_size = num_of_athletes.sort_values(ascending = False)

plt.figure(figsize = (15, 15), dpi = 200)
plt.pie(
    teams_by_size,
    labels = teams_by_size.index,
    startangle = 90,
    autopct = '%1.2f%%',
    pctdistance = 0.9,
)
plt.title('Participation of athletes in each national team')
plt.show()
plt.clf()

### `1.76% of all athletes are from Poland.`

### **How many athletes were in each discipline?**

In [None]:
# Rows per discipline (non-null 'Name' entries in each group), shown largest first.
num_of_disciplines = athletes.groupby('Discipline')['Name'].count()
num_of_disciplines.sort_values(ascending = False)

### **What is the distribution of disciplines in terms of the number of participants?**

In [None]:
# One wedge per discipline, largest first; sort once and reuse for values and labels.
disciplines_by_size = num_of_disciplines.sort_values(ascending = False)

plt.figure(figsize = (15, 15), dpi = 200)
plt.pie(
    disciplines_by_size,
    labels = disciplines_by_size.index,
    startangle = 90,
    autopct = '%1.2f%%',
    pctdistance = 0.9,
)
plt.title('Participation of athletes in each discipline')
plt.show()
plt.clf()

### **Which national teams took the first 10 places in the medal table?**

In [None]:
# Load the medal table and preview its first ten rows.
MEDALS_CSV = '../input/olympic-games-2021-medals/Tokyo 2021 dataset.csv'
medals = pd.read_csv(MEDALS_CSV)
medals.head(n = 10)

In [None]:
# Poland's value in the 'Rank' column of the medal table.
medals.loc[medals['Team/NOC'] == 'Poland', 'Rank']

### `Poland took 17th place out of 206 in the medal table.`

### **What is the distribution of the number of total medals won in terms of countries?**

In [None]:
# Order the medal table by total medals once, so that wedge sizes and labels
# come from the same sorted frame.
medals_by_total = medals.sort_values(by = 'Total', ascending = False)

plt.figure(figsize = (15, 15), dpi = 200)
plt.pie(
    medals_by_total['Total'],
    labels = medals_by_total['Team/NOC'],
    startangle = 90,
    autopct = '%1.2f%%',
    pctdistance = 0.9,
)
plt.title('Distribution of total medals won in terms of each national team')
plt.show()
plt.clf()

In [None]:
# Poland's value in the 'Rank by Total' column of the medal table.
medals.loc[medals['Team/NOC'] == 'Poland', 'Rank by Total']

### `In terms of total medals won, Poland took 19th place out of 206. It won 1.3% of all medals.`

### **What was the potential of athletes of each national team?**

In [None]:
# Weighted medal score: gold = 3 points, silver = 2, bronze = 1.
# Computed once for the whole table instead of recomputing the full Series for
# every NOC and relying on index alignment to pick out one row.
medals['points'] = medals['Gold Medal'] * 3 + medals['Silver Medal'] * 2 + medals['Bronze Medal']

# Delegation size looked up by team name; teams whose name does not appear in
# the athlete table get NaN, so their 'potential' stays undefined downstream.
medals['num_of_athletes'] = medals['Team/NOC'].map(num_of_athletes)

In [None]:
# Points earned per athlete sent.
medals['potential'] = medals['points'].div(medals['num_of_athletes'])

# Rescale to [0, 1]; the scaler expects a 2-D column, the frame a flat vector.
scaler = MinMaxScaler()
X = medals['potential'].to_numpy().reshape(-1, 1)
X_scaled = scaler.fit_transform(X)
medals['normalized_potential'] = X_scaled.ravel()

In [None]:
# Teams ordered from highest to lowest normalized potential, with the columns
# that explain the score.
potential_columns = [
    'Team/NOC',
    'normalized_potential',
    'Gold Medal',
    'Silver Medal',
    'Bronze Medal',
    'num_of_athletes',
]
medals.sort_values(by = 'normalized_potential', ascending = False)[potential_columns]

#### **_Bermuda_** sent only 2 athletes to the Olympic Games, and one of them won a gold medal. On the other hand, **_Argentina_** disappointed the most, because as many as 180 athletes won only 3 medals (none of them gold).

In [None]:
# Horizontal bars, best-performing team at the top; sort once and reuse.
teams_by_potential = medals.sort_values(by = 'normalized_potential', ascending = False)

plt.figure(figsize = (15, 15), dpi = 200)
sns.barplot(
    x = teams_by_potential['normalized_potential'],
    y = teams_by_potential['Team/NOC'],
    orient = "h",
)
plt.title('Potential of athletes of national teams')
plt.grid(color = 'b', linestyle = '--')

In [None]:
# Rank 1 = highest normalized potential; ties share the average rank (pandas default).
potential_rank = medals['normalized_potential'].rank(ascending = False)
medals['rank_potential'] = potential_rank
medals.loc[medals['Team/NOC'] == 'Poland', 'rank_potential']

### `In terms of potential of athletes Poland took 59th place.`

In [None]:
# NOCs that appear in the athlete table but not in the medal table.
# The medal-table names are collected into a set once; the previous version
# recomputed and linearly scanned `medals['Team/NOC'].unique()` for every NOC.
medal_winning_teams = set(medals['Team/NOC'].unique())
countries_without_medal = [noc for noc in athletes['NOC'].unique() if noc not in medal_winning_teams]

In [None]:
# (team, size) of the largest delegation among the NOCs without a medal.
# idxmax returns the first label holding the maximum.
sizes_without_medal = num_of_athletes[countries_without_medal]
sizes_without_medal.idxmax(), sizes_without_medal.max()

#### Among the countries that did not win a medal, the most disappointing was **_Chile_**, which had the most athletes - 56.

### **What is the level of sportiness of each country?**

In [None]:
# Load the general country statistics table.
COUNTRIES_CSV = '../input/countries-of-the-world/countries of the world.csv'
countries = pd.read_csv(COUNTRIES_CSV)

In [None]:
# Remove surrounding whitespace from the country names so they can be matched
# against the NOC names. The vectorised .str accessor replaces the per-element
# lambda and passes missing values through instead of raising on them.
countries['Country'] = countries['Country'].str.strip()

In [None]:
# Per-NOC working frame, indexed by NOC name, starting with the delegation size.
# `to_frame` yields the named column directly; `apply(pd.Series)` built one
# temporary Series per row and then needed a rename.
noa = num_of_athletes.to_frame(name = 'num_of_athletes')

In [None]:
# NOC names that appear verbatim in the countries table. `cc` is reused later
# in the notebook, so it stays a plain list in NOC order.
known_countries = set(countries['Country'])
cc = [name for name in num_of_athletes.index if name in known_countries]

# Country -> population lookup (first row wins should a name be repeated).
population_by_country = countries.drop_duplicates('Country').set_index('Country')['Population']

# Fill the population of every directly matched NOC in one vectorised step.
# This replaces the row-by-row loop and the deprecated int(<one-element Series>)
# conversion; unmatched rows are left untouched.
directly_matched = noa.index.isin(cc)
noa.loc[directly_matched, 'population'] = noa.index[directly_matched].map(population_by_country).to_numpy()

In [None]:
# NOC name -> name used for the same entity in the countries table.
POPULATION_ALIASES = {
    'Antigua and Barbuda': 'Antigua & Barbuda',
    'Bahamas': 'Bahamas, The',
    'Bosnia and Herzegovina': 'Bosnia & Herzegovina',
    'Brunei Darussalam': 'Brunei',
    'Central African Republic': 'Central African Rep.',
    'Chinese Taipei': 'Taiwan',
    'Congo': 'Congo, Repub. of the',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Democratic Republic of Timor-Leste': 'East Timor',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Federated States of Micronesia': 'Micronesia, Fed. St.',
    'Gambia': 'Gambia, The',
    'Great Britain': 'United Kingdom',
    'Hong Kong, China': 'Hong Kong',
    'Islamic Republic of Iran': 'Iran',
    "Lao People's Democratic Republic": 'Laos',
    'Myanmar': 'Burma',
    'North Macedonia': 'Macedonia',
    "People's Republic of China": 'China',
    'ROC': 'Russia',
    'Republic of Korea': 'Korea, South',
    'Republic of Moldova': 'Moldova',
    'Saint Kitts and Nevis': 'Saint Kitts & Nevis',
    'Sao Tome and Principe': 'Sao Tome & Principe',
    'St Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Syrian Arab Republic': 'Syria',
    'Trinidad and Tobago': 'Trinidad & Tobago',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'United States',
    'Virgin Islands, British': 'British Virgin Is.',
    'Virgin Islands, US': 'Virgin Islands',
}

# Populations entered by hand for NOCs that are not looked up in the countries
# table. NOTE(review): the source of these figures is not recorded in the
# notebook - confirm them, in particular the value used for the Refugee Olympic Team.
MANUAL_POPULATIONS = {
    'Eswatini': 1467152,
    'Kosovo': 1920079,
    'Montenegro': 623000,
    'Palestine': 5159076,
    'Refugee Olympic Team': 82400000,
    'South Sudan': 13026129,
}

# Country -> population lookup (first row wins should a name be repeated).
# A missing alias target raises KeyError naming the country, instead of the
# opaque failure of int() on an empty Series.
alias_population = countries.drop_duplicates('Country').set_index('Country')['Population']

for noc_name, country_name in POPULATION_ALIASES.items():
    noa.loc[noa.index == noc_name, 'population'] = int(alias_population[country_name])

for noc_name, manual_population in MANUAL_POPULATIONS.items():
    noa.loc[noa.index == noc_name, 'population'] = manual_population

In [None]:
# Sportiness = ln(athletes sent) / ln(population).
log_athletes = np.log(noa['num_of_athletes'])
log_population = np.log(noa['population'])
noa['sportiness'] = log_athletes / log_population

In [None]:
# Rescale sportiness to [0, 1], refitting the existing `scaler` on this column.
X = noa['sportiness'].to_numpy().reshape(-1, 1)
X_scaled = scaler.fit_transform(X)
noa['normalized_sportiness'] = X_scaled.ravel()

In [None]:
# NOCs from most to least "sporty", with the inputs of the score.
sportiness_columns = ['normalized_sportiness', 'num_of_athletes', 'population']
noa.sort_values(by = 'normalized_sportiness', ascending = False)[sportiness_columns]

#### The sportiest country is **_Australia_**, which sent as many as 470 athletes from a population of only 20 000 000. On the other hand, the least sporty country is **_Myanmar_**, which sent only 2 athletes from a population of more than 47 000 000.

In [None]:
# Rank 1 = highest normalized sportiness; ties share the average rank (pandas default).
sportiness_rank = noa['normalized_sportiness'].rank(ascending = False)
noa['rank_sportiness'] = sportiness_rank
noa.loc[noa.index == 'Poland', 'rank_sportiness']

### `Poland took 17th place out of 206 in the sportiness ranking.`

### **What is the level of satisfaction with Olympic Games in each country?**

In [None]:
# NOC names that also appear in the medal table. `cc2` is reused later in the
# notebook, so it stays a plain list in NOC order.
medal_table_teams = set(medals['Team/NOC'])
cc2 = [name for name in num_of_athletes.index if name in medal_table_teams]

# Team -> weighted medal points lookup (first row wins should a name be repeated).
points_by_team = medals.drop_duplicates('Team/NOC').set_index('Team/NOC')['points']

# Copy the points of every medal-winning NOC in one vectorised step. This
# replaces the row-by-row loop and the deprecated int(<one-element Series>)
# conversion; NOCs without medals are left untouched.
won_medals = noa.index.isin(cc2)
noa.loc[won_medals, 'points'] = noa.index[won_medals].map(points_by_team).to_numpy()

In [None]:
# NOCs without any medal get a placeholder score of 0.55. The happiness formula
# later takes log(2 * points); 2 * 0.55 = 1.1 keeps that logarithm finite and
# just above zero. NOTE(review): presumably that is why 0.55 was chosen - confirm.
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which warns on pandas 2.x and no longer
# updates `noa` under Copy-on-Write.
noa['points'] = noa['points'].fillna(0.55)

In [None]:
# Happiness = ln(2 * points) / ln(population).
log_double_points = np.log(2 * noa['points'])
log_inhabitants = np.log(noa['population'])
noa['hapiness'] = log_double_points / log_inhabitants

In [None]:
# Rescale happiness to [0, 1], refitting the existing `scaler` on this column.
X = noa['hapiness'].to_numpy().reshape(-1, 1)
X_scaled = scaler.fit_transform(X)
noa['normalized_hapiness'] = X_scaled.ravel()

In [None]:
# NOCs from most to least satisfied, with the inputs of the score.
hapiness_columns = ['normalized_hapiness', 'points', 'population']
noa.sort_values(by = 'normalized_hapiness', ascending = False)[hapiness_columns]

#### The most satisfied with the Olympic Games should be **_Americans_**, who got the highest number of points relative to the number of inhabitants. The least satisfied should be the citizens of **_Pakistan_**: there are almost 166 000 000 of them, and they did not win a single medal.

In [None]:
# Rank 1 = highest normalized happiness; ties share the average rank (pandas default).
hapiness_rank = noa['normalized_hapiness'].rank(ascending = False)
noa['rank_hapiness'] = hapiness_rank
noa.loc[noa.index == 'Poland', 'rank_hapiness']

### `In the satisfaction ranking, Poland took 26th place out of 206.`

### **What is the distribution of athletes in terms of region?**

In [None]:
# Remove surrounding whitespace from the region labels so equal regions group
# together. The vectorised .str accessor replaces the per-element lambda and
# passes missing values through instead of raising on them.
countries['Region'] = countries['Region'].str.strip()

In [None]:
# Country -> region lookup (first row wins should a name be repeated).
region_by_country = countries.drop_duplicates('Country').set_index('Country')['Region']

# Fill the region of every NOC whose name matches the countries table verbatim
# (the names collected in `cc`) in one vectorised step, instead of scanning the
# countries table once per NOC. Other rows are left untouched.
region_matched = noa.index.isin(cc)
noa.loc[region_matched, 'region'] = noa.index[region_matched].map(region_by_country).to_numpy()

In [None]:
# NOC name -> name used for the same entity in the countries table.
REGION_ALIASES = {
    'Antigua and Barbuda': 'Antigua & Barbuda',
    'Bahamas': 'Bahamas, The',
    'Bosnia and Herzegovina': 'Bosnia & Herzegovina',
    'Brunei Darussalam': 'Brunei',
    'Central African Republic': 'Central African Rep.',
    'Chinese Taipei': 'Taiwan',
    'Congo': 'Congo, Repub. of the',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Democratic Republic of Timor-Leste': 'East Timor',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Federated States of Micronesia': 'Micronesia, Fed. St.',
    'Gambia': 'Gambia, The',
    'Great Britain': 'United Kingdom',
    'Hong Kong, China': 'Hong Kong',
    'Islamic Republic of Iran': 'Iran',
    "Lao People's Democratic Republic": 'Laos',
    'Myanmar': 'Burma',
    'North Macedonia': 'Macedonia',
    "People's Republic of China": 'China',
    'ROC': 'Russia',
    'Republic of Korea': 'Korea, South',
    'Republic of Moldova': 'Moldova',
    'Saint Kitts and Nevis': 'Saint Kitts & Nevis',
    'Sao Tome and Principe': 'Sao Tome & Principe',
    'St Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Syrian Arab Republic': 'Syria',
    'Trinidad and Tobago': 'Trinidad & Tobago',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'United States',
    'Virgin Islands, British': 'British Virgin Is.',
    'Virgin Islands, US': 'Virgin Islands',
}

# Regions entered by hand for NOCs that are not looked up in the countries
# table; the refugee team gets a pseudo-region of its own.
MANUAL_REGIONS = {
    'Eswatini': 'SUB-SAHARAN AFRICA',
    'Kosovo': 'EASTERN EUROPE',
    'Montenegro': 'EASTERN EUROPE',
    'Palestine': 'NEAR EAST',
    'Refugee Olympic Team': 'REFUGEE',
    'South Sudan': 'SUB-SAHARAN AFRICA',
}

# Country -> region lookup (first row wins should a name be repeated).
# A missing alias target raises KeyError naming the country. Values are
# assigned as scalars rather than as length-1 arrays.
alias_region = countries.drop_duplicates('Country').set_index('Country')['Region']

for noc_name, country_name in REGION_ALIASES.items():
    noa.loc[noa.index == noc_name, 'region'] = alias_region[country_name]

for noc_name, manual_region in MANUAL_REGIONS.items():
    noa.loc[noa.index == noc_name, 'region'] = manual_region

In [None]:
# Total number of athletes per region.
noa.groupby('region')['num_of_athletes'].sum()

In [None]:
# Athletes per region, aggregated once and reused for wedge sizes and labels.
region_athletes = noa.groupby('region')['num_of_athletes'].sum()

# Pull every wedge slightly out of the pie. The offsets are derived from the
# number of regions; the hard-coded 12-element tuple made plt.pie fail as soon
# as the region count differed.
wedge_offsets = (0.04,) * len(region_athletes)

plt.figure(figsize = (10, 10))
plt.pie(
    x = region_athletes,
    labels = region_athletes.index,
    startangle = 90,
    autopct = '%1.2f%%',
    pctdistance = 0.9,
    explode = wedge_offsets,
)
plt.title('Athletes in regions')
plt.show()
plt.clf()

### **What is the distribution of medals in terms of region?**

In [None]:
# Column created in `noa` -> column it is copied from in the medal table.
MEDAL_COLUMNS = {
    'total_medals': 'Total',
    'gold_medals': 'Gold Medal',
    'silver_medals': 'Silver Medal',
    'bronze_medals': 'Bronze Medal',
}

# Medal table indexed by team name (first row wins should a name be repeated).
medals_by_team = medals.drop_duplicates('Team/NOC').set_index('Team/NOC')

# Copy the medal counts of every NOC listed in `cc2` (the NOCs present in the
# medal table) column by column, instead of scanning the medal table four times
# per team. Other rows are left untouched.
in_medal_table = noa.index.isin(cc2)
for target_column, source_column in MEDAL_COLUMNS.items():
    noa.loc[in_medal_table, target_column] = (
        noa.index[in_medal_table].map(medals_by_team[source_column]).to_numpy()
    )

In [None]:
# NOCs absent from the medal table won nothing: replace their missing counts by 0.
# The result is assigned back in a single step; fillna(..., inplace=True) on a
# column selection is chained assignment, which warns on pandas 2.x and no
# longer updates `noa` under Copy-on-Write.
medal_count_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']
noa[medal_count_columns] = noa[medal_count_columns].fillna(0)

In [None]:
# Medal totals per region, overall and by colour.
region_medal_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']
noa.groupby('region')[region_medal_columns].sum()

In [None]:
# (column in `noa`, panel title) for the 2x2 grid, in drawing order.
# The third title fixes the 'SIlver medals' capitalisation typo.
MEDAL_PANELS = [
    ('total_medals', 'Total medals'),
    ('gold_medals', 'Gold medals'),
    ('silver_medals', 'Silver medals'),
    ('bronze_medals', 'Bronze medals'),
]

# Aggregate once; every panel reads its column from the same frame.
region_medals = noa.groupby('region')[[column for column, _ in MEDAL_PANELS]].sum()

# Pull every wedge slightly out of the pie. The offsets are derived from the
# number of regions; the hard-coded 12-element tuple made plt.pie fail as soon
# as the region count differed.
panel_offsets = (0.04,) * len(region_medals)

plt.figure(figsize = (20, 15))

# One loop replaces four copy-pasted panels that differed only in column and title.
for position, (column, panel_title) in enumerate(MEDAL_PANELS, start = 1):
    plt.subplot(2, 2, position)
    plt.pie(
        x = region_medals[column],
        labels = region_medals.index,
        startangle = 90,
        autopct = '%1.2f%%',
        pctdistance = 0.9,
        explode = panel_offsets,
    )
    plt.title(panel_title)

plt.show()
plt.clf()