# Olympics 2020 Exploratory Data Analysis of Medal Tally

### Olympics 2020 has just come to an end, and we have the dataset of the final medal tally.
### Let us try to analyze the medal tally and visualize some features

## Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file_path = os.path.join(dirname, filename)
        print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names, so try the new name first and fall back
# to the legacy one. If neither exists, matplotlib's current style is left as is.
for _whitegrid_style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _whitegrid_style in plt.style.available:
        plt.style.use(_whitegrid_style)
        break

# Let's be rebels and ignore warnings for now
# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings from pandas / matplotlib / lightgbm.
import warnings
warnings.filterwarnings('ignore')

## Importing Dataset

In [None]:
# Location of the Tokyo medal tally CSV inside the Kaggle input directory.
medals_csv_path = "../input/2021-olympics-medals-in-tokyo/Tokyo Medals 2021.csv"
df = pd.read_csv(medals_csv_path)

In [None]:
# Show the loaded medal table as this cell's output.
df

## Glance of the Dataset

### Q. How many countries won at least 1 medal in the Olympics?


In [None]:
# Count distinct countries rather than non-null rows, so a duplicated row
# in the CSV cannot inflate the answer (missing names are ignored).
df['Country'].nunique()

### 93 countries won at least 1 medal in the Olympics 2020

In [None]:
# Print every entry of the Country column, one per line, in table order.
for country_name in df['Country']:
    print(country_name)

### Q. Which countries finished on the top in the medal tally?

In [None]:
# First five rows of the table, in the order they appear in the CSV.
df.iloc[:5]

### We can see the top five countries in the medal tally
<ol>
    <li>United States of America</li>
    <li>People's Republic of China</li>
    <li>Japan</li>
    <li>Great Britain</li>
    <li>ROC</li>
</ol>

### Q. Which countries finished at the bottom in the medal tally?

In [None]:
# Last five rows of the table, in the order they appear in the CSV.
df.iloc[-5:]

### We can see the bottom five countries in the medal tally
<ol>
    <li>Ghana</li>
    <li>Grenada</li>
    <li>Kuwait</li>
    <li>Republic of Moldova</li>
    <li>Syrian Arab Republic</li>
</ol>

### Q. What is the shape of the dataset?

In [None]:
# (number of rows, number of columns) of the medal table.
row_count, column_count = df.shape
(row_count, column_count)

### The dataset is spread across 93 rows and 6 columns

### Q. What are columns or features in the dataset?

In [None]:
# Keep the column index around and print each column label on its own line.
col = df.columns
for column_label in col:
    print(column_label)

### The features include: 
<ol>
    <li>Country</li>
    <li>Gold Medal</li>
    <li>Silver Medal</li>
    <li>Bronze Medal</li>
    <li>Total</li>
    <li>Rank By Total</li>
</ol>

### Q. Are there any missing values in the dataset?

In [None]:
# Draw missingno's nullity matrix for the whole table so that any missing
# cells show up visually.
missing_matrix_size = (16, 5)
missingno.matrix(df, figsize=missing_matrix_size)

### There are no missing values

## Data Visualization

### Q. What is the distribution of the Medal Tally?


In [None]:
# Bar chart of the total medal count for every country in the table.
#
# The style sheet must be activated BEFORE the figure is created: style
# settings are read when each figure/axes/text object is built, so switching
# style mid-way leaves the figure with a mix of the old and new themes
# (e.g. white label text on a light background).
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Medals', fontsize=20)
plt.title('Distribution of Total Number of Medals in Olympics 2020', fontsize=20)
plt.bar(df['Country'], df['Total'], color ='aqua',width = 0.8, ec='#21209c')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()

### We can see the overall distribution of the medals amongst all the countries. United States of America has the highest number of medals.

### Q. What is the distribution of the Gold Medal?


In [None]:
# Bar chart of gold medals for the countries that won at least one gold.
#
# Select those countries by value instead of a hard-coded row slice: a fixed
# slice silently depends on the CSV's row order and on a hand-counted
# cut-off, and drops or adds the wrong rows if either is off.
gold_winners = df[df['Gold Medal'] > 0]

# Activate the style before creating the figure so the whole figure is drawn
# with one consistent theme.
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Gold Medals', fontsize=20)
plt.title('Distribution of Number of Gold Medals', fontsize=20)
plt.bar(gold_winners['Country'], gold_winners['Gold Medal'], color ='gold',width = 0.8, ec='#21209c')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()



### We can see the overall distribution of the gold medals amongst all the countries. United States of America has the highest number of gold medals.

### Q. What is the distribution of the Silver Medal?


In [None]:
# Bar chart of silver medals for the countries that won at least one silver.
#
# Select those countries by value instead of a hard-coded row slice: a fixed
# slice silently depends on the CSV's row order and on a hand-counted
# cut-off, and drops or adds the wrong rows if either is off.
silver_winners = df[df['Silver Medal'] > 0]

# Activate the style before creating the figure so the whole figure is drawn
# with one consistent theme.
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Silver Medals', fontsize=20)
plt.title('Distribution of Number of Silver Medals', fontsize=20)
plt.bar(silver_winners['Country'], silver_winners['Silver Medal'], color ='Silver',width = 0.8, ec='#21209c')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()



### We can see the overall distribution of the silver medals amongst all the countries. United States of America has the highest number of silver medals.




### Q. What is the distribution of the Bronze Medal?


In [None]:
# Bar chart of the bronze medal count for every country in the table.
#
# Activate the style before creating the figure: style settings are read
# when each figure/axes/text object is built, so switching style mid-way can
# leave the figure with a mix of two themes.
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Bronze Medals', fontsize=20)
plt.title('Distribution of Number of Bronze Medals', fontsize=20)
plt.bar(df['Country'], df['Bronze Medal'], color ='peru',width = 0.8, ec='#21209c')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()



### We can see the overall distribution of the bronze medals amongst all the countries. United States of America has the highest number of bronze medals.




## Let's analyze the medal count of the top 10 countries in Olympics 2020

In [None]:
# Take the first ten rows of the table (relies on the CSV's existing row order).
top_n_rows = 10
top_df = df.head(top_n_rows)

In [None]:
# Show the ten-row subset as this cell's output.
top_df

## Visualizing the performance of the top 10 countries

In [None]:
# Bar chart of total medals for the ten countries held in top_df.
#
# Activate the style before creating the figure: style settings are read
# when each figure/axes/text object is built, so switching style mid-way can
# leave the figure with a mix of two themes.
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.xlabel('Country', fontsize=20)
plt.ylabel('Number of Medals', fontsize=20)
plt.title('Distribution of Total Number of Medals in Olympics 2020', fontsize=20)
plt.bar(top_df['Country'], top_df['Total'], color ='aqua',width = 0.8, ec='#21209c')
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()

In [None]:
# Pie chart of each top_df country's share of that group's total medals.
#
# Activate the style before creating the figure so the whole figure is drawn
# with one consistent theme.
plt.style.use('dark_background')
plt.figure(figsize=(20, 10))
plt.grid(False)
plt.title('Distribution of Total Number of Medals in Olympics 2020', fontsize=20)
# A pie chart has no x-axis ticks, so the tick rotation copied from the bar
# charts was dropped.
plt.pie(top_df['Total'], labels = top_df.Country)
plt.show()

## Can we predict the Rank By Total from the number of gold, silver, and bronze medals?

#### I'll be using LGBMRegressor

In [None]:
# Columns that must not be fed to the model: the country name, the medal
# total, and the target itself.
non_feature_columns = ['Country', 'Total', 'Rank By Total']

df_y = df['Rank By Total']  # labels
df_x = df.drop(columns=non_feature_columns)  # data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

In [None]:
from lightgbm import LGBMRegressor

# Fit a gradient-boosted regressor with default hyper-parameters on the
# training split; fit() returns the estimator itself.
regressor = LGBMRegressor().fit(X_train, y_train)
regressor

In [None]:
# Report the regressor's score() on the training split and on the held-out split.
scored_splits = (
    ('Training Regression:', X_train, y_train),
    ('Test Regression:', X_test, y_test),
)
for split_label, split_features, split_target in scored_splits:
    print(split_label, regressor.score(split_features, split_target))

## We are able to predict the Rank By Total with good accuracy (the scores printed above are R² values)