In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## What does the data look like?

In [None]:
# Directory of the Olympic dataset inside the Kaggle input mount; change it here
# (and only here) when the notebook runs against a different copy of the data.
DATA_DIR = '/kaggle/input/olympic-games-medals-19862018'

# One CSV file per table.
data_hosts_path = f'{DATA_DIR}/olympic_hosts.csv'
data_medals_path = f'{DATA_DIR}/olympic_medals.csv'
data_results_path = f'{DATA_DIR}/olympic_results.csv'
data_athletes_path = f'{DATA_DIR}/olympic_athletes.csv'

### Hosts Dataset

In [None]:
# Load the hosts table from its CSV file and preview the first rows.
df_hosts = pd.read_csv(data_hosts_path)
df_hosts.head()

In [None]:
# Print the DataFrame.info() summary of the hosts table.
df_hosts.info()

### Medals Dataset

In [None]:
# Load the medals table from its CSV file and preview the first rows.
df_medals = pd.read_csv(data_medals_path)
df_medals.head()

In [None]:
# Print the DataFrame.info() summary of the medals table.
df_medals.info()

### Results Dataset

In [None]:
# Load the results table from its CSV file and preview the first rows.
df_results = pd.read_csv(data_results_path)
df_results.head()

In [None]:
# Print the DataFrame.info() summary of the results table.
df_results.info()

### Athletes Dataset

In [None]:
# Load the athletes table from its CSV file and preview the first rows.
df_athletes = pd.read_csv(data_athletes_path)
df_athletes.head()

In [None]:
# Print the DataFrame.info() summary of the athletes table.
df_athletes.info()

## How to join the datasets (example: medals)

In [None]:
# Attach the host information of each game to every medal row, then discard
# the two join-key columns and the game date columns that are not used later.
data = (
    df_medals
    .merge(df_hosts, how='left', left_on='slug_game', right_on='game_slug')
    .drop(columns=['slug_game', 'game_slug', 'game_end_date', 'game_start_date'])
)
# Normalise the capitalisation of athlete names to title case.
data['athlete_full_name'] = data['athlete_full_name'].str.title()
data.shape

In [None]:
# Preview the joined medals/hosts table.
data.head()

In [None]:
# Map long official country names to short display names.
# Note: both German entries map to the same label 'Germany', so their medals
# are counted together in every later per-country aggregation.
country_dict = {'German Democratic Republic (Germany)':'Germany', 
                'Federal Republic of Germany':'Germany',
                "Democratic People's Republic of Korea":'North Korea',
                'Republic of Korea':'South Korea',
                "People's Republic of China":'China',
                "Islamic Republic of Iran":'Iran',
                "United States of America":'USA'}
# Names that are not keys of the mapping are left unchanged by replace().
data['country_name'] = data['country_name'].replace(country_dict)

## How to preprocess the dataset (example: medals)


For a team event contested by two people, the dataset contains a separate row for each of the two winners, even though the pair earns only one medal in total. For example, see the Tennis Doubles Men event. Let's split the dataset into parts, group the rows of those team events into one row per medal, and concatenate the parts again.

In [None]:
# Medal rows won by an individual athlete.
data_athlete = data.loc[data['participant_type'].eq('Athlete')]
data_athlete.shape

In [None]:
# Medal rows won by a team.
data_team = data.loc[data['participant_type'].eq('GameTeam')]
data_team.shape

In [None]:
# Preview the team medal rows.
data_team.head()

In [None]:
# Team rows without an athlete name; these are kept as they are.
data_team_a = data_team.loc[data_team['athlete_full_name'].isna()]
data_team_a.shape

In [None]:
# Columns that together identify a single team medal.
group_columns = ['discipline_title', 'event_title', 'event_gender', 'medal_type', 'participant_type', 'participant_title',
                 'country_name', 'country_3_letter_code', 'game_location', 'game_season', 'game_name', 'game_year']
# Columns collapsed to one value per medal.
agg_columns = ['country_code', 'athlete_full_name']

# Team rows that carry an athlete name are collapsed to one row per medal:
#  * country_code        -> first non-null value of the group
#  * athlete_full_name   -> unique names in order of appearance
# Previously both were collected with set(), whose iteration order is not stable
# between kernel runs, so the resulting lists (and the picked country code) could differ.
# dropna=False keeps groups whose key columns contain NaN; the default would drop
# those medals silently. NOTE(review): confirm whether key columns can be NaN here.
data_team_b = (
    data_team[data_team['athlete_full_name'].notna()]
    .groupby(group_columns, dropna=False)[agg_columns]
    .agg({'country_code': 'first',
          # tuple (not list) so that pandas treats the aggregate as a single scalar value
          'athlete_full_name': lambda names: tuple(dict.fromkeys(names))})
    .reset_index()
)
# Store the team members as a list, as the original cell did.
data_team_b['athlete_full_name'] = data_team_b['athlete_full_name'].apply(list)
data_team_b.shape

In [None]:
# Stack the three parts row-wise: unnamed team rows, grouped team rows and
# individual rows. The original index labels of each part are kept.
data_medal = pd.concat((data_team_a, data_team_b, data_athlete))
data_medal.shape

## Timeline of disciplines contested at the Olympic Games, 1986-2016

In [None]:
def get_disciplines_game(data, season):
    """Build a discipline-by-year presence table for one season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'game_season', 'discipline_title',
        'game_year' and 'participant_type'.
    season : str
        Value of 'game_season' to keep (the notebook uses 'Summer' and 'Winter').

    Returns
    -------
    pandas.DataFrame
        Indexed by discipline title, one column per game year. Column labels are
        the year split over two lines (e.g. 1996 -> '19\\n96'). A cell is 1 when
        the discipline has a positive row count for that year and NaN when the
        discipline/year combination does not occur. Disciplines present in the
        last year come first; each of the two groups is sorted by its year columns.
    """
    season_rows = data[data['game_season'] == season].reset_index(drop=True)
    counts = (
        season_rows
        .groupby(['discipline_title', 'game_year'])['participant_type']
        .count()
        .reset_index()
    )
    # Keyword arguments are required: pandas 2.0 no longer accepts positional pivot() arguments.
    presence = counts.pivot(index='discipline_title', columns='game_year', values='participant_type')
    # Only presence matters for the heat map, so collapse every positive count to 1.
    presence[presence > 0] = 1

    year_columns = list(presence.columns)
    in_last_game = presence[year_columns[-1]] == 1
    presence = pd.concat([
        presence[in_last_game].sort_values(year_columns),
        presence[~in_last_game].sort_values(year_columns),
    ])
    # Break each year label after the century digits so the tick labels stay narrow.
    presence.columns = [str(col)[:-2] + '\n' + str(col)[-2:] for col in presence.columns]
    return presence

In [None]:
def plot_disciplines(data, season, size=(18, 18)):
    """Draw a discipline-by-year table as a heat map and show it.

    Parameters
    ----------
    data : table accepted by seaborn.heatmap
        The notebook passes the result of get_disciplines_game.
    season : str
        Used only for the figure title ('<season> Games').
    size : tuple
        Figure size passed to matplotlib as figsize.
    """
    fig, ax = plt.subplots(figsize=size)
    sns.heatmap(data, annot=False, cbar=False, linewidths=.8, cmap='Spectral', ax=ax)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, which='major', labelsize=16)
    # Put the year labels above the grid.
    ax.xaxis.tick_top()
    ax.set(ylabel='', xlabel='')
    ax.set_title(f'{season} Games', size=20)
    fig.tight_layout()
    plt.show()

In [None]:
# Discipline timeline for the Summer Games.
season, size = 'Summer', (20, 20)
disciplines = get_disciplines_game(data_medal, season)
plot_disciplines(disciplines, season=season, size=size)

In [None]:
# Discipline timeline for the Winter Games.
season, size = 'Winter', (16, 16)
disciplines = get_disciplines_game(data_medal, season)
plot_disciplines(disciplines, season=season, size=size)

## Summer Games

In [None]:
# Keep only Summer Games medals; the season column is constant afterwards, so drop it.
data_summer = (
    data_medal[data_medal['game_season'] == 'Summer']
    .reset_index(drop=True)
    .drop(columns=['game_season'])
)
data_summer.shape

In [None]:
# Preview the Summer Games medal rows.
data_summer.head()

## Medal Distribution by Country

In [None]:
def sort_games_name(game_name_list):
    '''
    Sort game names by their last space-separated token (the year).

    The tokens are compared as strings and the sort is stable, so names with
    the same last token keep their input order. Every name is returned
    unchanged; a name without any space is sorted by the whole name.

    Input: ['Sydney 2000', 'Atlanta 1996', 'Beijing 2008', 'Athens 2004']
    Output: ['Atlanta 1996', 'Sydney 2000', 'Athens 2004', 'Beijing 2008']
    '''
    # Sorting the original strings (instead of splitting and re-joining them)
    # avoids prepending a space to names that contain no space at all.
    return sorted(game_name_list, key=lambda name: name.split(' ')[-1])

In [None]:
def get_country_medal(data, country):
    """Count one country's medal rows per discipline and game.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'country_name', 'game_name',
        'discipline_title' and 'participant_type'.
    country : str
        Value of 'country_name' to select.

    Returns
    -------
    pandas.DataFrame
        Indexed by discipline title with one column per game, ordered by
        sort_games_name. With fewer than 10 games the column label is the game
        name with spaces replaced by line breaks; otherwise only the last token
        of the name is kept. A 'Total' column and a 'Total' row hold the sums.
        Discipline/game combinations without rows are NaN.
    """
    country_rows = data[data['country_name'] == country]
    counts = (
        country_rows
        .groupby(['game_name', 'discipline_title'])['participant_type']
        .count()
        .reset_index()
    )
    # Keyword arguments are required: pandas 2.0 no longer accepts positional pivot() arguments.
    medal_table = counts.pivot(index='discipline_title', columns='game_name', values='participant_type')
    medal_table = medal_table[sort_games_name(list(medal_table.columns))]
    # Shorten the labels when there are many games so the x tick labels do not overlap.
    if len(medal_table.columns) < 10:
        medal_table.columns = [col.replace(' ', '\n') for col in medal_table.columns]
    else:
        medal_table.columns = [col.split(' ')[-1] for col in medal_table.columns]
    # Row totals first, then column totals (which therefore include the grand total).
    medal_table['Total'] = medal_table.sum(axis=1)
    medal_table.loc["Total"] = medal_table.sum()
    return medal_table

In [None]:
def plot_country_medal(data):
    """Draw a medal table as an annotated heat map on a 20x20 figure and show it.

    Parameters
    ----------
    data : table accepted by seaborn.heatmap
        The notebook passes the result of get_country_medal.
    """
    fig, ax = plt.subplots(figsize=(20, 20))
    sns.heatmap(
        data,
        annot=True,
        annot_kws={"fontsize": 16},
        cbar=False,
        linewidths=.8,
        fmt='g',
        cmap='coolwarm',
        ax=ax,
    )
    ax.tick_params(axis='x', which='major', labelsize=14)
    ax.tick_params(axis='y', which='major', labelsize=16)
    # Put the game labels above the grid.
    ax.xaxis.tick_top()
    ax.set(ylabel='', xlabel='')
    fig.tight_layout()
    plt.show()

### Top 10 performers

In [None]:
# Number of Summer Games medal rows per country.
country_medal = (
    data_summer
    .groupby('country_name')['participant_type']
    .count()
    .reset_index(name='total_number')
)
# Ten countries with the most medal rows.
country_medal.sort_values('total_number', ascending=False).head(10)

### Medals Heat Map: USA (Top 1 performer)

In [None]:
# Medal table for the USA. Stored under its own name so that it is not
# confused with (or overwritten by) the Ukraine table built further below.
country_medal_usa = get_country_medal(data_summer, 'USA')
plot_country_medal(country_medal_usa)

### Medals Heat Map: Ukraine (Test)

In [None]:
# Medal table and heat map for Ukraine.
country_medal_ua = get_country_medal(data_summer, 'Ukraine')
plot_country_medal(country_medal_ua)

## Disciplines: Tennis

In [None]:
# Summer Games medal rows of the Tennis discipline.
data_tennis = data_summer.loc[data_summer['discipline_title'].eq('Tennis')]
data_tennis.shape

In [None]:
# Individual-athlete tennis medals from 1988 onwards, split by event gender.
data_tennis_women = data_tennis[
    (data_tennis['game_year'] >= 1988)
    & (data_tennis['participant_type'] == 'Athlete')
    & (data_tennis['event_gender'] == 'Women')
]

data_tennis_men = data_tennis[
    (data_tennis['game_year'] >= 1988)
    & (data_tennis['participant_type'] == 'Athlete')
    & (data_tennis['event_gender'] == 'Men')
]

In [None]:
# Countries of the women's medallists: one row per game, one column per medal type.
# Each cell is the list of countries that won that medal at that game.
# pivot() is called with keyword arguments because pandas 2.0 no longer accepts positional ones.
(
    data_tennis_women
    .groupby(['game_name', 'medal_type'])['country_name']
    .apply(list)
    .reset_index()
    .pivot(index='game_name', columns='medal_type', values='country_name')
)

In [None]:
# Countries of the men's medallists: one row per game, one column per medal type.
# Each cell is the list of countries that won that medal at that game.
# pivot() is called with keyword arguments because pandas 2.0 no longer accepts positional ones.
(
    data_tennis_men
    .groupby(['game_name', 'medal_type'])['country_name']
    .apply(list)
    .reset_index()
    .pivot(index='game_name', columns='medal_type', values='country_name')
)