In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# for creating gif-images
import imageio
import os.path

# Read data files

In [2]:
# olympic games data
olympics = pd.read_csv('../olympics/olympics_upd.csv')
olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284851 entries, 0 to 284850
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   name        284851 non-null  object 
 1   sex         284851 non-null  object 
 2   age         275536 non-null  float64
 3   team        284851 non-null  object 
 4   noc         284851 non-null  object 
 5   games       284851 non-null  object 
 6   year        284851 non-null  int64  
 7   season      284851 non-null  object 
 8   city        284851 non-null  object 
 9   sport       284851 non-null  object 
 10  event       284851 non-null  object 
 11  medal       42220 non-null   object 
 12  birth_year  275536 non-null  float64
 13  country     284828 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 30.4+ MB


In [3]:
# setting sort order for 'medal' field (by making 'medal' a Categorical field)
medals_order = ['Bronze', 'Silver', 'Gold']
olympics['medal'] = pd.Categorical(olympics['medal'], medals_order)

# rename column
olympics.rename(columns={'sex': 'gender'}, inplace=True)

In [4]:
# olympic games data
countries = pd.read_csv('../olympics/countries_upd.csv')
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   continent  246 non-null    object 
 1   region     246 non-null    object 
 2   country    246 non-null    object 
 3   capital    233 non-null    object 
 4   fips       236 non-null    object 
 5   iso_2      238 non-null    object 
 6   iso_3      239 non-null    object 
 7   iso_no     239 non-null    float64
 8   internet   237 non-null    object 
 9   note       15 non-null     float64
dtypes: float64(2), object(8)
memory usage: 19.3+ KB


In [5]:
# Adding DECADES to Olympics data
olympics['decade'] = (olympics['year'] // 10) * 10
olympics.sample(5)

Unnamed: 0,name,gender,age,team,noc,games,year,season,city,sport,event,medal,birth_year,country,decade
243942,Ekaterina Stefanova Dafovska (-Prodanova),F,18.0,Bulgaria,BUL,1994 Winter,1994,Winter,Lillehammer,Biathlon,Biathlon Women's 4 x 7.5 kilometres Relay,,1976.0,Bulgaria,1990
21551,Karin Bormann (-Kusch),F,18.0,West Germany,FRG,1972 Summer,1972,Summer,Munich,Swimming,Swimming Women's 200 metres Backstroke,,1954.0,Germany,1970
61273,Masaya Fukuda,M,22.0,Japan,JPN,1968 Summer,1968,Summer,Mexico City,Fencing,"Fencing Men's Foil, Team",,1946.0,Japan,1960
126558,Cecil Charles McMaster,M,28.0,South Africa,RSA,1924 Summer,1924,Summer,Paris,Athletics,Athletics Men's 10 kilometres Walk,Bronze,1896.0,South Africa,1920
196894,Toshinao Tomie,M,25.0,Japan,JPN,1936 Summer,1936,Summer,Berlin,Athletics,Athletics Men's 800 metres,,1911.0,Japan,1930


# Part 1 - Geting Sports with maximum number of Olympic games

In [6]:
olympics.head(5)

Unnamed: 0,name,gender,age,team,noc,games,year,season,city,sport,event,medal,birth_year,country,decade
0,A Dijiang,M,24.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,1968.0,China,1990
1,A Lamusi,M,23.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,1989.0,China,2010
2,Gunnar Nielsen Aaby,M,24.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,1896.0,Denmark,1920
3,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1866.0,Denmark,1900
4,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,,1914.0,Netherlands,1930


In [9]:
# getting sports with all events as separate dataframe

sports_big = olympics[['sport', 'event', 'decade', 'year', 'season', 'city']]
# sports_big.info() # 284851 entries

sports_big.drop_duplicates(inplace=True)
# sports_big.info() # 6537 entries

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sports.drop_duplicates(inplace=True)


In [11]:
# getting aggregation for 'sport' level only
sports = olympics[['sport', 'decade', 'year', 'season', 'city']]
# sports.info() # 284851 entries

sports.drop_duplicates(inplace=True)
sports.info() # 952 entries

<class 'pandas.core.frame.DataFrame'>
Int64Index: 952 entries, 0 to 243681
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   sport   952 non-null    object
 1   decade  952 non-null    int64 
 2   year    952 non-null    int64 
 3   season  952 non-null    object
 4   city    952 non-null    object
dtypes: int64(2), object(3)
memory usage: 44.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sports.drop_duplicates(inplace=True)


### Sports over years

In [25]:
sports_stats_years = sports.groupby(by=['season', 'sport'])['year'].describe()
sports_stats_years.reset_index(inplace=True)
sports_stats_years.sort_values(by='count', ascending=False, inplace=True)
sports_stats_years.head(30)

Unnamed: 0,season,sport,count,mean,std,min,25%,50%,75%,max
57,Summer,Swimming,30.0,1958.733333,38.974734,1896.0,1925.0,1962.0,1991.0,2020.0
29,Summer,Fencing,30.0,1958.733333,38.974734,1896.0,1925.0,1962.0,1991.0,2020.0
7,Summer,Athletics,30.0,1958.733333,38.974734,1896.0,1925.0,1962.0,1991.0,2020.0
33,Summer,Gymnastics,29.0,1956.62069,37.876127,1896.0,1924.0,1960.0,1988.0,2016.0
48,Summer,Rowing,29.0,1960.896552,37.787323,1900.0,1928.0,1964.0,1992.0,2020.0
20,Summer,Cycling,29.0,1956.62069,37.876127,1896.0,1924.0,1960.0,1988.0,2016.0
69,Summer,Wrestling,29.0,1960.758621,38.024104,1896.0,1928.0,1964.0,1992.0,2020.0
31,Summer,Football,28.0,1961.928571,38.062258,1900.0,1927.0,1966.0,1993.0,2020.0
26,Summer,Diving,28.0,1963.071429,36.585697,1904.0,1931.0,1966.0,1993.0,2020.0
52,Summer,Shooting,28.0,1961.785714,38.418126,1896.0,1930.0,1966.0,1993.0,2020.0


In [26]:
# sports included only once in Olympic games
sports_stats_years_rare = sports_stats_years[sports_stats_years['count']==1]
sports_stats_years_rare.sort_values(by='min', ascending=False, inplace=True)
sports_stats_years_rare

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sports_stats_years_rare.sort_values(by='min', ascending=False, inplace=True)


Unnamed: 0,season,sport,count,mean,std,min,25%,50%,75%,max
21,Summer,Cycling BMX Freestyle,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
27,Summer,Equestrian,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
55,Summer,Sport Climbing,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
24,Summer,Cycling Road,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
22,Summer,Cycling BMX Racing,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
39,Summer,Karate,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
41,Summer,Marathon Swimming,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
16,Summer,Canoe Sprint,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
15,Summer,Canoe Slalom,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
53,Summer,Skateboarding,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0


### Sports over decades

In [27]:
sports_stats_decades = sports.groupby(by=['season', 'sport'])['decade'].describe()
sports_stats_decades.reset_index(inplace=True)
sports_stats_decades.sort_values(by='count', ascending=False, inplace=True)
sports_stats_decades.head(30)

Unnamed: 0,season,sport,count,mean,std,min,25%,50%,75%,max
57,Summer,Swimming,30.0,1954.666667,39.280892,1890.0,1920.0,1960.0,1987.5,2020.0
29,Summer,Fencing,30.0,1954.666667,39.280892,1890.0,1920.0,1960.0,1987.5,2020.0
7,Summer,Athletics,30.0,1954.666667,39.280892,1890.0,1920.0,1960.0,1987.5,2020.0
33,Summer,Gymnastics,29.0,1952.413793,37.952524,1890.0,1920.0,1960.0,1980.0,2010.0
48,Summer,Rowing,29.0,1956.896552,37.994685,1900.0,1920.0,1960.0,1990.0,2020.0
20,Summer,Cycling,29.0,1952.413793,37.952524,1890.0,1920.0,1960.0,1980.0,2010.0
69,Summer,Wrestling,29.0,1956.551724,38.570516,1890.0,1920.0,1960.0,1990.0,2020.0
31,Summer,Football,28.0,1957.857143,38.331608,1900.0,1920.0,1960.0,1990.0,2020.0
26,Summer,Diving,28.0,1958.928571,37.052444,1900.0,1927.5,1960.0,1990.0,2020.0
52,Summer,Shooting,28.0,1957.857143,38.620388,1890.0,1927.5,1960.0,1990.0,2020.0


In [29]:
# sports included only once in Olympic games
sports_stats_decades_rare = sports_stats_decades[sports_stats_decades['count']==1]
sports_stats_decades_rare.sort_values(by='min', ascending=False, inplace=True)
sports_stats_decades_rare

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sports_stats_decades_rare.sort_values(by='min', ascending=False, inplace=True)


Unnamed: 0,season,sport,count,mean,std,min,25%,50%,75%,max
21,Summer,Cycling BMX Freestyle,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
53,Summer,Skateboarding,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
25,Summer,Cycling Track,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
55,Summer,Sport Climbing,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
24,Summer,Cycling Road,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
22,Summer,Cycling BMX Racing,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
39,Summer,Karate,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
41,Summer,Marathon Swimming,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
16,Summer,Canoe Sprint,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
15,Summer,Canoe Slalom,1.0,2020.0,,2020.0,2020.0,2020.0,2020.0,2020.0
