In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

# for creating gif-images
import imageio
import os.path

### User-defined functions

In [2]:
def create_gif(range_min, range_max, input_image_folder, input_file_name_part, output_file_name='output', pics_per_second=1):
    images = []

    for range_date in range(range_min, range_max+1):
        filename = f'../images_work/{input_image_folder}/{input_file_name_part}_{range_date}.png'
        if not os.path.exists(filename):
            continue
    
        images.append(imageio.imread(filename))
        imageio.mimsave(f'../images/{output_file_name}.gif', images, fps=pics_per_second)

# Read data files

In [3]:
# olympic games data
olympics = pd.read_csv('../olympics/olympics_upd.csv')
olympics

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal,birth_year,country,decade
0,A Dijiang,M,24.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,1968.0,China,1990
1,A Lamusi,M,23.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,1989.0,China,2010
2,Gunnar Nielsen Aaby,M,24.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,1896.0,Denmark,1920
3,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1866.0,Denmark,1900
4,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,,1914.0,Netherlands,1930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284846,Andrzej ya,M,29.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,,1947.0,Poland,1970
284847,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",,1987.0,Poland,2010
284848,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",,1987.0,Poland,2010
284849,Tomasz Ireneusz ya,M,30.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,,1968.0,Poland,1990


In [4]:
olympics['medal'].unique()

array([nan, 'Gold', 'Bronze', 'Silver'], dtype=object)

In [5]:
# setting sort order for 'medal' field (by making 'medal' a Categorical field)
medals_order = ['Bronze', 'Silver', 'Gold']
olympics['medal'] = pd.Categorical(olympics['medal'], medals_order)

In [6]:
olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284851 entries, 0 to 284850
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   name        284851 non-null  object  
 1   sex         284851 non-null  object  
 2   age         275536 non-null  float64 
 3   team        284851 non-null  object  
 4   noc         284851 non-null  object  
 5   games       284851 non-null  object  
 6   year        284851 non-null  int64   
 7   season      284851 non-null  object  
 8   city        284851 non-null  object  
 9   sport       284851 non-null  object  
 10  event       284851 non-null  object  
 11  medal       42220 non-null   category
 12  birth_year  275536 non-null  float64 
 13  country     284828 non-null  object  
 14  decade      284851 non-null  int64   
dtypes: category(1), float64(2), int64(2), object(10)
memory usage: 30.7+ MB


In [7]:
# iso3 codes for olympic countries
iso3_codes = pd.read_csv('../olympics/iso3_pairs.csv')
iso3_codes

Unnamed: 0,country,iso3
0,Lebanon,LBN
1,Singapore,SGP
2,Russia,RUS
3,Afghanistan,AFG
4,Albania,ALB
...,...,...
200,Republic of Congo,COG
201,Democratic Republic of the Congo,COD
202,"Virgin Islands, British",VGB
203,South Sudan,SSD


# Data Wrangling

In [8]:
# Adding DECADES to Olympics data
olympics['decade'] = (olympics['year'] // 10) * 10
olympics.sample(5)

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal,birth_year,country,decade
32158,Andrs Charadia Alfieri,M,26.0,Argentina,ARG,1992 Summer,1992,Summer,Barcelona,Athletics,Athletics Men's Hammer Throw,,1966.0,Argentina,1990
3996,Rnar (Ruslan-) Alexandersson (Ovtinnikov-),M,23.0,Iceland,ISL,2000 Summer,2000,Summer,Sydney,Gymnastics,Gymnastics Men's Individual All-Around,,1977.0,Iceland,2000
115899,Hctor Lpez,M,19.0,Venezuela,VEN,1972 Summer,1972,Summer,Munich,Athletics,Athletics Men's 800 metres,,1953.0,Venezuela,1970
50125,Maryvonne Dupureur (Samson-),F,27.0,France,FRA,1964 Summer,1964,Summer,Tokyo,Athletics,Athletics Women's 800 metres,Silver,1937.0,France,1960
52494,Alexis Elizalde Estvez,M,28.0,Cuba,CUB,1996 Summer,1996,Summer,Atlanta,Athletics,Athletics Men's Discus Throw,,1968.0,Cuba,1990


### Part 1 - Medals for national teams over the years

In [9]:
# temporary DataFrame for getting grouped data for countries
mask_medals = ~(olympics['medal'].isnull())
country_medals_tmp = olympics[mask_medals][['country', 'decade', 'year', 'medal', 'games', 'season']]
country_medals_tmp # 42220 rows

Unnamed: 0,country,decade,year,medal,games,season
3,Denmark,1900,1900,Gold,1900 Summer,Summer
12,Finland,1920,1920,Bronze,1920 Summer,Summer
13,Finland,1920,1920,Bronze,1920 Summer,Summer
15,Finland,1940,1948,Bronze,1948 Summer,Summer
16,Finland,1940,1948,Gold,1948 Summer,Summer
...,...,...,...,...,...,...
284692,Russia,1990,1992,Gold,1992 Winter,Winter
284736,Serbia,1980,1988,Silver,1988 Winter,Winter
284768,Switzerland,1980,1988,Gold,1988 Winter,Winter
284770,Switzerland,1980,1988,Bronze,1988 Winter,Winter


In [10]:
# checking the filtered data for number of rows and nulls
country_medals_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42220 entries, 3 to 284778
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   country  42220 non-null  object  
 1   decade   42220 non-null  int64   
 2   year     42220 non-null  int64   
 3   medal    42220 non-null  category
 4   games    42220 non-null  object  
 5   season   42220 non-null  object  
dtypes: category(1), int64(2), object(3)
memory usage: 2.0+ MB


In [11]:
# adding iso3 country name
# country_medals_tmp and iso3_codes

country_medals_tmp = pd.merge(left=country_medals_tmp, right=iso3_codes, how='left', on='country', suffixes=('_x', '_y'))
country_medals_tmp

Unnamed: 0,country,decade,year,medal,games,season,iso3
0,Denmark,1900,1900,Gold,1900 Summer,Summer,DNK
1,Finland,1920,1920,Bronze,1920 Summer,Summer,FIN
2,Finland,1920,1920,Bronze,1920 Summer,Summer,FIN
3,Finland,1940,1948,Bronze,1948 Summer,Summer,FIN
4,Finland,1940,1948,Gold,1948 Summer,Summer,FIN
...,...,...,...,...,...,...,...
42215,Russia,1990,1992,Gold,1992 Winter,Winter,RUS
42216,Serbia,1980,1988,Silver,1988 Winter,Winter,SRB
42217,Switzerland,1980,1988,Gold,1988 Winter,Winter,CHE
42218,Switzerland,1980,1988,Bronze,1988 Winter,Winter,CHE


**All medals per country per year:**

In [12]:
# group data 
country_medals_all = country_medals_tmp.groupby(by=['country', 'iso3', 'year'])[['games']].count()
country_medals_all.reset_index(inplace=True)
#country_medals_all.columns = ['country', 'year', 'number_of_medals']
country_medals_all.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_all.sort_values(by=['year', 'number_of_medals'], ascending=False, inplace=True)
country_medals_all

Unnamed: 0,country,iso3,year,number_of_medals
1464,USA,USA,2020,298
277,China,CHN,2020,149
1133,Russia,RUS,2020,149
502,France,FRA,2020,141
1428,UK,GBR,2020,141
...,...,...,...,...
354,Denmark,DNK,1896,6
579,Hungary,HUN,1896,6
71,Austria,AUT,1896,5
35,Australia,AUS,1896,3


In [13]:
# range of years in dataframe:
min_year = country_medals_all['year'].min()
max_year = country_medals_all['year'].max()

In [46]:
    xaxis_max = country_medals_all['number_of_medals'].max()+30
    year = 2020
    bar_title = f'Medals per country in {year}'

    df=country_medals_all[country_medals_all['year']==year].head(10)[['country', 'number_of_medals']]
    df.sort_values(by='number_of_medals', ascending=False, inplace=True)
    
    # if there is no data for current year, skip plotting and process next year data
    #if df.size==0:
    #    continue

    fig = px.bar(df, x='number_of_medals', y='country', orientation='h', 
             #title = f'Number of medals in {year}', 
             labels = {'country':'Country', 'number_of_medals': 'Number of '},
             color = 'country',
             color_discrete_sequence = px.colors.sequential.Plasma_r); #Antique);

    # for annotations 
    x_data = df['number_of_medals'].to_list()
    y_data = df['country'].to_list()
    annotations = []

    for yd, xd in zip(y_data, x_data):
        # labeling each bar
        annotations.append(dict(xref='x', yref='y',
                            x=xd + 20, y=yd,
                            text=str(xd),
                            font=dict(family='Times New Roman',   # 'Arial', 
                                      size=18, # color='rgb(248, 248, 255)'
                                      color='RebeccaPurple'  #'cornflowerblue'
                                     ),
                            showarrow=False))

    fig.update_layout(
        xaxis=dict(
            showgrid=False,
            showline=True,
            showticklabels=True,
            zeroline=False,
            #domain=[0.15, 1]
        ),
        yaxis=dict(
            showgrid=False,
            showline=True,
            showticklabels=True,
            zeroline=False,
        ),
        xaxis_range=[0,xaxis_max], # making x_axis of fixed range
        #barmode='stack',
        paper_bgcolor='rgb(248, 248, 255)',
        plot_bgcolor='rgb(248, 248, 255)',
        margin=dict(l=150, r=20, t=50, b=80),
        showlegend=False,
        
        title={
            'text': f'Number of medals in {year}',
            'y':0.95,
            'x':0.50,
            'xanchor': 'center',
            'yanchor': 'top'},
        
        font=dict(
            family='Times New Roman',   #"Courier New, monospace",
            size=18,
            color="RebeccaPurple"),
    
        annotations=annotations
    )

    #fig.write_image(f'../images_work/medals_all/country_medals_all_barplot_{year}.png')
    pio.write_image(fig, f'../images/country_medals_all_barplot_{year}.png',scale=6, width=512, height=384)
    fig.show()

In [32]:
# to plot dynamic barchart
# use plotly, save pics
xaxis_max = country_medals_all['number_of_medals'].max()+30

for curr_year in range(min_year, max_year + 1):
        
    year = curr_year
    bar_title = f'Medals per country in {year}'

    df=country_medals_all[country_medals_all['year']==year].head(10)[['country', 'number_of_medals']]
    df.sort_values(by='number_of_medals', ascending=False, inplace=True)
    
    # if there is no data for current year, skip plotting and process next year data
    if df.size==0:
        continue

    fig = px.bar(df, x='number_of_medals', y='country', orientation='h', 
             #title = bar_title, 
             labels = {'country':'Country', 'number_of_medals': f'Number of medals in {year}'},
             color = 'country',
             color_discrete_sequence = px.colors.sequential.Plasma_r); #Antique);

    # for annotations 
    x_data = df['number_of_medals'].to_list()
    y_data = df['country'].to_list()
    annotations = []

    for yd, xd in zip(y_data, x_data):
        # labeling each bar
        annotations.append(dict(xref='x', yref='y',
                            x=xd + 20, y=yd,
                            text=str(xd),
                            font=dict(family='Times New Roman',   # 'Arial', 
                                      size=16, # color='rgb(248, 248, 255)'
                                      color='cornflowerblue'
                                     ),
                            showarrow=False))

    fig.update_layout(
        xaxis=dict(
            showgrid=False,
            showline=True,
            showticklabels=True,
            zeroline=False,
            #domain=[0.15, 1]
        ),
        yaxis=dict(
            showgrid=False,
            showline=True,
            showticklabels=True,
            zeroline=False,
        ),
        xaxis_range=[0,xaxis_max], # making x_axis of fixed range
        #barmode='stack',
        paper_bgcolor='rgb(248, 248, 255)',
        plot_bgcolor='rgb(248, 248, 255)',
        margin=dict(l=150, r=20, t=50, b=80),
        showlegend=False,
    
        annotations=annotations
    )

    #fig.write_image(f'../images_work/medals_all/country_medals_all_barplot_{year}.png')
    pio.write_image(fig, f'../images_work/medals_all/country_medals_all_barplot_{year}.png',scale=6, width=512, height=384)
    #fig.show()

In [33]:
# make an animation over years
# use used-defined function
create_gif(range_min=min_year, 
           range_max=max_year, 
           input_image_folder='medals_all', 
           input_file_name_part='country_medals_all_barplot', 
           output_file_name='all_medals_over_years', # optional; default is 'output'
           pics_per_second=1.5) # optional; default is '1' seconds





**All medals per country per decade:**

In [16]:
# group data 
country_medals_all_decade = country_medals_tmp.groupby(by=['country','iso3', 'decade'])[['games']].count()
country_medals_all_decade.reset_index(inplace=True)
country_medals_all_decade.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_all_decade.sort_values(by=['decade', 'number_of_medals'], ascending=False, inplace=True)


In [17]:
country_medals_all_decade['decade'].nunique()

14

In [18]:
# plot ?


### Part 2 - Medals for national teams ever (all years)

In [19]:
# National Olympic Committees with maximum number of all medals ever (all years):
country_medals_ever = country_medals_tmp.groupby(by=['country', 'iso3'])[['games']].count()
country_medals_ever.reset_index(inplace=True)
country_medals_ever.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_ever.sort_values(by='number_of_medals', ascending=False, inplace=True)
country_medals_ever.reset_index(inplace=True, drop=True)
country_medals_ever

Unnamed: 0,country,iso3,number_of_medals
0,USA,USA,5935
1,Russia,RUS,4096
2,Germany,DEU,3837
3,UK,GBR,2208
4,France,FRA,1908
...,...,...,...
132,Tonga,TON,1
133,Togo,TGO,1
134,Sudan,SDN,1
135,Eritrea,ERI,1


In [39]:
# plotting a map 'medals ever'

fig = px.choropleth(data_frame=country_medals_ever, 
    locations="iso3", 
    projection='natural earth',    # 'orthographic',
    color="number_of_medals", # lifeExp is a column of gapminder
    hover_name="country", # column to add to hover information
    color_continuous_scale=px.colors.sequential.Plasma, #Blues
    labels={'number_of_medals': 'Number of Medals'}
)

fig.update_layout(
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))

"""
fig.update_layout(
    title={
        'text': 'Medals for National Teams ever for Summer Olympic Games',
        'y':0.95,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))
"""

fig.write_html('../images/country_medals_ever.html', include_plotlyjs='cdn') # "content delivery network"
#fig.write_image('../images/country_medals_ever.png')
pio.write_image(fig, '../images/country_medals_ever.png',scale=10, width=1024, height=540)
fig.show()

**Medals for national teams ever (all years) - Summer:**

In [21]:
# National Olympic Committees with maximum number of all medals ever (all years):
summer_mask = country_medals_tmp['season']=='Summer'
country_medals_ever_summer = country_medals_tmp[summer_mask].groupby(by=['country', 'iso3'])[['games']].count()
country_medals_ever_summer.reset_index(inplace=True)
country_medals_ever_summer.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_ever_summer.sort_values(by='number_of_medals', ascending=False, inplace=True)
country_medals_ever_summer.reset_index(inplace=True, drop=True)
country_medals_ever_summer

Unnamed: 0,country,iso3,number_of_medals
0,USA,USA,5300
1,Russia,RUS,3337
2,Germany,DEU,3207
3,UK,GBR,2125
4,France,FRA,1758
...,...,...,...
130,Gabon,GAB,1
131,Tonga,TON,1
132,Togo,TGO,1
133,Senegal,SEN,1


In [34]:
# plot the Summer map
fig = px.choropleth(data_frame=country_medals_ever_summer, 
    locations="iso3", 
    projection='natural earth',    # 'orthographic',
    color="number_of_medals", # lifeExp is a column of gapminder
    hover_name="country", # column to add to hover information
    color_continuous_scale=px.colors.sequential.Plasma, #Blues
    labels={'number_of_medals': 'Number of Medals'}
)

fig.update_layout(
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))

"""
fig.update_layout(
    title={
        'text': 'Medals for National Teams ever for Summer Olympic Games',
        'y':0.95,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))
"""
fig.write_html('../images/country_medals_ever_summer.html', include_plotlyjs='cdn') # "content delivery network"
pio.write_image(fig, '../images/country_medals_ever_summer.png',scale=10, width=1024, height=540)
#fig.write_image('../images/country_medals_ever_summer.png', scale=6, width=1080, height=1080)
fig.show()

**Medals for national teams ever (all years) - Winter:**

In [23]:
# National Olympic Committees with maximum number of all medals ever (all years):
winter_mask = country_medals_tmp['season']=='Winter'
country_medals_ever_winter = country_medals_tmp[winter_mask].groupby(by=['country', 'iso3'])[['games']].count()
country_medals_ever_winter.reset_index(inplace=True)
country_medals_ever_winter.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_ever_winter.sort_values(by='number_of_medals', ascending=False, inplace=True)
country_medals_ever_winter.reset_index(inplace=True, drop=True)
country_medals_ever_winter

Unnamed: 0,country,iso3,number_of_medals
0,Russia,RUS,759
1,USA,USA,635
2,Germany,DEU,630
3,Canada,CAN,611
4,Norway,NOR,443
5,Sweden,SWE,428
6,Finland,FIN,426
7,Austria,AUT,280
8,Switzerland,CHE,275
9,Czech Republic,CZE,231


In [24]:
# plot the Winter map
fig = px.choropleth(data_frame=country_medals_ever_winter, 
    locations="iso3", 
    projection='natural earth',    # 'orthographic',
    color="number_of_medals", # lifeExp is a column of gapminder
    hover_name="country", # column to add to hover information
    color_continuous_scale=px.colors.sequential.Plasma, #Blues
    labels={'number_of_medals': 'Number of Medals'}
)

fig.update_layout(
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))

"""
fig.update_layout(
    title={
        'text': 'Medals for National Teams ever for Summer Olympic Games',
        'y':0.95,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(
        family='Times New Roman',   #"Courier New, monospace",
        size=18,
        color="RebeccaPurple"))
"""

fig.write_html('../images/country_medals_ever_winter.html', include_plotlyjs='cdn') # "content delivery network"
#fig.write_image('../images/country_medals_ever_winter.png')
pio.write_image(fig, '../images/country_medals_ever_winter.png',scale=6, width=1024, height=540)
fig.show()

**List of countries with 1 olympic medal ever:**

In [25]:
# getting list of countries with 1 medal ever
mask = country_medals_ever['number_of_medals']==1

single_medal_list = country_medals_ever[mask]['country'] # len(single_medal_list) = 19 countries

olympisc_single_medal = olympics[(olympics['country'].isin(single_medal_list)) & ~(olympics['medal'].isnull())]
olympisc_single_medal.sort_values(by=['season', 'year', 'medal'], ascending=False, inplace=True)

olympisc_single_medal[[ 'country', 'season', 'year', 'city', 'medal', 'sport', 'event', 'name', 'sex', 'age']]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,country,season,year,city,medal,sport,event,name,sex,age
241687,Nepal,Winter,1924,Chamonix,Gold,Alpinism,Alpinism Mixed Alpinism,Tejbir Bura,M,
226005,Turkmenistan,Summer,2020,Tokyo,Silver,Weightlifting,Women's 59kg,GURYEVA Polina,F,21.0
236091,Burkina Faso,Summer,2020,Tokyo,Bronze,Athletics,Men's Triple Jump,ZANGO Hugues Fabrice,M,28.0
12825,Guatemala,Summer,2012,London,Silver,Athletics,Athletics Men's 20 kilometres Walk,Erick Bernab Barrondo Garca,M,21.0
101257,Cyprus,Summer,2012,London,Silver,Sailing,Sailing Men's One Person Dinghy,Pavlos Kontides,M,22.0
142761,Gabon,Summer,2012,London,Silver,Taekwondo,Taekwondo Men's Heavyweight,Anthony Obame Mylann,M,23.0
85300,Sudan,Summer,2008,Beijing,Silver,Athletics,Athletics Men's 800 metres,Ismail Ahmed Ismail,M,23.0
22375,Togo,Summer,2008,Beijing,Bronze,Canoeing,"Canoeing Men's Kayak Singles, Slalom",Benjamin Kudjow Thomas Boukpeti,M,27.0
91156,Mauritius,Summer,2008,Beijing,Bronze,Boxing,Boxing Men's Bantamweight,Louis Richard Bruno Julie,M,30.0
191457,Eritrea,Summer,2004,Athina,Bronze,Athletics,"Athletics Men's 10,000 metres",Zersenay Tadesse Habtesilase,M,22.0


In [26]:
# save as file
olympics.to_csv(path_or_buf='../olympics/olympics_upd.csv', index=False,
                sep=',', na_rep='', header=True, mode='w', encoding='utf-8', decimal='.')

In [27]:
# National Olympic Committees with maximum number of all medals ever (all years):
country_medals_ever_by_season = country_medals_tmp.groupby(by=['country', 'year', 'season'])[['games']].count()
country_medals_ever_by_season.reset_index(inplace=True)
country_medals_ever_by_season.rename(columns={'games': 'number_of_medals'}, inplace=True)
country_medals_ever_by_season.sort_values(by='number_of_medals', ascending=False, inplace=True)
country_medals_ever_by_season

Unnamed: 0,country,year,season,number_of_medals
1270,Russia,1980,Summer,442
1632,USA,1904,Summer,394
1589,UK,1908,Summer,368
1662,USA,1984,Summer,352
1675,USA,2008,Summer,317
...,...,...,...,...
1394,Spain,1972,Summer,1
1392,Spain,1952,Summer,1
1390,Spain,1932,Summer,1
721,India,1996,Summer,1


# --------------------------------------------------------------------------  
# DEBUG NOTES

In [28]:
# list of unique SUMMER Sports:
#summer['sport'].nunique(), summer['sport'].unique()

```python
"""
(70,
 array(['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics',
        'Swimming', 'Badminton', 'Sailing', 'Gymnastics',
        'Art Competitions', 'Handball', 'Weightlifting', 'Wrestling',
        'Water Polo', 'Hockey', 'Rowing', 'Fencing', 'Equestrianism',
        'Shooting', 'Boxing', 'Taekwondo', 'Cycling', 'Diving', 'Canoeing',
        'Tennis', 'Modern Pentathlon', 'Golf', 'Softball', 'Archery',
        'Volleyball', 'Synchronized Swimming', 'Table Tennis', 'Baseball',
        'Rhythmic Gymnastics', 'Rugby Sevens', 'Trampolining',
        'Beach Volleyball', 'Triathlon', 'Rugby', 'Lacrosse', 'Polo',
        'Cricket', 'Ice Hockey', 'Racquets', 'Motorboating', 'Croquet',
        'Figure Skating', 'Jeu De Paume', 'Roque', 'Basque Pelota',
        'Alpinism', 'Aeronautics', 'Cycling Road', 'Artistic Gymnastics',
        'Karate', 'Baseball/Softball', 'Trampoline Gymnastics',
        'Marathon Swimming', 'Canoe Slalom', 'Surfing', 'Canoe Sprint',
        'Cycling BMX Racing', 'Equestrian', 'Artistic Swimming',
        'Cycling Track', 'Skateboarding', 'Cycling Mountain Bike',
        '3x3 Basketball', 'Cycling BMX Freestyle', 'Sport Climbing'],
       dtype=object))
"""
```

In [29]:
# CODE EXAMPLE
# barplot with hue

#sns.histplot(data=categories, x="clicks", hue="season", multiple="dodge", shrink=.8, bins=8);
#plt.savefig("categories_seasonal_clicks_distribution_histplot.png")
#plt.show()

In [30]:
# CODE EXAMPLE:
# getting DECADE and CENTURY out of YEAR values

# DECADE = (YEAR // 10) * 10 
# CENTURY = YEAR // 100 + 1

In [31]:
# CODE EXAMPLE:
# number of unique values for each column

#for col in categories.columns.to_list():
#    print(f'{col}: ', categories[f'{col}'].nunique(), categories[f'{col}'].unique())