In [1]:
import pandas as pd
import numpy as np

### User-defined functions

In [2]:
# formatting column names
# the function gets dataframe as input and formats its column names; None is ruturned 
def format_columns(df):
    columns_lst = df.columns
    columns_lst = [x.lower() for x in columns_lst] # convert column names to lower case
    columns_lst = [x.replace(' (', '_') for x in columns_lst] # replace ' (' to underscore
    columns_lst = [x.replace(')', '') for x in columns_lst] # drop ')'
    df.columns = columns_lst

# Part 1 - Read data files

### Summer Olympics data

In [3]:
# read csv-file
summer = pd.read_csv('../olympics/Athletes_summer_games.csv')
summer.drop(columns='Unnamed: 0', inplace=True)
summer

Unnamed: 0,Name,Sex,Age,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,A Dijiang,M,24.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,A Lamusi,M,23.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,Gunnar Nielsen Aaby,M,24.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...
237668,ZWOLINSKA Klaudia,F,22.0,Poland,POL,2020 Summer,2020,Summer,Tokyo,Canoe Slalom,Women's Kayak,
237669,ZYKOVA Yulia,F,25.0,Russia,ROC,2020 Summer,2020,Summer,Tokyo,Shooting,50m Rifle 3 Positions Women,Silver
237670,ZYUZINA Ekaterina,F,24.0,Russia,ROC,2020 Summer,2020,Summer,Tokyo,Sailing,Women's One Person Dinghy - Laser Radial,
237671,ZYZANSKA Sylwia,F,24.0,Poland,POL,2020 Summer,2020,Summer,Tokyo,Archery,Women's Individual,


In [4]:
# update column names to lower case
format_columns(summer)
summer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237673 entries, 0 to 237672
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   name    237673 non-null  object 
 1   sex     237673 non-null  object 
 2   age     228484 non-null  float64
 3   team    237673 non-null  object 
 4   noc     237673 non-null  object 
 5   games   237673 non-null  object 
 6   year    237673 non-null  int64  
 7   season  237673 non-null  object 
 8   city    237673 non-null  object 
 9   sport   237673 non-null  object 
 10  event   237673 non-null  object 
 11  medal   36537 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 21.8+ MB


In [5]:
# getting the 'complete row' duplicates: 1386 rows
summer_dups = summer[summer.duplicated()]
summer_dups

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
1066,Dsir Antoine Acket,M,27.0,Belgium,BEL,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
3772,William Truman Aldrich,M,48.0,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
3773,William Truman Aldrich,M,48.0,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
4307,Hermann Reinhard Alker,M,43.0,Germany,GER,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Architecture, Designs F...",
4309,Hermann Reinhard Alker,M,43.0,Germany,GER,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Architecture, Architect...",
...,...,...,...,...,...,...,...,...,...,...,...,...
221722,Anna Katrina Zinkeisen (-Heseltine),F,46.0,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Paintings",
221723,Anna Katrina Zinkeisen (-Heseltine),F,46.0,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Paintings",
221725,Anna Katrina Zinkeisen (-Heseltine),F,46.0,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
221727,Doris Clare Zinkeisen (-Johnstone),F,49.0,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",


In [6]:
# number of duplicated summer rows to drop:
237_673 - 236_287, (237_673 - 236_287)*100 / 237_673

(1386, 0.5831541655972702)

In [7]:
# dropping dups from SUMMER data
summer.drop_duplicates(inplace=True)
summer.info() # 237_673 -> 236_287 = 1_386 (0.58 % of initial summer dataset)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236287 entries, 0 to 237672
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   name    236287 non-null  object 
 1   sex     236287 non-null  object 
 2   age     227257 non-null  float64
 3   team    236287 non-null  object 
 4   noc     236287 non-null  object 
 5   games   236287 non-null  object 
 6   year    236287 non-null  int64  
 7   season  236287 non-null  object 
 8   city    236287 non-null  object 
 9   sport   236287 non-null  object 
 10  event   236287 non-null  object 
 11  medal   36525 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 23.4+ MB


In [8]:
# adding birth_year
# summer['birth_year'] = summer['year'] - summer['age']. astype('Int64')

In [9]:
# checking for nulls in SUMMER data # 9_030 records
summer.isnull().sum()

name           0
sex            0
age         9030
team           0
noc            0
games          0
year           0
season         0
city           0
sport          0
event          0
medal     199762
dtype: int64

In [10]:
# no Age SUMMER rows
summer_no_age = summer[summer['age'].isnull()].sort_values(by='year', ascending=False)
summer_no_age

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
31746,Cha Yong-Hwa,F,,North Korea,PRK,2008 Summer,2008,Summer,Beijing,Gymnastics,Gymnastics Women's Individual All-Around,
31747,Cha Yong-Hwa,F,,North Korea,PRK,2008 Summer,2008,Summer,Beijing,Gymnastics,Gymnastics Women's Uneven Bars,
133920,Moosaka,F,,Uganda,UGA,2000 Summer,2000,Summer,Sydney,Table Tennis,Table Tennis Women's Singles,
149323,Raymond Anthony Papa,M,,Philippines,PHI,1996 Summer,1996,Summer,Atlanta,Swimming,Swimming Men's 200 metres Backstroke,
122153,Abdou Manzo,M,,Niger,NIG,1996 Summer,1996,Summer,Atlanta,Athletics,Athletics Men's Marathon,
...,...,...,...,...,...,...,...,...,...,...,...,...
168531,Richard Rstel,M,,Germany,GER,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Parallel Bars,
168530,Richard Rstel,M,,Germany,GER,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Horse Vault,
40865,Evangelos Damaskos,M,,Greece,GRE,1896 Summer,1896,Summer,Athina,Athletics,Athletics Men's Pole Vault,Bronze
109508,Ioannis Georgios Lavrentis,M,,Greece,GRE,1896 Summer,1896,Summer,Athina,Athletics,Athletics Men's Marathon,


In [11]:
# O P T I O N A L:
# try to complete data with NaN 'age' values with the data from  different years...
# T O   D E B U G (in progress...)

# join to initial dataframe to get birth_year
# summer_no_age_upd = summer_no_age.merge(summer_no_age_source, how='inner', on=['name', 'noc'], suffixes=('_x', '_y'), copy=True)
# summer_no_age_upd

In [12]:
# checking if null 'age' values could be fixed
# summer_no_age['name'].nunique() # 
names_list_summer = summer_no_age['name'].unique()

summer[~(summer['age'].isnull()) & (summer['name'].isin(names_list_summer))] # 90 rows

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
4123,Abdullah Ali,M,28.0,Egypt,EGY,1964 Summer,1964,Summer,Tokyo,Rowing,Rowing Men's Coxed Pairs,
4124,Abdullah Ali,M,28.0,Egypt,EGY,1964 Summer,1964,Summer,Tokyo,Rowing,Rowing Men's Coxed Fours,
13340,Muhammad Bashir,M,25.0,Pakistan,PAK,1960 Summer,1960,Summer,Roma,Wrestling,"Wrestling Men's Welterweight, Freestyle",Bronze
13341,Muhammad Bashir,M,29.0,Pakistan,PAK,1964 Summer,1964,Summer,Tokyo,Wrestling,"Wrestling Men's Lightweight, Freestyle",
13343,Muhammad Bashir,M,17.0,Pakistan,PAK,1952 Summer,1952,Summer,Helsinki,Swimming,Swimming Men's 200 metres Breaststroke,
...,...,...,...,...,...,...,...,...,...,...,...,...
198052,Josef Tom,M,30.0,Czechoslovakia,TCH,1964 Summer,1964,Summer,Tokyo,Athletics,"Athletics Men's 10,000 metres",
205258,Ji Vank,M,22.0,Czech Republic,CZE,2000 Summer,2000,Summer,Sydney,Tennis,Tennis Men's Singles,
205259,Ji Vank,M,30.0,Czech Republic,CZE,2008 Summer,2008,Summer,Beijing,Tennis,Tennis Men's Singles,
210310,Hugo Walser,M,24.0,Liechtenstein,LIE,1964 Summer,1964,Summer,Tokyo,Athletics,Athletics Men's 800 metres,


In [13]:
# number of unique names for which we have missed age and other records with filled age
summer[~(summer['age'].isnull()) & (summer['name'].isin(names_list_summer))]['name'].nunique()

45

In [14]:
# filtering our names which could be potentially fixed using another records with filled age
names_list_summer_source = summer[~(summer['age'].isnull()) & (summer['name'].isin(names_list_summer))]['name'].unique()
names_list_summer_source

array(['Abdullah Ali', 'Muhammad Bashir', 'Franois Bonnet',
       'Jaroslav Bro', 'Fernando Cavallini', 'Eugne Coulon',
       'Dimitrios Deligiannis', 'Mamadou Diallo', 'Mohamed El-Sayed',
       'Ernst Gerlach', 'Francisco Gonzlez Suaste', 'Luis Gonzlez',
       'Luis Hernndez', 'William Hill', 'Karl Hoffmann', 'Ernst Huber',
       'Mohamed Ibrahim', 'Luis Jimnez Pea', 'Nikolay Karasyov',
       'Ioannis Karyofyllis', 'Kim Yeong-Su', 'Kim Yong-Bae',
       'Kazuo Kobayashi', 'Anatoly Ivanovich Kolesov',
       'Konstantinos Konstantinou', 'Jzsef Kovcs', 'Lee Sang-Hun',
       'James McKenzie', 'Pedro Mercado', 'Hans Mller', 'Bla Nagy',
       'Alexandros Nikolopoulos', 'Henry Perry', 'Frantiek ez',
       'Francisco Rodrguez', 'Sonja Roman', 'Hans Roth', 'Josef Schejbal',
       'Johann Schulz', 'Erich Siebert', 'Baldev Singh', 'Devi Singh',
       'Josef Tom', 'Ji Vank', 'Hugo Walser'], dtype=object)

In [15]:
# checking 1 particular name: if it is possible to fill missed 'age' 
summer[summer['name'] == 'Abdullah Ali']

# IMPORTANT: there could be the same 'name' with different 'noc'/'team';
#            also take care of dates ('age' to 'year' to 'year_with_missed_age')

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
4123,Abdullah Ali,M,28.0,Egypt,EGY,1964 Summer,1964,Summer,Tokyo,Rowing,Rowing Men's Coxed Pairs,
4124,Abdullah Ali,M,28.0,Egypt,EGY,1964 Summer,1964,Summer,Tokyo,Rowing,Rowing Men's Coxed Fours,
4125,Abdullah Ali,M,,Bahrain,BRN,1984 Summer,1984,Summer,Los Angeles,Shooting,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",


**How important missed 'age' data?**

In [16]:
# how many medals among the sportsmen with no Age:
summer_no_age['medal'].value_counts()


Silver    292
Bronze    287
Gold      138
Name: medal, dtype: int64

In [17]:
# number of unique years for SUMMER sportsmen with no Age
summer_no_age['year'].nunique(), summer_no_age['year'].unique()

(26,
 array([2008, 2000, 1996, 1992, 1988, 1984, 1980, 1976, 1972, 1968, 1964,
        1960, 1956, 1952, 1948, 1936, 1932, 1928, 1924, 1920, 1912, 1908,
        1906, 1904, 1900, 1896], dtype=int64))

**Resolution:**  
    Missed data could be essential for counting medals per age for particular sportsmen.  
    However medals per country could be calculated correctly!

### Winter Olympics data

In [18]:
winter = pd.read_csv('../olympics/Athletes_winter_games.csv')
winter.drop(columns='Unnamed: 0', inplace=True)
winter

Unnamed: 0,Name,Sex,Age,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,Christine Jacoba Aaftink,F,21.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
1,Christine Jacoba Aaftink,F,21.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",
2,Christine Jacoba Aaftink,F,25.0,Netherlands,NED,1992 Winter,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,
3,Christine Jacoba Aaftink,F,25.0,Netherlands,NED,1992 Winter,1992,Winter,Albertville,Speed Skating,"Speed Skating Women's 1,000 metres",
4,Christine Jacoba Aaftink,F,27.0,Netherlands,NED,1994 Winter,1994,Winter,Lillehammer,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...
48559,Andrzej ya,M,29.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
48560,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
48561,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
48562,Tomasz Ireneusz ya,M,30.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [19]:
# update column names to lower case
format_columns(winter)

In [20]:
# getting the 'complete row' duplicates - NO DUPS!
winter_dups = winter[winter.duplicated()]
winter_dups

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal


In [21]:
# checking data types  - ALL CORRECT!
winter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48564 entries, 0 to 48563
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    48564 non-null  object 
 1   sex     48564 non-null  object 
 2   age     48279 non-null  float64
 3   team    48564 non-null  object 
 4   noc     48564 non-null  object 
 5   games   48564 non-null  object 
 6   year    48564 non-null  int64  
 7   season  48564 non-null  object 
 8   city    48564 non-null  object 
 9   sport   48564 non-null  object 
 10  event   48564 non-null  object 
 11  medal   5695 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 4.4+ MB


In [22]:
# adding birth_year
# winter['birth_year'] = winter['year'] - winter['age']. astype('Int64')

In [23]:
# checking for nulls
winter.isnull().sum()

name          0
sex           0
age         285
team          0
noc           0
games         0
year          0
season        0
city          0
sport         0
event         0
medal     42869
dtype: int64

In [24]:
# checking if null 'Age' values could be fixed
winter_no_age = winter[winter['age'].isnull()].sort_values(by='year', ascending=False) #, ignore_index=False)
winter_no_age

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
25608,"Christopher Paul ""Chris"" Lori",M,,Canada-2,CAN,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Two,
25609,"Christopher Paul ""Chris"" Lori",M,,Canada-2,CAN,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,
25606,"Christopher Paul ""Chris"" Lori",M,,Canada-2,CAN,1994 Winter,1994,Winter,Lillehammer,Bobsleigh,Bobsleigh Men's Two,
25607,"Christopher Paul ""Chris"" Lori",M,,Canada-2,CAN,1994 Winter,1994,Winter,Lillehammer,Bobsleigh,Bobsleigh Men's Four,
25605,"Christopher Paul ""Chris"" Lori",M,,Canada-1,CAN,1992 Winter,1992,Winter,Albertville,Bobsleigh,Bobsleigh Men's Four,
...,...,...,...,...,...,...,...,...,...,...,...,...
16176,"Antoine ""Antonio"" Herin",M,,Italy,ITA,1924 Winter,1924,Winter,Chamonix,Cross Country Skiing,Cross Country Skiing Men's 18 kilometres,
16164,Georgette Herbos,F,,Belgium,BEL,1924 Winter,1924,Winter,Chamonix,Figure Skating,Figure Skating Mixed Pairs,
16163,Paolo Herbert,M,,Italy-1,ITA,1924 Winter,1924,Winter,Chamonix,Bobsleigh,Bobsleigh Men's Four/Five,
14513,Anton Guldener,M,,Switzerland-2,SUI,1924 Winter,1924,Winter,Chamonix,Bobsleigh,Bobsleigh Men's Four/Five,


In [25]:
# unique 'names' with missed 'age' data
#winter_no_age['name'].nunique() # 194
names_list = winter_no_age['name'].unique()

winter[~(winter['age'].isnull()) & (winter['name'].isin(names_list))]  # only 1 name = 'Giuseppe Steiner' (born in 1930)

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
40867,Giuseppe Steiner,M,30.0,Italy,ITA,1960 Winter,1960,Winter,Squaw Valley,Cross Country Skiing,Cross Country Skiing Men's 15 kilometres,
40868,Giuseppe Steiner,M,30.0,Italy,ITA,1960 Winter,1960,Winter,Squaw Valley,Cross Country Skiing,Cross Country Skiing Men's 30 kilometres,
40869,Giuseppe Steiner,M,30.0,Italy,ITA,1960 Winter,1960,Winter,Squaw Valley,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,
40870,Giuseppe Steiner,M,34.0,Italy,ITA,1964 Winter,1964,Winter,Innsbruck,Cross Country Skiing,Cross Country Skiing Men's 15 kilometres,
40871,Giuseppe Steiner,M,34.0,Italy,ITA,1964 Winter,1964,Winter,Innsbruck,Cross Country Skiing,Cross Country Skiing Men's 30 kilometres,
40872,Giuseppe Steiner,M,34.0,Italy,ITA,1964 Winter,1964,Winter,Innsbruck,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,


In [26]:
# NO missed Age could be fixed because there is only 1 sportsmen with different participation year and he is too old
# it's a different Giuseppe Steiner!
winter_no_age[winter_no_age['name']=='Giuseppe Steiner']

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
40873,Giuseppe Steiner,M,,Italy-1,ITA,1924 Winter,1924,Winter,Chamonix,Bobsleigh,Bobsleigh Men's Four/Five,


### Regions data (a part of Olympics data set)

In [27]:
regions = pd.read_csv('../olympics/regions.csv')
regions

Unnamed: 0.1,Unnamed: 0,NOC,region,notes
0,0,EOR,Refugee,
1,1,LBN,Lebanon,
2,2,SGP,Singapore,
3,3,ROC,Russia,
4,4,AFG,Afghanistan,
...,...,...,...,...
229,229,YEM,Yemen,
230,230,YMD,Yemen,South Yemen
231,231,YUG,Serbia,Yugoslavia
232,232,ZAM,Zambia,


In [28]:
# regions.columns
#Index(['Unnamed: 0', 'NOC', 'region', 'notes'], dtype='object')

regions.drop(columns='Unnamed: 0', inplace=True)

# update column names to lower case
format_columns(regions)

regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   noc     234 non-null    object
 1   region  231 non-null    object
 2   notes   21 non-null     object
dtypes: object(3)
memory usage: 5.6+ KB


In [29]:
# getting the 'complete row' duplicates      - NO DUPS!
regions_dups = regions[regions.duplicated()]
regions_dups

Unnamed: 0,noc,region,notes


### Getting the single dataset (winter and summer)

In [30]:
# joining summer and winter data
olympics = pd.concat([summer, winter], axis=0, ignore_index=True) #concatenating df1 and df2 along rows; reset_index
olympics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284851 entries, 0 to 284850
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   name    284851 non-null  object 
 1   sex     284851 non-null  object 
 2   age     275536 non-null  float64
 3   team    284851 non-null  object 
 4   noc     284851 non-null  object 
 5   games   284851 non-null  object 
 6   year    284851 non-null  int64  
 7   season  284851 non-null  object 
 8   city    284851 non-null  object 
 9   sport   284851 non-null  object 
 10  event   284851 non-null  object 
 11  medal   42220 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 26.1+ MB


In [31]:
olympics

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal
0,A Dijiang,M,24.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,A Lamusi,M,23.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,Gunnar Nielsen Aaby,M,24.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...
284846,Andrzej ya,M,29.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
284847,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
284848,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
284849,Tomasz Ireneusz ya,M,30.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [32]:
winter.shape, summer.shape, olympics.shape

((48564, 12), (236287, 12), (284851, 12))

### Adding 'region' (Country Name) to olympics data from regions

In [33]:
# join to initial dataframe to get birth_year
olympics = olympics.merge(regions[['noc', 'region']], how='left', on=['noc'], copy=True)
olympics.rename(columns={"region": "country"}, inplace=True)
olympics

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal,country
0,A Dijiang,M,24.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China
1,A Lamusi,M,23.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China
2,Gunnar Nielsen Aaby,M,24.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Denmark
3,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,Denmark
4,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,,Netherlands
...,...,...,...,...,...,...,...,...,...,...,...,...,...
284846,Andrzej ya,M,29.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,,Poland
284847,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",,Poland
284848,Piotr ya,M,27.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",,Poland
284849,Tomasz Ireneusz ya,M,30.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,,Poland


In [34]:
# save olympics data as csv
olympics.to_csv(path_or_buf='../olympics/olympics_upd.csv', index=False,
                sep=',', na_rep='', header=True, mode='w', encoding='utf-8', decimal='.')

### Countries data (external source)

In [35]:
# https://cloford.com/resources/codes/index.htm

In [36]:
countries_main = pd.read_csv('../olympics/countries.csv')
countries_main

Unnamed: 0,Continent,Region,Country,Capital,FIPS,ISO (2),ISO (3),ISO (No),Internet,Note
0,Asia,South Asia,Afghanistan,Kabul,AF,AF,AFG,4.0,AF,
1,Europe,South East Europe,Albania,Tirana,AL,AL,ALB,8.0,AL,
2,Africa,Northern Africa,Algeria,Algiers,AG,DZ,DZA,12.0,DZ,
3,Oceania,Pacific,American Samoa,Pago Pago,AQ,AS,ASM,16.0,AS,
4,Europe,South West Europe,Andorra,Andorra la Vella,AN,AD,AND,20.0,AD,
...,...,...,...,...,...,...,...,...,...,...
227,Oceania,Pacific,Western Samoa,Apia,WS,WS,WSM,882.0,WS,
228,Asia,South West Asia,Yemen,Sanaa,YM,YE,YEM,887.0,YE,
229,Africa,Central Africa,Zaire (Dem Rep of Congo),Kinshasa,CG,ZR,ZAR,180.0,ZR,8.0
230,Africa,Southern Africa,Zambia,Lusaka,ZA,ZM,ZWB,894.0,ZM,


In [37]:
# updating column names to lower case (and replacing parentheses with underscore)
format_columns(countries_main)

In [38]:
countries_add = pd.read_csv('../olympics/countries_additional.csv')
countries_add

Unnamed: 0,Continent,Region,Country,Capital,FIPS,ISO (2),ISO (3),ISO (No),Internet,Note
0,Asia,East Asia,Hong Kong (China),Victoria,HK,HK,HKG,344.0,HK,
1,Asia,East Asia,Macau (China),Macau,MC,MO,MAC,446.0,MO,
2,Antarctica,Antarctica,Antarctica,,AY,AQ,ATA,10.0,AQ,9.0
3,Atlantic Ocean,South Atlantic Ocean,Bouvet Island,,BV,BV,BVT,74.0,BV,
4,Asia,South Asia,British Indian Ocean Territory,,IO,IO,IOT,86.0,IO,
5,Asia,South East Asia,East Timor,,--,TP,TMP,626.0,TP,10.0
6,Europe,Western Europe,"France, Metropolitan",,--,FX,FXX,249.0,FX,11.0
7,Indian Ocean,Southern Indian Ocean,French Southern and Antarctic Lands,,FS,TF,ATF,260.0,--,12.0
8,Indian Ocean,Southern Indian Ocean,Heard Island and McDonald Islands,,HM,HM,HMD,334.0,HM,
9,Europe,South East Europe,Montenegro,,MW,--,--,,--,5.0


In [39]:
# updating column names to lower case (and replacing parentheses with underscore)
format_columns(countries_add)

In [40]:
# checking if contries sets are overlaping by iso3 codes
countries_add[countries_add['iso_3'].isin(countries_main['iso_3'])]

Unnamed: 0,continent,region,country,capital,fips,iso_2,iso_3,iso_no,internet,note
9,Europe,South East Europe,Montenegro,,MW,--,--,,--,5.0


In [41]:
# checking if countries sets are overlaping by iso3 codes (backwards)
countries_main[countries_main['iso_3'].isin(countries_add['iso_3'])]

Unnamed: 0,continent,region,country,capital,fips,iso_2,iso_3,iso_no,internet,note
84,Europe,Western Europe,Guernsey,Saint Peter Port,--,--,--,,--,
101,Europe,Northern Europe,Jan Mayen,-,--,--,--,,--,
103,Europe,Western Europe,Jersey,Saint Helier,--,--,--,,--,
128,Europe,Western Europe,"Man, Isle of",Douglas,--,--,--,,--,
160,Asia,South West Asia,Palestine,,--,--,--,,--,
183,Europe,South East Europe,Serbia and Montenegro,Belgrade / Podgorica,SR,--,--,,--,5.0


In [42]:
# concatenating COUNTRIES data
countries = pd.concat([countries_main, countries_add], axis=0, ignore_index=True) #concatenating df1 and df2 along rows
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   continent  246 non-null    object 
 1   region     246 non-null    object 
 2   country    246 non-null    object 
 3   capital    233 non-null    object 
 4   fips       245 non-null    object 
 5   iso_2      245 non-null    object 
 6   iso_3      246 non-null    object 
 7   iso_no     239 non-null    float64
 8   internet   245 non-null    object 
 9   note       15 non-null     float64
dtypes: float64(2), object(8)
memory usage: 19.3+ KB


In [43]:
# replacing '--' values with NaN values
for col in countries.columns:
    countries.loc[countries[col] == '--', col] = np.nan

In [44]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   continent  246 non-null    object 
 1   region     246 non-null    object 
 2   country    246 non-null    object 
 3   capital    233 non-null    object 
 4   fips       236 non-null    object 
 5   iso_2      238 non-null    object 
 6   iso_3      239 non-null    object 
 7   iso_no     239 non-null    float64
 8   internet   237 non-null    object 
 9   note       15 non-null     float64
dtypes: float64(2), object(8)
memory usage: 19.3+ KB


In [45]:
# save countries cleaned data
countries.to_csv(path_or_buf='../olympics/countries_upd.csv', index=False,
                sep=',', na_rep='', header=True, mode='w', encoding='utf-8', decimal='.')

In [46]:
# tmp
# comparing list of iso-3 countries with 'noc' from 'olympics'
countries['country'].nunique() , countries['iso_3'].nunique(),  olympics['noc'].nunique()
# (246, 239, 233)

(246, 239, 233)

### Getting 'country' name source field (for farther manipulation)

In [47]:
# Example of National Olympic Committee code which differs from iso3 country code:
olympics[olympics['noc']=='URS'] # Soviet Union

Unnamed: 0,name,sex,age,team,noc,games,year,season,city,sport,event,medal,country
130,Rost'om Omeris dze Abashidze,M,29.0,Soviet Union,URS,1964 Summer,1964,Summer,Tokyo,Wrestling,"Wrestling Men's Light-Heavyweight, Greco-Roman",,Russia
131,Nunu Dzhansuhivna Abashydze (-Myslaieva),F,25.0,Soviet Union,URS,1980 Summer,1980,Summer,Moskva,Athletics,Athletics Women's Shot Put,,Russia
481,Zagalav Abdulbekovich Abdulbekov,M,26.0,Soviet Union,URS,1972 Summer,1972,Summer,Munich,Wrestling,"Wrestling Men's Featherweight, Freestyle",Gold,Russia
553,Dzhamolidin Mirgarifanovich Abduzhaparov,M,24.0,Soviet Union,URS,1988 Summer,1988,Summer,Seoul,Cycling,"Cycling Men's Road Race, Individual",,Russia
706,Viktor Andreyevich Aboimov,M,22.0,Soviet Union,URS,1972 Summer,1972,Summer,Munich,Swimming,Swimming Men's 400 metres Freestyle,,Russia
...,...,...,...,...,...,...,...,...,...,...,...,...,...
284519,Nikolay Semyonovich Zimyatov,M,28.0,Soviet Union,URS,1984 Winter,1984,Winter,Sarajevo,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,Silver,Russia
284522,Viktor Aleksandrovich Zinger,M,26.0,Soviet Union,URS,1968 Winter,1968,Winter,Grenoble,Ice Hockey,Ice Hockey Men's Ice Hockey,Gold,Russia
284659,Vera Zozua (-Pyatnizin),F,20.0,Soviet Union,URS,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Women's Singles,,Russia
284660,Vera Zozua (-Pyatnizin),F,24.0,Soviet Union,URS,1980 Winter,1980,Winter,Lake Placid,Luge,Luge Women's Singles,Gold,Russia


In [48]:
# getting 'regions' data for particular National Olympic Committee code (which differs from iso3 country code) 
regions[regions['noc']=='URS']

Unnamed: 0,noc,region,notes
218,URS,Russia,


In [49]:
# checking if 'region' could be used as 'country_name' - OK!
regions[regions['region']=='Russia']

Unnamed: 0,noc,region,notes
3,ROC,Russia,
70,EUN,Russia,
175,RUS,Russia,
218,URS,Russia,


In [50]:
# checking if 'region' could vary for a particular country -> NO, all correct!
regions[regions['region'].str.contains('USA', case=False, na=False)] # analog for ' where col_val like "%USA%" '

Unnamed: 0,noc,region,notes
220,USA,USA,


In [51]:
# checking the completeness of 'regions' data (supposed to be OK)
regions.isnull().sum()

noc         0
region      3
notes     213
dtype: int64

In [54]:
# cheking example of 'team' values for particular National Olympic Committee
olympics[olympics['noc']=='ARG']['team'].unique()

# RESULTS: 'team' could not be used as 'country' name !!

array(['Argentina', 'Matrero II', 'Blue Red', 'Argentina-2', 'Pampero',
       'Wiking', 'Cupidon III', 'Arcturus', 'Covunco III', 'Tango',
       'Argentina-1', 'Acturus', 'Antares', 'Mizar', 'Gullvinge', 'Djinn',
       'Rampage', 'Ardilla'], dtype=object)

In [55]:
# Checking if a contry name (regions.region) unique: NO!
regions.shape, regions['region'].nunique()

((234, 3), 207)

### Joining 'regions' and 'countries' data to get 'iso3' codes for country names (the way they used in Olympics data)

In [56]:
# joining 'regions' and 'countries'
iso3_for_country = pd.merge(left=regions, right=countries, how='left', left_on='region', right_on='country', suffixes=('_x', '_y'))
# 234 rows in iso3_for_country

missed_iso3 = iso3_for_country[iso3_for_country['country'].isnull()]
missed_iso3 =  missed_iso3[['noc', 'region_x', 'notes']] # 32 rows (with missed iso3)

iso3_for_country.shape, missed_iso3.shape, missed_iso3['region_x'].nunique()

((234, 13), (32, 3), 26)

In [57]:
# list of successfully merged countries
mapped_iso3 = iso3_for_country[~(iso3_for_country['country'].isnull())]
#mapped_iso3
mapped_iso3.shape, mapped_iso3['region_x'].nunique()

((202, 13), 181)

In [58]:
missed_iso3[missed_iso3['region_x'].isin(mapped_iso3['region_x'].unique())]

Unnamed: 0,noc,region_x,notes


In [59]:
# missed iso3 mapping list
missed_iso3_countries = missed_iso3['region_x'].unique()
missed_iso3_countries

array(['Refugee', 'Curacao', 'Antigua', 'Bahamas', 'Boliva',
       'Republic of Congo', 'Ivory Coast',
       'Democratic Republic of the Congo', 'Micronesia', 'Gambia', 'UK',
       'Individual Olympic Athletes', 'Virgin Islands, US',
       'Virgin Islands, British', 'South Korea', 'Kosovo', 'Myanmar',
       'North Korea', nan, 'Samoa', 'Serbia', 'Saint Kitts',
       'South Sudan', 'Timor-Leste', 'Trinidad', 'USA', 'Saint Vincent'],
      dtype=object)

In [60]:
missed_iso3.reset_index(drop=True, inplace=True)
missed_iso3

Unnamed: 0,noc,region_x,notes
0,EOR,Refugee,
1,AHO,Curacao,Netherlands Antilles
2,ANT,Antigua,Antigua and Barbuda
3,BAH,Bahamas,
4,BOL,Boliva,
5,CGO,Republic of Congo,
6,CIV,Ivory Coast,
7,COD,Democratic Republic of the Congo,
8,FSM,Micronesia,
9,GAM,Gambia,


In [61]:
# examples of missed mapping

# Python syntaxis:
# Series.str.contains(pat, case=True, flags=0, na=None, regex=True)

country = 'USA' # missed_iso3['region_x']
noc = regions[regions['region']==country]['noc'].iloc[0]
print(noc)
iso3_by_region = (countries[countries['country'].str.contains(country, regex = False)]['iso_3'].to_list()[:1] or [None])[0]
print(iso3_by_region) # None

iso3_by_noc = (countries[countries['iso_3']== noc]['iso_3'].to_list()[:1] or [None])[0]
print(iso3_by_noc)

USA
None
USA


In [62]:
# update missed_iso3 DataFrame:
# - add 'iso3_by_region' column
# - add 'iso3_by_noc' column
# - add 'iso3_by_notes' column
# - add 'iso3' as 'iso3_by_region' or 'iso3_by_notes' or 'iso3_by_noc' (TO CHECK: could be incorrect!)

missed_iso3['iso3_by_region'] = None
missed_iso3['iso3_by_notes'] = None
missed_iso3['iso3_by_noc'] = None
missed_iso3['iso3'] = None

# update mised 'region_x' values ('NaN') with 'notes'
missed_iso3['region_x'] = np.where(missed_iso3['region_x'].isna(), missed_iso3['notes'], missed_iso3['region_x'])

# tie to 'countries' data and get 'iso3' values using country name or 'noc'
for i in range(len(missed_iso3)):
    country = missed_iso3.at[i, 'region_x']
    noc = missed_iso3.at[i, 'noc']
    
    notes = missed_iso3.at[i, 'notes'] 
    if str(notes) == 'nan': # replace 'NaN' values with some text
        notes = 'Not defined value'+str(i)
    
    missed_iso3.at[i, 'iso3_by_region'] = (countries[countries['country'].str.contains(country, regex = False)]['iso_3'].to_list()[:1] or [None])[0]
    missed_iso3.at[i, 'iso3_by_notes'] = (countries[countries['country'].str.contains(notes, regex = False)]['iso_3'].to_list()[:1] or [None])[0]
    missed_iso3.at[i, 'iso3_by_noc'] = (countries[countries['iso_3']== noc]['iso_3'].to_list()[:1] or [None])[0]

# updating 'iso3' column in DataFrame using numpy: where(condition, value_if_True, value_if_False)
missed_iso3['iso3'] = np.where(missed_iso3['iso3_by_region'].isna(), missed_iso3['iso3_by_notes'], missed_iso3['iso3_by_region'])
missed_iso3['iso3'] = np.where(missed_iso3['iso3'].isna(), missed_iso3['iso3_by_noc'], missed_iso3['iso3'])

missed_iso3[['region_x', 'iso3']]


Unnamed: 0,region_x,iso3
0,Refugee,
1,Curacao,ANT
2,Antigua,ATG
3,Bahamas,BHS
4,Boliva,BOL
5,Republic of Congo,
6,Ivory Coast,CIV
7,Democratic Republic of the Congo,
8,Micronesia,FSM
9,Gambia,GMB


In [63]:
# checking the computed results
missed_iso3[~(missed_iso3['iso3'].isnull())][['region_x', 'iso3']]

# All correct except for 'Serbia': 'YUG' should be updated to 'SRB' !!!

Unnamed: 0,region_x,iso3
1,Curacao,ANT
2,Antigua,ATG
3,Bahamas,BHS
4,Boliva,BOL
6,Ivory Coast,CIV
8,Micronesia,FSM
9,Gambia,GMB
10,UK,GBR
12,"Virgin Islands, US",VGB
14,South Korea,KOR


In [64]:
# country values without 'iso3'
# checking the computed results
missed_iso3[missed_iso3['iso3'].isnull()][['region_x', 'iso3']]

Unnamed: 0,region_x,iso3
0,Refugee,
5,Republic of Congo,
7,Democratic Republic of the Congo,
11,Individual Olympic Athletes,
13,"Virgin Islands, British",
15,Kosovo,
18,Refugee Olympic Team,
20,Serbia,
22,Serbia,
23,South Sudan,


In [65]:
# iso3 to update (see previous list)

dict_iso3_to_update = {'iso3' : ['COG', 'COD', 'VGB', 'SRB', 'SSD', 'TLS'],
        'region_x' : ['Republic of Congo', 'Democratic Republic of the Congo', 
                      'Virgin Islands, British', 'Serbia', 'South Sudan', 'Timor-Leste']}

iso3_to_update = pd.DataFrame(dict_iso3_to_update)
iso3_to_update

Unnamed: 0,iso3,region_x
0,COG,Republic of Congo
1,COD,Democratic Republic of the Congo
2,VGB,"Virgin Islands, British"
3,SRB,Serbia
4,SSD,South Sudan
5,TLS,Timor-Leste


**Final List for (country_name,iso3) pairs to use:**

In [66]:
# getting correctly mapped (country_name, iso3) pairs after joining countries data (external) with regions (olympics data)
mapped_iso3_pairs = mapped_iso3[['region_x', 'iso_3']]

# rename a 'iso_3' column
mapped_iso3_pairs.rename(columns={'iso_3': 'iso3'}, inplace=True)

mapped_iso3_pairs['iso3'].nunique() # 202 records, 181 unique country_names, 179 unique ISO3 codes...

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_iso3_pairs.rename(columns={'iso_3': 'iso3'}, inplace=True)


179

In [67]:
# displaying (country_name, iso3) duplicates
iso3_dups = mapped_iso3_pairs[mapped_iso3_pairs.duplicated()]
iso3_dups

Unnamed: 0,region_x,iso3
16,Australia,AUS
58,Czech Republic,CZE
70,Russia,RUS
80,Germany,DEU
83,Germany,DEU
85,Greece,GRC
92,China,CHN
123,Lebanon,LBN
130,Malaysia,MYS
147,Malaysia,MYS


In [68]:
# removing duplicated (country_name,iso3) pairs
mapped_iso3_pairs.drop_duplicates(inplace=True)
mapped_iso3_pairs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181 entries, 1 to 232
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   region_x  181 non-null    object
 1   iso3      179 non-null    object
dtypes: object(2)
memory usage: 4.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_iso3_pairs.drop_duplicates(inplace=True)


In [69]:
# adding 2nd part (calculated iso3 from 'region' part or from NOC as iso3)
mapped_iso3_calc = missed_iso3[~(missed_iso3['iso3'].isnull())][['region_x', 'iso3']]

# checking for dups: YES
# mapped_iso3_calc.shape, mapped_iso3_calc['region_x'].nunique(), mapped_iso3_calc['iso3'].nunique()
# RETURNED: ((20, 2), 19, 19)

# removing dups
mapped_iso3_calc.drop_duplicates(inplace=True)

# concatenating 1st and 2nd part (calculated) and 3rd part (6 googled codes)
iso3_pairs = pd.concat([mapped_iso3_pairs, mapped_iso3_calc, iso3_to_update], axis='index', ignore_index=True)

# checking for duplicates in contatenated list: NO, all good
# iso3_pairs.shape, iso3_pairs['region_x'].nunique(), iso3_pairs['iso3'].nunique()
# RETURNED: ((206, 2), 205, 202) # 2 entries for 'Serbia'

# fix 'Serbia' with correct iso3 = 'SRB'
iso3_pairs.loc[iso3_pairs['region_x'] == 'Serbia', 'iso3'] = 'SRB'

# remove dups from the final list
iso3_pairs.drop_duplicates(inplace=True)

# rename 'region_x' column
iso3_pairs.rename(columns={'region_x': 'country'}, inplace=True)
iso3_pairs # 205 rows

Unnamed: 0,country,iso3
0,Lebanon,LBN
1,Singapore,SGP
2,Russia,RUS
3,Afghanistan,AFG
4,Albania,ALB
...,...,...
200,Republic of Congo,COG
201,Democratic Republic of the Congo,COD
202,"Virgin Islands, British",VGB
204,South Sudan,SSD


In [70]:
# save countries with iso3 codes as csv-file
iso3_pairs.to_csv(path_or_buf='../olympics/iso3_pairs.csv', index=False,
                sep=',', na_rep='', header=True, mode='w', encoding='utf-8', decimal='.')