In [27]:
import pandas as pd
import plotly.express as px
import hashlib

olympic_data = pd.read_csv("Data/athlete_events.csv")
olympic_data.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


#### Anonymisering av namn

In [28]:
# Get data for finland only
olympic_data_finland = olympic_data.loc[olympic_data['NOC'] == 'FIN']

# Use labmda function to iterate over and hash all names using sha256
olympic_data_finland['Name'] = olympic_data_finland['Name'].apply(lambda x: hashlib.sha256(x.encode()).hexdigest())
olympic_data_finland = olympic_data_finland.rename(columns={"Name": "Name (hashed)"})
olympic_data_finland.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ID,Name (hashed),Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
28,9,a11b41b7e02d7d5ffa770946f8e9cc7f793eefdf62dd90...,M,26.0,186.0,96.0,Finland,FIN,2002 Winter,2002,Winter,Salt Lake City,Ice Hockey,Ice Hockey Men's Ice Hockey,
29,10,2d7e50fbb880ec79a325d5fd229bc71cecff9a9b6bc94a...,M,26.0,,,Finland,FIN,1952 Summer,1952,Summer,Helsinki,Swimming,Swimming Men's 400 metres Freestyle,
30,11,bb375e48ab8e8d2ac8546a7cf4285499d6e09818223736...,M,22.0,182.0,76.5,Finland,FIN,1980 Winter,1980,Winter,Lake Placid,Cross Country Skiing,Cross Country Skiing Men's 30 kilometres,
31,12,6d013d3b6d55f13c9c50711ca29dbe8bb7beac06861b39...,M,31.0,172.0,70.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Badminton,Badminton Men's Singles,
32,13,8ab98498141c522fe5ab0b199ad0dcb06158b3aff95b8d...,F,30.0,159.0,55.5,Finland,FIN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Women's Windsurfer,


In [29]:
# Get all rows where a player managed to score a medal
olympic_data_finland_with_medals = olympic_data_finland[olympic_data_finland['Medal'].notna()]

#### De sporter landet fått flest medaljer i

In [30]:
# Filter the rows into groups of sports, than sum the "Medals" column for each given sport
medals_per_finland_sport = olympic_data_finland_with_medals.groupby('Sport')['Medal'].count()

medals_per_finland_sport = medals_per_finland_sport.sort_values(ascending=False)

# Take the top 5 sports with most medals scored
medals_per_sport_top5_finland = medals_per_finland_sport[:5]

fig = px.bar(
    x=medals_per_sport_top5_finland.index,
    y=medals_per_sport_top5_finland,
    title="Medal distribution across the olympic sports in finland",
    labels={
        'x': '',
        'y': 'Medals'
    }
)

fig.show()

#### antal medaljer per OS

In [31]:
# Filter the rows into groups of games, than sum the "Medals" column for each given game
medals_per_os = olympic_data_finland_with_medals.groupby('Games')['Medal'].count()

medals_per_os.index = medals_per_os.index.sort_values()

fig = px.bar(
    x=medals_per_os.index,
    y=medals_per_os,
    title="Medal distribution across the olympic games",
    labels={
        'x': '',
        'y': 'Medals'
    }
)

fig.show()

#### histogram över åldrar

In [32]:
fig = px.histogram(
    data_frame=olympic_data_finland,
    x="Age",
    title="Age distribution across the OS contenders"
)

fig.show()

#### Height distrubution of players

In [33]:
# Remove duplicate players (by looking at name hash)
olympic_data_finland_unique_players = olympic_data_finland.drop_duplicates('Name (hashed)')

fig = px.histogram(
    data_frame=olympic_data_finland_unique_players,
    x="Height",
    title="height distribution"
)

fig.show()

#### Weight distrubution of players

In [34]:
# Get a series of the value counts of the ages
olympic_data_finland_weights = olympic_data_finland_unique_players['Weight'].value_counts()

olympic_data_finland_weights = olympic_data_finland_weights.sort_values(ascending=False)


fig = px.histogram(
    x=olympic_data_finland_weights.index,
    y=olympic_data_finland_weights,
    title="Weight distribution",
    
    labels={
        'x': 'Kg',
        'y': 'Number of people'
    }
)

fig.update_layout(yaxis_title="Number of people")
fig.show()

Another graph??