# Two Centuries of Ultra Marathon

**In this project, I will conduct an `Exploratory Data Analysis (EDA)` on the dataset <a href="https://www.kaggle.com/search?q=ultra+marathon+in%3Adatasets"> The big dataset of ultra-marathon running</a>,  which is available on Kaggle. Although my dataset is not exceptionally large (around 1 gigabyte in CSV format), it is still significant. My aim is to perform data manipulation and EDA on this large dataset using the `Dask` library.**

In [2]:
import dask.dataframe as dd
import seaborn as sns
import pandas as pd
import numpy as np

In [35]:
data = dd.read_csv("TWO_CENTURIES_OF_UM_RACES.csv", dtype={'Athlete average speed': 'object'})
data

Unnamed: 0_level_0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
npartitions=12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,int64,string,string,string,int64,string,string,string,float64,string,string,string,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...


In [36]:
data.head()

Unnamed: 0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4


# Rename columns 
To make it easier to continue, I will rename all column titles to lowercase and remove any spaces

In [37]:
data.columns = ["year_of_event", "event_dates", "event_name",
              "event_distance/length", "event_number_of_finishers",
               "athlete_performance",  "athlete_club", "athlete_country",
               "athlete_year_of_birth", "athlete_gender", "athlete_age_category",
               "athlete_average_speed", "athlete_id"]

In [38]:
data.head()

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4


In [39]:
data.tail()

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
630653,1995,00.00.1995,La SainteLyon 65 km (FRA),65km,2,4:33:20 h,,FRA,,M,,14268.0,1151060
630654,1995,00.00.1995,La SainteLyon 65 km (FRA),65km,2,6:05:15 h,,FRA,,F,,10678.0,1073520
630655,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,241.000 km,*Budapest,HUN,1950.0,M,M40,10042.0,1047373
630656,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,228.000 km,*Szeged,HUN,1959.0,M,M35,9500.0,380150
630657,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,224.000 km,*Pecs,HUN,1958.0,M,M35,9333.0,1070482


# Handling missing Values

In [61]:
data.isna().sum().compute()

year_of_event                   0
event_dates                     0
event_name                      0
event_distance/length           0
event_number_of_finishers       0
athlete_performance             0
athlete_club                    0
athlete_country                 0
athlete_year_of_birth        3420
athlete_gender                  0
athlete_age_category            0
athlete_average_speed           0
athlete_id                      0
dtype: int64

In [60]:
data['athlete_club'] = data['athlete_club'].fillna('-')
data = data.dropna(subset=['athlete_age_category']) 
data = data.dropna(subset=["event_distance/length"])
data = data.dropna(subset=["athlete_average_speed"])
data.compute()

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
630651,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,232.810 km,-,FRA,1958.0,M,M35,9700.0,1069476
630652,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,221.374 km,-,BEL,1951.0,M,M40,9224.0,1045647
630655,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,241.000 km,*Budapest,HUN,1950.0,M,M40,10042.0,1047373
630656,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,228.000 km,*Szeged,HUN,1959.0,M,M35,9500.0,380150


In [57]:
shape = data.shape
f"rows : {shape[0].compute()}, columns : {shape[1]}"

'rows : 6876257.0, columns : 13'

In [64]:
data[data.athlete_year_of_birth.isna() == True].head()

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
2621,2018,18.03.2018,6-Stunden-Lauf Herne (GER),6h,29,71.445 km,OTV Endurance Team,GER,,M,M50,11.908,1998
4661,2018,18.03.2018,Endurance Day Hamme 100 km (BEL),100km,6,11:35:54 h,-,XXX,,M,M35,8.622,4033
5897,2018,18.03.2018,Trail du Petit Ballon (FRA),52.6km,876,5:37:08 h,-,FRA,,M,M23,9.361,5265
7186,2018,17.-18.03.2018,Monasterio De Tarlac Ultra Marathon (PHI),50km,198,9:46:52 h,-,XXX,,M,M35,5.112,4033
8294,2018,17.-18.03.2018,EcoTrail de Paris - Ile de France 80 km (FRA),80km,1695,10:14:04 h,-,FRA,,M,M35,7.817,5265
