# Two Centuries of Ultra Marathon

**In this project, I will conduct an `Exploratory Data Analysis (EDA)` on the dataset <a href="https://www.kaggle.com/search?q=ultra+marathon+in%3Adatasets">The big dataset of ultra-marathon running</a>, which is available on Kaggle. At around 1 gigabyte in CSV format, the dataset is not exceptionally large, but it is still sizeable. My aim is to perform data manipulation and EDA on this dataset using the `Pandas` library.**

In [52]:
import seaborn as sns
import pandas as pd
import numpy as np

In [86]:
# Load the raw race results into a DataFrame.
# The average-speed column is forced to `object` (string) dtype.
# NOTE(review): presumably because its values are not uniformly formatted
# (the preview shows both 10.286 and 14268.0) — confirm.
data = pd.read_csv(
    "TWO_CENTURIES_OF_UM_RACES.csv",
    dtype={'Athlete average speed': 'object'},
)
data

Unnamed: 0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7461190,1995,00.00.1995,La SainteLyon 65 km (FRA),65km,2,4:33:20 h,,FRA,,M,,14268.0,1151060
7461191,1995,00.00.1995,La SainteLyon 65 km (FRA),65km,2,6:05:15 h,,FRA,,F,,10678.0,1073520
7461192,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,241.000 km,*Budapest,HUN,1950.0,M,M40,10042.0,1047373
7461193,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,228.000 km,*Szeged,HUN,1959.0,M,M35,9500.0,380150


In [87]:
# Peek at the first three rows to see the original column names and value formats.
data.head(3)

Unnamed: 0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2


# Rename columns 
To make the columns easier to work with, I will rename them all to lowercase and replace the spaces with underscores.

In [88]:
# Derive the new names from the existing ones (lowercase, spaces -> underscores)
# instead of hand-typing a positional list: a hard-coded list silently
# mislabels columns if the CSV's column order ever changes, and raises if the
# column count differs. "/" is left untouched, so "Event distance/length"
# becomes "event_distance/length". Re-running this cell is a no-op.
data.columns = data.columns.str.lower().str.replace(" ", "_", regex=False)

In [56]:
# Confirm that the renamed columns are in place.
# NOTE(review): the saved output below shows five rows although head(2) is
# requested — the output is stale; re-run this cell.
data.head(2)

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4


In [89]:
# Look at the last three rows; in the saved output these are 24h events whose
# athlete_performance is a distance in km rather than a finishing time.
data.tail(3)

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age_category,athlete_average_speed,athlete_id
7461192,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,241.000 km,*Budapest,HUN,1950.0,M,M40,10042.0,1047373
7461193,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,228.000 km,*Szeged,HUN,1959.0,M,M35,9500.0,380150
7461194,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,224.000 km,*Pecs,HUN,1958.0,M,M35,9333.0,1070482


# Handling missing values

In [90]:
# Count the missing (NaN) values in each column.
data.isna().sum()

year_of_event                      0
event_dates                        0
event_name                         0
event_distance/length           1053
event_number_of_finishers          0
athlete_performance                2
athlete_club                 2826524
athlete_country                    3
athlete_year_of_birth         588161
athlete_gender                     7
athlete_age_category          584938
athlete_average_speed            224
athlete_id                         0
dtype: int64

In [92]:
# Clean the frame in a single chain so that no step mutates `data` in place:
# no `inplace=True`, and no column assignment on a `dropna` result (which can
# raise SettingWithCopyWarning).
data = (
    data
    # The age category (e.g. "M35") is reduced to its number below, so the
    # column is renamed to match what it will hold.
    .rename(columns={"athlete_age_category": "athlete_age"})
    # Drop rows that lack an age category, a distance or an average speed.
    .dropna(subset=["athlete_age", "event_distance/length", "athlete_average_speed"])
    .assign(
        # A missing club is not an error; mark it with a placeholder instead.
        athlete_club=lambda df: df["athlete_club"].fillna('-'),
        # Keep only the first run of digits of the category ("M35" -> 35).
        # astype(str) lets the cell be re-run after the column is already int;
        # expand=False returns a Series instead of a one-column DataFrame.
        # astype(int) raises if a category contains no digits at all.
        athlete_age=lambda df: (
            df["athlete_age"]
            .astype(str)
            .str.extract('(\\d+)', expand=False)
            .astype(int)
        ),
    )
)

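Some rows lack the year of birth but contain the athlete's age and the year of the event. By subtracting the age from the year of the event, we can estimate the birth year for those rows. Note that the age comes from the age category (e.g. `M35`), which is the lower bound of an age bracket, so the result is only an approximation: the first athlete was born in 1978, but 2018 - 35 gives 1983. For that reason, only the missing birth years should be filled in this way.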

In [106]:
def calculate_year_of_birth(row):
    """Return the athlete's year of birth, estimating it when it is missing.

    Args:
        row: A DataFrame row (or any object) exposing ``athlete_year_of_birth``,
            ``year_of_event`` and ``athlete_age`` as attributes.

    Returns:
        The recorded year of birth when present; otherwise
        ``year_of_event - athlete_age``.
    """
    # Missing values read from the CSV are NaN, and ``NaN != None`` is True, so
    # a comparison against None never detects them and the fallback would never
    # run. ``pd.isna`` covers NaN, None and pd.NA alike.
    if pd.isna(row.athlete_year_of_birth):
        return row.year_of_event - row.athlete_age
    return row.athlete_year_of_birth


# Run the helper on every row (axis=1) and overwrite the birth-year column
# with its return values.
# NOTE(review): the saved output below does not match what this code computes:
# rows whose birth year was known (e.g. 1978 for row 0) show year_of_event
# minus athlete_age instead (1983). Re-run from a fresh kernel.
data["athlete_year_of_birth"] = data.apply(calculate_year_of_birth, axis=1)
data

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age,athlete_average_speed,athlete_id
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1983,M,35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1983,M,35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1995,M,23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1978,M,40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1995,M,23,8.469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7461188,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,232.810 km,-,FRA,1960,M,35,9700.0,1069476
7461189,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,221.374 km,-,BEL,1955,M,40,9224.0,1045647
7461192,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,241.000 km,*Budapest,HUN,1955,M,40,10042.0,1047373
7461193,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,228.000 km,*Szeged,HUN,1960,M,35,9500.0,380150


In [110]:
# Disabled experiment: split the performance string on a space or an "h"/"H"
# unit marker.
# NOTE(review): the next cell's saved output shows athlete_performance already
# split into lists (e.g. [4:51:39, , ]), so this line appears to have been run
# before it was commented out; on a fresh kernel the column stays a string.
# data["athlete_performance"] = data["athlete_performance"].str.split(" |h| h| H|H")

In [111]:
# Display the frame again to inspect the result of the steps so far.
data

Unnamed: 0,year_of_event,event_dates,event_name,event_distance/length,event_number_of_finishers,athlete_performance,athlete_club,athlete_country,athlete_year_of_birth,athlete_gender,athlete_age,athlete_average_speed,athlete_id
0,2018,06.01.2018,Selva Costera (CHI),50km,22,"[4:51:39, , ]",Tnfrc,CHI,1983,M,35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,"[5:15:45, , ]",Roberto Echeverría,CHI,1983,M,35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,"[5:16:44, , ]",Puro Trail Osorno,CHI,1995,M,23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,"[5:34:13, , ]",Columbia,ARG,1978,M,40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,"[5:54:14, , ]",Baguales Trail,CHI,1995,M,23,8.469,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7461188,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,"[232.810, km]",-,FRA,1960,M,35,9700.0,1069476
7461189,1995,00.00.1995,Les 24 heures de Fleurbaix (FRA),24h,2,"[221.374, km]",-,BEL,1955,M,40,9224.0,1045647
7461192,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,"[241.000, km]",*Budapest,HUN,1955,M,40,10042.0,1047373
7461193,1995,00.00.1995,Szombathely 24 hours running Race (HUN),24h,3,"[228.000, km]",*Szeged,HUN,1960,M,35,9500.0,380150


In [104]:
# Check the dtype of the age column after the digit extraction and int cast.
data.athlete_age.dtype

dtype('int32')