In [106]:
# import dependencies
import pandas as pd
from datetime import datetime as dt
import time

In [122]:
# Read the marathon result CSVs for 2015, 2016 and 2017, one DataFrame per year.
# "Resources/" is a relative path, so it is resolved against the notebook's
# current working directory.

file_path = "Resources/"
marathon_2015_df = pd.read_csv(f"{file_path}marathon_results_2015.csv")
marathon_2016_df = pd.read_csv(f"{file_path}marathon_results_2016.csv")
marathon_2017_df = pd.read_csv(f"{file_path}marathon_results_2017.csv")

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:16:07,1:32:00,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1
1,1,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,1:16:07,1:31:59,1:47:59,2:02:42,0:04:58,-,2:09:48,2,2,2
2,2,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,1:16:07,1:32:00,1:47:59,2:03:01,0:04:59,-,2:10:22,3,3,3
3,3,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,1:16:07,1:32:00,1:48:03,2:03:47,0:05:00,-,2:10:47,4,4,4
4,4,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,1:16:07,1:32:00,1:47:59,2:03:27,0:05:00,-,2:10:49,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26593,26593,25656,"Prescott, Francine J.",64,F,Boynton Beach,FL,USA,,MI,...,4:25:53,5:19:44,6:17:19,7:13:34,0:17:31,-,7:38:56,26594,12015,269
26594,26594,31359,"Emerson, Annette C.",61,F,Old Town,ME,USA,,,...,4:32:44,5:31:58,6:28:56,7:26:19,0:17:59,-,7:51:30,26595,12016,270
26595,26595,25559,"Cerveny, Nona",66,F,Scottsdale,AZ,USA,,,...,4:33:51,5:38:56,6:38:51,7:36:18,0:18:18,-,7:59:33,26596,12017,91
26596,26596,29396,"Buccilli, Alfred P.",53,M,Lynnfield,MA,USA,,,...,4:29:20,5:31:11,6:33:35,7:35:38,0:18:20,-,8:00:37,26597,14580,2055


In [108]:
# function to convert columns from string to datetime

def to_time(df_col, race_date='04/20/15'):
    """Convert a column of ``H:MM:SS`` strings into ``datetime`` objects.

    Every item is prefixed with ``race_date`` and parsed with the format
    ``'%m/%d/%y %H:%M:%S'``.

    Parameters
    ----------
    df_col : iterable
        Values to convert, e.g. a pandas Series of split-time strings.
    race_date : str, optional
        Calendar date, written as ``mm/dd/yy``, attached to every time.
        Defaults to ``'04/20/15'`` (the value that used to be hard-coded),
        so existing one-argument calls behave exactly as before.

    Returns
    -------
    list
        One entry per input item: a ``datetime`` when the item parses,
        otherwise the placeholder string ``"-"``.

    Raises
    ------
    ValueError
        If ``race_date`` itself is not a valid ``mm/dd/yy`` date.
    """
    # Validate the date once up front; otherwise a malformed race_date would
    # make every row fail to parse and silently come back as "-".
    dt.strptime(race_date, '%m/%d/%y')

    time_df_col = []

    for item in df_col:
        try:
            # combine the date with the hh:mm:ss string, then parse both together
            time_df_col.append(dt.strptime(f"{race_date} {item}", '%m/%d/%y %H:%M:%S'))
        except ValueError:
            # unparseable entries (such as "-") are kept as the "-" placeholder
            time_df_col.append("-")

    # return the converted column
    return time_df_col

In [109]:
def to_minutes(df_col):
    """Convert a column of ``H:MM:SS`` strings to minutes.

    Parameters
    ----------
    df_col : iterable
        Values to convert, e.g. a pandas Series of pace strings.

    Returns
    -------
    list
        One entry per input item: the duration in minutes as a float rounded
        to 2 decimal places, or the placeholder string ``"-"`` when the item
        cannot be parsed. This is the same placeholder ``to_time`` emits.
    """
    df_col_minutes = []

    for item in df_col:
        try:
            # extract the hour, minute, second and convert each to int
            h, m, s = (int(part) for part in item.split(":"))
        except (AttributeError, ValueError):
            # AttributeError: the item is not a string (e.g. a NaN float).
            # ValueError: wrong number of ":" fields or a non-integer field,
            # which is what a "-" entry produces.
            df_col_minutes.append("-")
            continue

        # calculate the minutes, round to 2 decimal places
        df_col_minutes.append(round(m + h * 60 + s / 60, 2))

    return df_col_minutes

In [113]:
def to_int(df_col):
    """Return a new list holding ``int(value)`` for each value in ``df_col``.

    The input is only iterated, never modified. Any exception raised by
    ``int()`` for a value propagates to the caller.
    """
    return [int(value) for value in df_col]

In [117]:
def clean_data(marathon_df):
    """Return a cleaned copy of one year's marathon results.

    Steps:
      1. Drop the columns that are not used.
      2. Convert the split columns and "Official Time" with ``to_time``.
      3. Convert "Pace" to minutes with ``to_minutes``.
      4. Remove every row that still holds the "-" placeholder in any of
         those converted columns.

    Parameters
    ----------
    marathon_df : pandas.DataFrame
        Raw results containing the columns named below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's DataFrame is not modified, because the
        initial ``drop`` already returns a new object.

    Raises
    ------
    KeyError
        If one of the columns to drop or convert is missing (pandas' default
        ``drop`` / column-lookup behaviour).
    """
    unused_columns = ["Unnamed: 0", "Citizen", "Unnamed: 9", "Half",
                      "Proj Time", "Overall", "Gender", "Division"]
    # columns holding an elapsed time as an h:mm:ss string
    time_columns = ["5K", "10K", "15K", "20K", "25K", "30K", "35K", "40K",
                    "Official Time"]

    # remove any unnecessary columns
    marathon_df = marathon_df.drop(labels=unused_columns, axis=1)

    # convert each elapsed-time column from string to datetime
    for column in time_columns:
        marathon_df[column] = to_time(marathon_df[column])

    # convert Pace from h:mm:ss to minutes, rounded to 2 decimal places.
    # NOTE(review): the sample rows (official time 2:09:17 at pace 0:04:56)
    # imply this is minutes per mile, not per kilometre - confirm.
    marathon_df["Pace"] = to_minutes(marathon_df["Pace"])

    # Remove rows where any relevant column holds the "-" placeholder.
    # One vectorised mask and a single drop replace the per-row loop that
    # rebuilt the whole frame on every dropped row.
    checked_columns = time_columns + ["Pace"]
    has_placeholder = marathon_df[checked_columns].eq("-").any(axis=1)

    return marathon_df.drop(index=marathon_df.index[has_placeholder])

Bib               object
Name              object
Age                int64
M/F               object
City              object
State             object
Country           object
5K                object
10K               object
15K               object
20K               object
25K               object
30K               object
35K               object
40K               object
Pace             float64
Official Time     object
dtype: object

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,5K,10K,15K,20K,25K,30K,35K,40K,Pace,Official Time
0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,2015-04-20 00:14:43,2015-04-20 00:29:43,2015-04-20 00:44:57,2015-04-20 01:00:29,2015-04-20 01:16:07,2015-04-20 01:32:00,2015-04-20 01:47:59,2015-04-20 02:02:39,4.93,2015-04-20 02:09:17
1,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,2015-04-20 00:14:43,2015-04-20 00:29:43,2015-04-20 00:44:58,2015-04-20 01:00:28,2015-04-20 01:16:07,2015-04-20 01:31:59,2015-04-20 01:47:59,2015-04-20 02:02:42,4.97,2015-04-20 02:09:48
2,8,"Chebet, Wilson",29,M,Marakwet,,KEN,2015-04-20 00:14:43,2015-04-20 00:29:43,2015-04-20 00:44:57,2015-04-20 01:00:29,2015-04-20 01:16:07,2015-04-20 01:32:00,2015-04-20 01:47:59,2015-04-20 02:03:01,4.98,2015-04-20 02:10:22
3,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,2015-04-20 00:14:43,2015-04-20 00:29:44,2015-04-20 00:45:01,2015-04-20 01:00:29,2015-04-20 01:16:07,2015-04-20 01:32:00,2015-04-20 01:48:03,2015-04-20 02:03:47,5.00,2015-04-20 02:10:47
4,10,"Korir, Wesley",32,M,Kitale,,KEN,2015-04-20 00:14:43,2015-04-20 00:29:44,2015-04-20 00:44:58,2015-04-20 01:00:28,2015-04-20 01:16:07,2015-04-20 01:32:00,2015-04-20 01:47:59,2015-04-20 02:03:27,5.00,2015-04-20 02:10:49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26593,25656,"Prescott, Francine J.",64,F,Boynton Beach,FL,USA,2015-04-20 00:50:15,2015-04-20 01:43:31,2015-04-20 02:36:53,2015-04-20 03:32:26,2015-04-20 04:25:53,2015-04-20 05:19:44,2015-04-20 06:17:19,2015-04-20 07:13:34,17.52,2015-04-20 07:38:56
26594,31359,"Emerson, Annette C.",61,F,Old Town,ME,USA,2015-04-20 00:48:36,2015-04-20 01:39:39,2015-04-20 02:39:13,2015-04-20 03:35:58,2015-04-20 04:32:44,2015-04-20 05:31:58,2015-04-20 06:28:56,2015-04-20 07:26:19,17.98,2015-04-20 07:51:30
26595,25559,"Cerveny, Nona",66,F,Scottsdale,AZ,USA,2015-04-20 00:53:03,2015-04-20 01:47:16,2015-04-20 02:41:45,2015-04-20 03:37:07,2015-04-20 04:33:51,2015-04-20 05:38:56,2015-04-20 06:38:51,2015-04-20 07:36:18,18.30,2015-04-20 07:59:33
26596,29396,"Buccilli, Alfred P.",53,M,Lynnfield,MA,USA,2015-04-20 00:49:04,2015-04-20 01:40:12,2015-04-20 02:33:31,2015-04-20 03:31:41,2015-04-20 04:29:20,2015-04-20 05:31:11,2015-04-20 06:33:35,2015-04-20 07:35:38,18.33,2015-04-20 08:00:37


0       0 days 01:47:56
1       0 days 01:47:59
2       0 days 01:48:18
3       0 days 01:49:04
4       0 days 01:48:44
              ...      
26593   0 days 06:23:19
26594   0 days 06:37:43
26595   0 days 06:43:15
26596   0 days 06:46:34
26597   0 days 07:01:14
Length: 26304, dtype: timedelta64[ns]

[4.93,
 4.97,
 4.98,
 5.0,
 5.0,
 5.0,
 5.02,
 5.07,
 5.1,
 5.12,
 5.12,
 5.13,
 5.22,
 5.27,
 5.3,
 5.32,
 5.32,
 5.35,
 5.37,
 5.37,
 5.37,
 5.4,
 5.4,
 5.42,
 5.43,
 5.45,
 5.48,
 5.52,
 5.53,
 5.53,
 5.53,
 5.53,
 5.55,
 5.55,
 5.55,
 5.55,
 5.55,
 5.55,
 5.55,
 5.57,
 5.57,
 5.57,
 5.58,
 5.58,
 5.58,
 5.58,
 5.6,
 5.6,
 5.6,
 5.6,
 5.6,
 5.6,
 5.6,
 5.62,
 5.62,
 5.62,
 5.62,
 5.62,
 5.63,
 5.63,
 5.63,
 5.65,
 5.65,
 5.65,
 5.65,
 5.65,
 5.67,
 5.67,
 5.67,
 5.67,
 5.67,
 5.67,
 5.67,
 5.68,
 5.7,
 5.7,
 5.7,
 5.7,
 5.72,
 5.72,
 5.72,
 5.72,
 5.72,
 5.72,
 5.73,
 5.73,
 5.73,
 5.73,
 5.75,
 5.75,
 5.77,
 5.78,
 5.78,
 5.78,
 5.78,
 5.8,
 5.8,
 5.8,
 5.8,
 5.82,
 5.82,
 5.82,
 5.82,
 5.82,
 5.82,
 5.82,
 5.82,
 5.83,
 5.83,
 5.83,
 5.83,
 5.83,
 5.83,
 5.85,
 5.85,
 5.85,
 5.85,
 5.85,
 5.85,
 5.85,
 5.85,
 5.87,
 5.87,
 5.87,
 5.87,
 5.87,
 5.87,
 5.87,
 5.88,
 5.88,
 5.88,
 5.88,
 5.88,
 5.88,
 5.88,
 5.88,
 5.88,
 5.9,
 5.9,
 5.92,
 5.92,
 5.92,
 5.92,
 5.92,
 5.93,
 5.93,
 5