# Cleaning the Dataset

## Setup

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import json
import os.path
import string
from scripts.scrape import get_json, get_metadata

In [2]:
data_folder = os.path.join('..', 'data')

dirty_filename = os.path.join(data_folder, 'berlin_marathon_times_dirty.csv')
clean_filename = os.path.join(data_folder, 'berlin_marathon_times.csv')

dirty_filename

'../data/berlin_marathon_times_dirty.csv'

In [3]:
df = pd.read_csv(dirty_filename)
df.head()

Unnamed: 0,id,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
0,2,19431,2002,Ohnemueller,Bert Martin,,GER,1959,M,M45,2758,04:23:26,04:23:48,2005
1,3,10609,29356,Klinginger,Kurt,smart running team,GER,1947,M,M55,344,03:53:01,03:56:43,2005
2,4,23031,2034,Eleftheriadis,Georgios,,GRE,1962,M,M40,4470,04:38:47,04:42:41,2005
3,5,14104,2035,Scholz,Detlef,,GER,1970,M,M35,2573,04:03:50,04:05:14,2005
4,6,30239,F105,Frischke,Lucie,LAV Tempelhof Berlin,GER,1932,W,W70,2,06:07:10,06:09:04,2005


## Completeness Check

In [4]:
participants_year = df.year.value_counts().sort_index()
participants_year

2005    30373
2006    30182
2007    32486
2008    35731
2009    35015
2010    34001
2011    32914
2012    34270
2013    36473
2014    28945
2015    36753
2016    35991
Name: year, dtype: int64

In [5]:
participants_year_meta = participants_year.copy()

for year in participants_year.index:
    with open('../data/{0}-1.json'.format(year)) as f:
        j = json.load(f)
        m = get_metadata(j)
        participants_year_meta[year] = m.n_rows
        
participants_year_meta

2005    30373
2006    30182
2007    32486
2008    35731
2009    35015
2010    34001
2011    32914
2012    34270
2013    36473
2014    28945
2015    36753
2016    35991
Name: year, dtype: int64

In [6]:
len(participants_year[participants_year != participants_year_meta])

0

## Overview

In [7]:
len(df)

403134

In [8]:
df.dtypes

id                  int64
place               int64
bib                object
surname            object
forename           object
team               object
nationality        object
yob                 int64
sex                object
age_class          object
age_class_place     int64
net_time           object
clock_time         object
year                int64
dtype: object

In [9]:
df.describe()

Unnamed: 0,id,place,yob,age_class_place,year
count,403134.0,403134.0,403134.0,403134.0,403134.0
mean,26240.607041,16899.358593,1967.243013,1596.684058,2010.619734
std,25441.663187,9861.602089,47.518969,1321.93204,3.426661
min,2.0,1.0,0.0,0.0,2005.0
25%,10150.0,8402.0,1962.0,517.0,2008.0
50%,20216.0,16804.0,1968.0,1208.0,2011.0
75%,31350.0,25206.0,1976.0,2456.0,2014.0
max,141120.0,36768.0,1998.0,6101.0,2016.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403134 entries, 0 to 403133
Data columns (total 14 columns):
id                 403134 non-null int64
place              403134 non-null int64
bib                403127 non-null object
surname            403096 non-null object
forename           403089 non-null object
team               181573 non-null object
nationality        403120 non-null object
yob                403134 non-null int64
sex                403132 non-null object
age_class          403105 non-null object
age_class_place    403134 non-null int64
net_time           403134 non-null object
clock_time         403134 non-null object
year               403134 non-null int64
dtypes: int64(5), object(9)
memory usage: 43.1+ MB


# Unique IDs

In [11]:
id_counts = df.groupby('id').year.count()
id_problems = id_counts[id_counts > 1]
len(id_problems)

58616

In [12]:
id_problems.describe()

count    58616.000000
mean         6.602378
std          3.688039
min          2.000000
25%          2.000000
50%          8.000000
75%         10.000000
max         12.000000
Name: year, dtype: float64

In [13]:
ids = df.groupby(['id', 'year']).place.count()
ids_problems = ids[ids > 1]
len(ids_problems)

0

In [14]:
df.id = (df.year.astype('str') + df.id.astype('str')).astype('int64')
df.head()

Unnamed: 0,id,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
0,20052,19431,2002,Ohnemueller,Bert Martin,,GER,1959,M,M45,2758,04:23:26,04:23:48,2005
1,20053,10609,29356,Klinginger,Kurt,smart running team,GER,1947,M,M55,344,03:53:01,03:56:43,2005
2,20054,23031,2034,Eleftheriadis,Georgios,,GRE,1962,M,M40,4470,04:38:47,04:42:41,2005
3,20055,14104,2035,Scholz,Detlef,,GER,1970,M,M35,2573,04:03:50,04:05:14,2005
4,20056,30239,F105,Frischke,Lucie,LAV Tempelhof Berlin,GER,1932,W,W70,2,06:07:10,06:09:04,2005


In [15]:
id_counts = df.groupby('id').year.count()
id_problems = id_counts[id_counts > 1]
len(id_problems)

0

In [16]:
df = df.set_index('id')

## Place

In [17]:
len(df[df.place.isnull()])

0

In [18]:
df.place.describe()

count    403134.000000
mean      16899.358593
std        9861.602089
min           1.000000
25%        8402.000000
50%       16804.000000
75%       25206.000000
max       36768.000000
Name: place, dtype: float64

## Times

In [19]:
len(df[df.net_time.isnull()])

0

In [20]:
len(df[df.clock_time.isnull()])

0

In [21]:
df.net_time = pd.to_timedelta(df.net_time)
df.clock_time = pd.to_timedelta(df.clock_time)
df.head()

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20052,19431,2002,Ohnemueller,Bert Martin,,GER,1959,M,M45,2758,04:23:26,04:23:48,2005
20053,10609,29356,Klinginger,Kurt,smart running team,GER,1947,M,M55,344,03:53:01,03:56:43,2005
20054,23031,2034,Eleftheriadis,Georgios,,GRE,1962,M,M40,4470,04:38:47,04:42:41,2005
20055,14104,2035,Scholz,Detlef,,GER,1970,M,M35,2573,04:03:50,04:05:14,2005
20056,30239,F105,Frischke,Lucie,LAV Tempelhof Berlin,GER,1932,W,W70,2,06:07:10,06:09:04,2005


In [22]:
df.net_time.describe()

count                    403134
mean     0 days 04:09:00.199402
std      0 days 00:42:32.086175
min             0 days 00:00:00
25%             0 days 03:39:12
50%             0 days 04:04:23
75%             0 days 04:35:43
max             0 days 08:41:33
Name: net_time, dtype: object

In [23]:
df.clock_time.describe()

count                    403134
mean     0 days 04:19:57.418917
std      0 days 00:48:21.491980
min             0 days 00:00:00
25%             0 days 03:45:24
50%             0 days 04:16:00
75%      0 days 04:51:08.750000
max             0 days 11:01:11
Name: clock_time, dtype: object

The maximum times are above the 6 hour cutoff.

In [24]:
zero = pd.Timedelta(0)
len(df[df.net_time == zero])

2

In [25]:
df.net_time = df.net_time.replace(zero, np.nan)
len(df[df.net_time == zero])

0

In [26]:
zero_clock_times = df[df.clock_time == zero]
zero_clock_times

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20132212,31946,WF33,Power,Patrice,,IRL,1964,W,45,1372,04:53:26,0 days,2013
201313031,28509,WF25,Meissner,Sabine,"SV Turbine Neubrandenburg, Wal",DEU,1959,W,50,717,04:35:28,0 days,2013
201334468,28563,34343,Oltmans,Egon,,NLD,1960,M,50,3122,NaT,0 days,2013
201429961,17446,32000,Müller,Heino,,DEU,1953,M,60,300,04:14:04,0 days,2014
201519713,9277,39035,Fennel,Sascha,,GER,1964,M,50,879,03:36:20,0 days,2015
201579851,2783,39046,Schmidt,Andreas,,GER,1973,M,40,585,03:08:17,0 days,2015


In [27]:
df.clock_time = df.clock_time.replace(zero, np.nan)
len(df[df.clock_time == zero])

0

In [28]:
times = df[df.net_time > df.clock_time]
times

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
200641853,22138,F4217,Vogel,Esther,,DEU,1973,W,W30,524,04:41:51,04:26:20,2006
201411182,2673,13549,Sowisnki,Fabrice,SAINT VENANT ATHLETISME,FRA,1974,M,40,595,03:13:55,02:58:59,2014


In [29]:
ev = df.loc[200641853]
fs = df.loc[201411182]

In [30]:
df[(df.year == ev.year) & (ev.place - 5 < df.place) & (df.place < ev.place + 5)].sort_values('place')

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20063560,22134,18419,Arens,Marc,KJG Mü/Si/Wa,DEU,1984,M,MH,1816,04:41:50,04:43:58,2006
20069778,22135,12254,von Blume,Marc,SV Blitzenreute,DEU,1962,M,M40,4357,04:41:50,04:43:48,2006
200613258,22136,8523,Kolke,Reinhard,,DEU,1968,M,M35,3544,04:41:50,04:43:41,2006
200626030,22137,26910,Wojczewski,Axel-Rainer,,DEU,1949,M,M55,936,04:41:51,04:46:29,2006
200617899,22138,F3482,Lorch,Renate,,DEU,1960,W,W45,579,04:41:51,04:47:42,2006
200641853,22138,F4217,Vogel,Esther,,DEU,1973,W,W30,524,04:41:51,04:26:20,2006
200614787,22140,16772,Kaspar,Werner,,DEU,1953,M,M50,1985,04:41:52,04:45:27,2006
200643398,22141,13745,Ruholl,Steve,,DEU,1979,M,MH,1817,04:41:52,04:45:31,2006
200628360,22142,29253,Fortune,Jean Pierre,,FRA,1955,M,M50,1986,04:41:52,04:46:23,2006


In [31]:
df.set_value(ev.name, 'clock_time', ev.clock_time + pd.Timedelta('20 min'))
df.loc[ev.name]

place                        22138
bib                          F4217
surname                      Vogel
forename                    Esther
team                           NaN
nationality                    DEU
yob                           1973
sex                              W
age_class                      W30
age_class_place                524
net_time           0 days 04:41:51
clock_time         0 days 04:46:20
year                          2006
Name: 200641853, dtype: object

In [32]:
df[(df.year == fs.year) & (fs.place - 5 < df.place) & (df.place < fs.place + 5)].sort_values('place')

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
201428140,2669,24681,Rousing,Allan,MTC,DNK,1969,M,45,441,03:13:55,03:18:26,2014
20143965,2670,6318,Borck,Jan,Der Kleine Muck,DEU,1964,M,50,200,03:13:55,03:14:12,2014
20146920,2671,26033,Schuller,Eelco,PAC,NLD,1986,M,H,264,03:13:55,03:15:41,2014
201428776,2672,6185,Boge,Georg,,DEU,1980,M,30,464,03:13:55,03:18:29,2014
201411182,2673,13549,Sowisnki,Fabrice,SAINT VENANT ATHLETISME,FRA,1974,M,40,595,03:13:55,02:58:59,2014
20144960,2674,F597,Aljancic,Natasa,AK DOMALE,SVN,1969,W,45,13,03:13:56,03:15:45,2014
20143080,2675,7519,Chambers,Dion,,GBR,1989,M,H,265,03:13:56,03:14:53,2014
20145671,2676,7933,Colsman,Philipp,,DEU,1967,M,45,442,03:13:57,03:16:41,2014
201426721,2677,13784,Hinrichsen,Thomas,HRLK / BLL,DNK,1975,M,35,517,03:13:58,03:14:12,2014


In [33]:
df.set_value(fs.name, 'clock_time', ev.clock_time + pd.Timedelta('20 min'))
df.loc[fs.name]

place                                 2673
bib                                  13549
surname                           Sowisnki
forename                           Fabrice
team               SAINT VENANT ATHLETISME
nationality                            FRA
yob                                   1974
sex                                      M
age_class                               40
age_class_place                        595
net_time                   0 days 03:13:55
clock_time                 0 days 04:46:20
year                                  2014
Name: 201411182, dtype: object

In [34]:
len(df[df.net_time > df.clock_time])

0

## Sex

In [35]:
df.sex.describe()

count     403132
unique         2
top            M
freq      313715
Name: sex, dtype: object

In [36]:
df.sex.value_counts()

M    313715
W     89417
Name: sex, dtype: int64

In [37]:
no_sex = df[df.sex.isnull()]
len(no_sex)

2

In [38]:
no_sex

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
201677124,27472,6956,Barris,Wolfram,,,1951,,,191,04:43:58,05:00:42,2016
201678774,26936,40651,Schnetzer,Wolfgang,,,1961,,,1509,04:41:32,05:15:56,2016


In [39]:
df.sex = df.sex.fillna('M').astype('category')
df.sex.isnull().max()

False

## Year of Birth (YOB)

In [40]:
df.yob.describe()

count    403134.000000
mean       1967.243013
std          47.518969
min           0.000000
25%        1962.000000
50%        1968.000000
75%        1976.000000
max        1998.000000
Name: yob, dtype: float64

In [41]:
len(df[df.yob.isnull()])

0

In [42]:
no_yob = df[df.yob == 0]
no_yob.head()

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005717,22020,2119,Romero,Michael,,GER,0,M,MH,1964,04:34:08,04:38:11,2005
2005946,7679,2681,Tofte,Poul Erik,TRI 4,DEN,0,M,MH,746,03:42:02,03:45:02,2005
20051058,12767,3574,Lauterbach,Klaus,,GER,0,M,MH,1170,03:58:59,04:01:17,2005
20051546,30256,F1495,Ewald,Gisela,,GER,0,W,WH,874,06:09:28,06:11:03,2005
20051853,11550,3249,Breidenstein,Volker,,GER,0,M,MH,1068,03:55:56,03:58:50,2005


In [43]:
df.yob = df.yob.replace(0, np.nan)
df.yob.describe()

count    402910.000000
mean       1968.336713
std          10.321517
min        1901.000000
25%        1962.000000
50%        1968.000000
75%        1976.000000
max        1998.000000
Name: yob, dtype: float64

It is unlikely to have a YOB of 1901.

In [44]:
df.yob.value_counts().sort_index().head(10)

1901.0    22
1902.0     3
1923.0     2
1925.0     2
1926.0     5
1927.0    12
1928.0    12
1929.0     8
1930.0    11
1931.0    22
Name: yob, dtype: int64

In [45]:
df['age'] = df.year - df.yob
df[df.yob <= 1925].sort_values(['yob', 'age'], ascending=[True, False])[['year', 'yob', 'age', 'age_class', 'net_time', 'place', 'age_class_place']]

Unnamed: 0_level_0,year,yob,age,age_class,net_time,place,age_class_place
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201681102,2016,1901.0,115.0,,03:05:59,2126,0
20105394,2010,1901.0,109.0,W,04:00:01,16646,0
20108584,2010,1901.0,109.0,M,04:09:04,19394,0
20109381,2010,1901.0,109.0,M,03:54:42,14287,0
201015909,2010,1901.0,109.0,M,04:50:29,29460,0
201016059,2010,1901.0,109.0,W,05:02:11,31018,0
201017069,2010,1901.0,109.0,M,04:09:35,19565,0
201020199,2010,1901.0,109.0,M,03:08:16,2071,0
201026575,2010,1901.0,109.0,M,03:40:54,9474,0
201027772,2010,1901.0,109.0,M,03:41:56,9751,0


In [46]:
df.yob = df.yob.replace(1901, np.nan).replace(1902, np.nan)

Once we have cleaned up the age class, we can use this information to guess the YOB.

Are some listed as too young to participate?

In [47]:
df['age'] = df.year - df.yob
df[df.age < 18]

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20052685,7246,5257,Koepke,Mathias,Mahlower SV,GER,1988.0,M,MJA,18,03:40:18,03:40:58,2005,17.0
20055704,26060,5466,Jacobsen,Eystein André,,NOR,1988.0,M,MJA,97,04:55:34,04:59:23,2005,17.0
200531437,4107,29736,Petrausch,Janek,Schenker-TEAM,GER,1988.0,M,MJA,9,03:26:21,03:28:55,2005,17.0
2007915,20059,2446,Musielak,Norman,,DEU,1990.0,M,MJA,84,04:15:32,04:15:52,2007,17.0
20071154,30028,F1332,Scharpf,Maria,Scharpf-Family,DEU,1990.0,W,WJA,23,05:09:06,05:15:09,2007,17.0
200717477,32343,F6340,Garbe,Tabitha,,DEU,1990.0,W,WJA,29,06:07:53,06:10:04,2007,17.0
200910843,24928,22567,Oppermann,Patrick,,DEU,1995.0,M,MJA,79,04:35:04,04:42:02,2009,14.0
20104134,20039,F8916,Forsberg,Sofie,MTC,DNK,1993.0,W,WJA,7,04:11:04,04:33:41,2010,17.0


## Age Class

In [48]:
df.age_class.describe()

count     403105
unique        42
top           45
freq       38433
Name: age_class, dtype: object

In [49]:
no_age_class = df[df.age_class.isnull()]
len(no_age_class)

29

In [50]:
no_age_class.head()

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201220259,12554,3612,Adriaansen,Dave,,NLD,1977.0,M,,0,03:49:46,04:00:05,2012,35.0
201228086,22822,33294,Hocke,Michael,DB,DEU,,M,,0,04:21:03,04:43:22,2012,
201228871,32527,27010,Steffensen,Bjarne,,DNK,,M,,0,05:22:28,05:45:36,2012,
201232991,585,15426,van Keulen,Louran,AP-running,NLD,,M,,0,02:52:23,02:52:31,2012,
201233353,18654,F7913,Szpakowska,Agnieszka,,DEU,,W,,0,04:06:25,04:23:04,2012,


In [51]:
df.age_class.value_counts().sort_index()

30     25394
35     30177
40     38211
45     38433
50     28243
55     13845
60      6461
65      2477
70       971
75       255
80        37
H      20298
JA       420
M         25
M30    18301
M35    26373
M40    33912
M45    27643
M50    17945
M55     9114
M60     4376
M65     2058
M70      597
M75       82
M80       15
MH     16315
MJA      724
U20       95
W          8
W30     5474
W35     6598
W40     8675
W45     7093
W50     4033
W55     1618
W60      584
W65      251
W70       50
W75        3
W80        5
WH      5742
WJA      174
Name: age_class, dtype: int64

In [52]:
len(df[df.age_class.str.contains(r'[MW]', na=False)])

197788

In [53]:
age_class_sex = df[df.age_class.str.contains(r'[MW]', na=False)].age_class.str.get(0)
len(age_class_sex)

197788

In [54]:
inconsistent_sex0 = df.loc[age_class_sex.index, 'sex'] != age_class_sex
inconsistent_sex = df.loc[inconsistent_sex0[inconsistent_sex0 == True].index, :]
len(inconsistent_sex)

34

In [55]:
inconsistent_sex

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200510341,1532,F3923,Lohmann,Henrik,Sparta,DEN,1964.0,W,M40,353,03:06:49,03:07:37,2005,41.0
200511280,14344,F5717,Sanchez,Raul,,SUI,1962.0,W,M40,3067,04:04:49,04:08:49,2005,43.0
200523704,10930,F4420,Kachlik,Stephan,,GER,1963.0,W,M40,2410,03:54:04,03:56:47,2005,42.0
200523897,7434,F5646,Kirchhoff,Jochen,TSV Bollensen,GER,1957.0,W,M45,1085,03:41:07,03:44:33,2005,48.0
200528829,9129,F6506,Pape,Gerald,TriFrogs Gellersen,GER,1965.0,W,M40,2017,03:47:41,03:49:22,2005,40.0
200533336,21772,F8928,Tricoche,Elisabeth,Defi Horizon,FRA,1949.0,W,M55,840,04:33:09,04:37:12,2005,56.0
200537786,6827,32681,Filippo,Proietti,,ITA,1967.0,M,W35,102,03:38:31,03:40:10,2005,38.0
200537789,20204,32699,Demetrio,Alberto,,BRA,1965.0,M,W40,677,04:26:27,04:30:40,2005,40.0
20064614,2110,F3676,Aeby,Pascal Laurent,,DEU,1966.0,W,M40,495,03:16:09,03:16:17,2006,40.0
200616028,106,F7055,Nordgaard,Haavard,Nike,NOR,1961.0,W,M45,3,02:38:57,02:39:04,2006,45.0


In [56]:
inconsistent_sex.sex = inconsistent_sex.age_class.str.get(0)
inconsistent_sex

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200510341,1532,F3923,Lohmann,Henrik,Sparta,DEN,1964.0,M,M40,353,03:06:49,03:07:37,2005,41.0
200511280,14344,F5717,Sanchez,Raul,,SUI,1962.0,M,M40,3067,04:04:49,04:08:49,2005,43.0
200523704,10930,F4420,Kachlik,Stephan,,GER,1963.0,M,M40,2410,03:54:04,03:56:47,2005,42.0
200523897,7434,F5646,Kirchhoff,Jochen,TSV Bollensen,GER,1957.0,M,M45,1085,03:41:07,03:44:33,2005,48.0
200528829,9129,F6506,Pape,Gerald,TriFrogs Gellersen,GER,1965.0,M,M40,2017,03:47:41,03:49:22,2005,40.0
200533336,21772,F8928,Tricoche,Elisabeth,Defi Horizon,FRA,1949.0,M,M55,840,04:33:09,04:37:12,2005,56.0
200537786,6827,32681,Filippo,Proietti,,ITA,1967.0,W,W35,102,03:38:31,03:40:10,2005,38.0
200537789,20204,32699,Demetrio,Alberto,,BRA,1965.0,W,W40,677,04:26:27,04:30:40,2005,40.0
20064614,2110,F3676,Aeby,Pascal Laurent,,DEU,1966.0,M,M40,495,03:16:09,03:16:17,2006,40.0
200616028,106,F7055,Nordgaard,Haavard,Nike,NOR,1961.0,M,M45,3,02:38:57,02:39:04,2006,45.0


In [57]:
wrong = {20087472, 200729757, 20079396, 200641938, 200537789, 200537786, 200533336}
fix_my_sex = inconsistent_sex[inconsistent_sex.index.isin(wrong)]
fix_my_sex

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200533336,21772,F8928,Tricoche,Elisabeth,Defi Horizon,FRA,1949.0,M,M55,840,04:33:09,04:37:12,2005,56.0
200537786,6827,32681,Filippo,Proietti,,ITA,1967.0,W,W35,102,03:38:31,03:40:10,2005,38.0
200537789,20204,32699,Demetrio,Alberto,,BRA,1965.0,W,W40,677,04:26:27,04:30:40,2005,40.0
200641938,19813,32282,Filho,Lourival,Brazil,BRA,1957.0,W,W45,460,04:31:24,04:33:18,2006,49.0
20079396,4775,33909,Ángel Ávila,Juan Manuel,Nacho Sport,ESP,1963.0,W,W40,58,03:26:09,03:28:44,2007,44.0
200729757,8923,34146,Frank,Vandenhole,,FRA,1968.0,W,W35,129,03:41:50,03:42:01,2007,39.0
20087472,15536,37917,Macedo,Guilherme,,BRA,1980.0,W,WH,258,03:56:58,04:01:31,2008,28.0


In [58]:
fix_my_sex.sex = fix_my_sex.sex.apply(lambda s: 'W' if s == 'M' else 'M')
df.loc[fix_my_sex.index, 'sex'] = fix_my_sex.sex
df.loc[fix_my_sex.index, :]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200533336,21772,F8928,Tricoche,Elisabeth,Defi Horizon,FRA,1949.0,W,M55,840,04:33:09,04:37:12,2005,56.0
200537786,6827,32681,Filippo,Proietti,,ITA,1967.0,M,W35,102,03:38:31,03:40:10,2005,38.0
200537789,20204,32699,Demetrio,Alberto,,BRA,1965.0,M,W40,677,04:26:27,04:30:40,2005,40.0
200641938,19813,32282,Filho,Lourival,Brazil,BRA,1957.0,M,W45,460,04:31:24,04:33:18,2006,49.0
20079396,4775,33909,Ángel Ávila,Juan Manuel,Nacho Sport,ESP,1963.0,M,W40,58,03:26:09,03:28:44,2007,44.0
200729757,8923,34146,Frank,Vandenhole,,FRA,1968.0,M,W35,129,03:41:50,03:42:01,2007,39.0
20087472,15536,37917,Macedo,Guilherme,,BRA,1980.0,M,WH,258,03:56:58,04:01:31,2008,28.0


In [59]:
df.age_class = df.age_class.str.replace(r'[MW]?(.*)', r'\1')
df.age_class.value_counts().sort_index()

          33
30     49169
35     63148
40     80798
45     73169
50     50221
55     24577
60     11421
65      4786
70      1618
75       340
80        57
H      42355
JA      1318
U20       95
Name: age_class, dtype: int64

In [60]:
no_ac = df[(df.age_class == "")]
no_ac

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20101501,27915,27762,Thiel,Stephan,,DEU,,M,,0,04:41:51,05:06:28,2010,
20102084,1254,36577,Naumann,Daniel,,DEU,,M,,0,02:59:23,02:59:52,2010,
20102087,3050,36580,Mindaugas,Garmus,,DEU,,M,,0,03:14:59,03:14:59,2010,
20102093,10741,36590,Da Conceicao,Toni,,DEU,,M,,0,03:45:01,03:54:56,2010,
20102443,27165,F8708,Castellari,Loredana,,ITA,,W,,0,04:38:07,04:58:51,2010,
20102822,31303,31359,Bitzer,Christian,adidas,DEU,,M,,0,05:05:31,05:28:21,2010,
20103418,3218,31316,Besnard,Nicolas,"real,- Gewinnspiel",FRA,,M,,0,03:16:04,03:18:35,2010,
20103419,14441,31792,Charles,Romain,"real,- Gewinnspiel",FRA,,M,,0,03:55:05,04:10:37,2010,
20105394,16646,F5795,Padberg,Barbara,,DEU,,W,,0,04:00:01,04:03:32,2010,
20108584,19394,27493,Talmas,Olivier,Next One,BEL,,M,,0,04:09:04,04:20:52,2010,


In [61]:
df.age_class = df.age_class.replace('', np.nan)
len(df[df.age_class == ""])

0

In [62]:
df.groupby('age_class').age.aggregate([np.min, np.median, np.max])

Unnamed: 0_level_0,amin,median,amax
age_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,30.0,32.0,54.0
35,32.0,37.0,40.0
40,39.0,42.0,44.0
45,45.0,47.0,49.0
50,47.0,52.0,54.0
55,55.0,56.0,59.0
60,60.0,61.0,64.0
65,65.0,66.0,69.0
70,60.0,71.0,74.0
75,75.0,76.0,79.0


In [63]:
df[(df.age_class == '30') & (df.age >= 35)]

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200834733,4928,36826,Silva,Usiel,Rho Delta Trave,BRA,1954.0,M,30,677,03:23:32,03:23:38,2008,54.0


In [64]:
df.set_value(200834733, 'age_class', '50')

df[(df.age_class == '50') & (df.age < 50)]

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20088907,2532,17349,Lameir,Christian,,DEU,1959.0,M,50,155,03:10:18,03:10:26,2008,49.0
201232192,20225,13846,Hundertmark,Peter,,DEU,1965.0,M,50,2125,04:12:00,04:22:19,2012,47.0


In [65]:
df.set_value(201232192, 'age_class', '45')

df[(df.age_class == '70') & (df.age < 70)]

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200916097,28932,F174,Kiederle,Inger,Atletica,DNK,1942.0,W,70,3,04:52:07,05:07:35,2009,67.0
200932876,20390,37361,Thiel,Peter,,DEU,1949.0,M,70,19,04:19:01,04:30:43,2009,60.0


In [66]:
df.set_value(200916097, 'age_class', '65')
df.set_value(200932876, 'age_class', '60')

df[(df.age_class == 'H') & (df.age >= 30)]

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200722121,41,84,Güssow,Christian,Hamburger SV,DEU,1976.0,M,H,17,02:27:05,02:27:08,2007,31.0
200919946,14485,23852,Pranzner,Andreas,Berliner Feuerwehr,DEU,1961.0,M,H,1415,04:00:00,04:19:35,2009,48.0


In [67]:
df.set_value(200722121, 'age_class', '30')
df.set_value(200919946, 'age_class', '45')
len(df)

403134

In [68]:
def upper(match):
    if match.group(2) == '0':
        d = '4'
    else:
        d = '9'
    return '{0}-{1}{2}'.format(match.group(0), match.group(1), d)

df.age_class = df.age_class.str.replace(r'^(JA|U20)$', '0-19').str.replace('^H$', '20-29').str.replace(r'^8.*', '80+').str.replace(r'^(\d)(\d)$', upper)
df.groupby('age_class').age.aggregate([np.min, np.max])

Unnamed: 0_level_0,amin,amax
age_class,Unnamed: 1_level_1,Unnamed: 2_level_1
0-19,14.0,19.0
20-29,20.0,29.0
30-34,30.0,34.0
35-39,32.0,40.0
40-44,39.0,44.0
45-49,45.0,49.0
50-54,49.0,54.0
55-59,55.0,59.0
60-64,60.0,64.0
65-69,65.0,69.0


In [69]:
age_classes = pd.unique(df.age_class.dropna())
df.age_class = df.age_class.astype('category', categories=age_classes, ordered=True)

## Age Class Place

In [70]:
df.age_class_place.describe()

count    403134.000000
mean       1596.684058
std        1321.932040
min           0.000000
25%         517.000000
50%        1208.000000
75%        2456.000000
max        6101.000000
Name: age_class_place, dtype: float64

In [71]:
len(df[df.age_class_place.isnull()])

0

In [72]:
df.age_class_place = df.age_class_place.replace(0, np.nan)
zero_acp = df[df.age_class_place.isnull()]
zero_acp[['age_class', 'place', 'net_time', 'clock_time', 'year', 'yob', 'age']]

Unnamed: 0_level_0,age_class,place,net_time,clock_time,year,yob,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20101501,,27915,04:41:51,05:06:28,2010,,
20102084,,1254,02:59:23,02:59:52,2010,,
20102087,,3050,03:14:59,03:14:59,2010,,
20102093,,10741,03:45:01,03:54:56,2010,,
20102443,,27165,04:38:07,04:58:51,2010,,
20102822,,31303,05:05:31,05:28:21,2010,,
20103418,,3218,03:16:04,03:18:35,2010,,
20103419,,14441,03:55:05,04:10:37,2010,,
20105394,,16646,04:00:01,04:03:32,2010,,
20108584,,19394,04:09:04,04:20:52,2010,,


In [73]:
missing_ac = df[(df.age_class_place.isnull()) & (df.age_class.isnull()) & (~df.yob.isnull())]
missing_ac.age_class = (5 * (missing_ac.age // 5)).astype('int').astype('str').str.replace(r'^(\d)(\d)$', upper)
missing_ac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
201013887,9534,,Coll,Florian,Cours Forest,FRA,1983.0,M,25-29,,03:41:07,03:52:11,2010,27.0
201220259,12554,3612,Adriaansen,Dave,,NLD,1977.0,M,35-39,,03:49:46,04:00:05,2012,35.0
2015141090,36483,F12454,Stasch,Christa,,GER,1956.0,W,55-59,,06:10:32,06:46:46,2015,59.0
2015141108,35431,38838,Wang,Hsi-Tsang,,TPE,1976.0,M,35-39,,05:36:47,06:16:10,2015,39.0
2015141111,9428,38825,Heinrichs,Markus,SCW München Triathlon,GER,1980.0,M,35-39,,03:36:51,03:56:07,2015,35.0
2015141114,19328,38827,Huai,Qianjiang,,CHN,1975.0,M,40-44,,04:04:35,04:24:01,2015,40.0
2015141120,18143,38828,Kraus,Martin,,GER,1975.0,M,40-44,,04:00:32,04:23:56,2015,40.0


In [74]:
missing_ac.set_value(201013887, 'age_class', '20-29')
df.loc[missing_ac.index, 'age_class'] = missing_ac.age_class

### Consistent Ordering

In [75]:
df['clean_place'] = df.groupby('year').net_time.rank(method='max', na_option='bottom')
df['clean_age_class_place'] = df.groupby(['year', 'age_class']).net_time.rank(method='max', na_option='bottom')
df['super_place'] = df.net_time.rank(method='max', na_option='bottom')
df['super_age_class_place'] = df.net_time.rank(method='max', na_option='bottom')
df.sample(10)

Unnamed: 0_level_0,place,bib,surname,forename,team,nationality,yob,sex,age_class,age_class_place,net_time,clock_time,year,age,clean_place,clean_age_class_place,super_place,super_age_class_place
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20051937,19848,3302,Wonneberger,Martin,,GER,1984.0,M,20-29,1784.0,04:25:03,04:28:35,2005,21.0,19846.0,2185.0,271103.0,271103.0
201120851,12677,F5163,Martinez,Sondra,,USA,1976.0,W,35-39,265.0,03:54:44,04:03:06,2011,35.0,12682.0,2135.0,159368.0,159368.0
201318649,33082,F510,Abruzzini,Vera,PN TREINAMENTO,BRA,1963.0,W,50-54,994.0,05:00:46,05:25:22,2013,50.0,33003.0,4510.0,356405.0,356405.0
201133672,15992,34695,Timmer,Martin,,NLD,1953.0,M,55-59,625.0,04:03:41,04:11:54,2011,58.0,15991.0,668.0,199271.0,199271.0
200933231,30232,6689,Bucksch,Klaus,,DEU,1956.0,M,50-54,2845.0,04:59:15,05:13:28,2009,53.0,30231.0,3391.0,354079.0,354079.0
200626597,25881,32118,Neidhardt,Michael,,DEU,1963.0,M,40-44,4869.0,05:02:06,05:04:14,2006,43.0,25873.0,5945.0,358207.0,358207.0
2013424,4400,19107,Mühleisen,Stefan,Teamadidas,DEU,1979.0,M,30-34,676.0,03:19:25,03:22:55,2013,34.0,4395.0,712.0,44480.0,44480.0
200816510,10508,23850,Ravnholdt Frederiksen,Lars,KTT2000,DNK,1974.0,M,30-34,1278.0,03:43:29,03:45:30,2008,34.0,10503.0,1413.0,115259.0,115259.0
200825650,9360,27731,Soy Fusté,Joan,,ESP,1965.0,M,40-44,2070.0,03:39:46,03:41:51,2008,43.0,9357.0,2222.0,102801.0,102801.0
201319488,1714,24876,SIKORA,GRZEGORZ,,POL,1983.0,M,30-34,342.0,03:02:43,03:09:20,2013,30.0,1711.0,351.0,18256.0,18256.0


## Teams

In [76]:
df.team.describe()

count       181573
unique       68777
top       Terramia
freq          1908
Name: team, dtype: object

## Nationality

In [77]:
df.nationality.describe()

count     403120
unique       250
top          DEU
freq      155501
Name: nationality, dtype: object

In [78]:
len(df[df.nationality.isnull()])

14

[ISO](https://en.wikipedia.org/wiki/ISO_3166-1)
[FIFA](https://en.wikipedia.org/wiki/List_of_FIFA_country_codes)
[IOC](https://en.wikipedia.org/wiki/List_of_IOC_country_codes)

In [79]:
nat = pd.read_csv('../data/countries.csv')
nat.head()

Unnamed: 0,abbreviation,iso3166_1alpha_3,country
0,ABW,ABW,Aruba
1,AFG,AFG,Afghanistan
2,AGO,AGO,Angola
3,AHO,AHO,Netherlands Antilles
4,ALB,ALB,Albania


In [80]:
sorted(set(df.nationality.dropna()) - set(nat.abbreviation))

['RKS']

In [81]:
df = pd.merge(df, nat, left_on='nationality', right_on='abbreviation', how='left').drop('nationality', axis=1)
df.iso3166_1alpha_3.describe()

count     403117
unique       199
top          DEU
freq      192234
Name: iso3166_1alpha_3, dtype: object

## Names

In [82]:
df.forename.describe()

count     403089
unique     37436
top       Thomas
freq        7865
Name: forename, dtype: object

In [83]:
len(df[df.forename.isnull()])

45

In [84]:
df[(df.forename.isnull()) & (~df.surname.isnull())]

Unnamed: 0,place,bib,surname,forename,team,yob,sex,age_class,age_class_place,net_time,clock_time,year,age,clean_place,clean_age_class_place,super_place,super_age_class_place,abbreviation,iso3166_1alpha_3,country
60554,15749,29772,"Chávez-Fernández Goyburu,Juan José",,Regatas Lima,1967.0,M,35-39,2771.0,04:15:47,04:18:00,2006,39.0,15743.0,3142.0,241063.0,241063.0,PER,PER,Peru
60901,31004,F1115,"Mackenzie,",,,1962.0,W,45-49,1028.0,05:21:45,05:29:38,2007,45.0,30993.0,5337.0,379507.0,379507.0,NLD,NLD,Netherlands
94924,22570,F2721,"Elvers-Schreiber,",,DAV Berlin,1951.0,W,55-59,86.0,04:17:47,04:22:01,2008,57.0,22569.0,1021.0,247750.0,247750.0,DEU,DEU,Germany
122234,17214,F2727,"Emmanuelle Rochet-Blanc,",,les chauffe la semelle,1968.0,W,40-44,433.0,04:00:22,04:06:18,2008,40.0,17210.0,4043.0,187956.0,187956.0,FRA,FRA,France
124378,34879,37896,"Williams,",,,,M,20-29,2921.0,05:38:09,05:44:09,2008,,34865.0,3904.0,390558.0,390558.0,DEU,DEU,Germany
124379,27883,37897,"Keith,",,,,M,20-29,2452.0,04:37:34,04:41:15,2008,,27869.0,3097.0,307136.0,307136.0,GBR,GBR,United Kingdom
125273,4790,37898,"Witfield,",,,,M,20-29,503.0,03:22:56,03:24:39,2008,,4791.0,560.0,51816.0,51816.0,GBR,GBR,United Kingdom
132211,836,6092,Bosgoed,,ARO'88,1966.0,M,40-44,183.0,02:57:05,02:57:29,2009,43.0,837.0,184.0,11273.0,11273.0,NLD,NLD,Netherlands
266361,6254,10087,Gestin,,CACL Courbevoie,1967.0,M,45-49,1086.0,03:27:16,03:30:36,2013,46.0,6249.0,1145.0,63193.0,63193.0,FRA,FRA,France


In [85]:
df.surname.describe()

count      403096
unique     127265
top       Nielsen
freq         1648
Name: surname, dtype: object

In [86]:
len(df[df.surname.isnull()])

38

In [87]:
df[(df.surname.isnull()) & (~df.forename.isnull())]

Unnamed: 0,place,bib,surname,forename,team,yob,sex,age_class,age_class_place,net_time,clock_time,year,age,clean_place,clean_age_class_place,super_place,super_age_class_place,abbreviation,iso3166_1alpha_3,country
30385,12730,32232,,Benjamin,,1982.0,M,20-29,1134.0,04:04:33,04:10:01,2006,24.0,12726.0,1342.0,202133.0,202133.0,DEU,DEU,Germany
148108,223,26347,,Christoph,LAV Asics Tübingen,1984.0,M,20-29,53.0,02:41:58,02:42:06,2009,25.0,223.0,58.0,2791.0,2791.0,DEU,DEU,Germany


In [88]:
df.surname = df.surname.str.strip(string.punctuation)
df.forename = df.forename.str.strip(string.punctuation)

## Bibs

In [89]:
df.bib.describe()

count     403127
unique     59339
top        F1007
freq          37
Name: bib, dtype: object

## Save the Clean Data

In [90]:
df.dtypes

place                              int64
bib                               object
surname                           object
forename                          object
team                              object
yob                              float64
sex                             category
age_class                       category
age_class_place                  float64
net_time                 timedelta64[ns]
clock_time               timedelta64[ns]
year                               int64
age                              float64
clean_place                      float64
clean_age_class_place            float64
super_place                      float64
super_age_class_place            float64
abbreviation                      object
iso3166_1alpha_3                  object
country                           object
dtype: object

In [91]:
df.sort_values(['year', 'place', 'clock_time']).to_csv(clean_filename)