In [40]:
# CSV file that the notebook reads the raw results from.
INPUT_PATH = "full_classes.csv"
# Parquet file that the final, featurized frame is written to.
OUTPUT_PATH = "full_classes.parquet"

In [41]:
import pandera as pa
import re

# Category labels accepted by the "Kat" column check below.
Kats = [
    "offene Klasse",
    "M Offene Klasse",
    "W Offene Klasse",
    "M Ü60",
    "Ü30",
    "Ü35",
    "Ü40",
    "Ü45",
    "Ü50",
    "Ü55",
    "Ü60",
]


def _non_negative_int_column():
    """Return a fresh integer column definition that rejects values below zero."""
    return pa.Column(pa.Int, checks=pa.Check.greater_than_or_equal_to(0))


# Element-wise check: a name must not contain any digit.
_name_without_digits = pa.Check(
    lambda s: not bool(re.search(r"\d", s)),
    element_wise=True,
    error="Contains numbers",
)

# Element-wise check: the value must start with HH:MM:SS.d
_time_shaped = pa.Check(
    lambda x: re.match(r"\d{2}:\d{2}:\d{2}\.\d", x) is not None,
    element_wise=True,
)

# Schema for the cleaned (not yet featurized) results table; "Nr" must be unique.
schema = pa.DataFrameSchema(
    {
        "Pos": _non_negative_int_column(),
        "Nr": _non_negative_int_column(),
        "Name": pa.Column(pa.String, checks=_name_without_digits),
        "Zeit": pa.Column(pa.String, checks=_time_shaped),
        "Kat": pa.Column(pa.String, checks=pa.Check.isin(Kats)),
        "KPos": _non_negative_int_column(),
        "GPos": _non_negative_int_column(),
        "Nation": pa.Column(
            pa.String, checks=pa.Check.str_length(min_value=3, max_value=3)
        ),
        "Verein": pa.Column(pa.String),
    },
    unique=["Nr"],
)


In [42]:
import pandas as pd
# Read the file without treating any row as a header and label the nine columns 0..8;
# the real column names are assigned later by label_columns().
raw_df = pd.read_csv(INPUT_PATH, header = None, names=range(9))
# Preview the first rows of the untouched frame.
raw_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,W Offene Klasse,,,,,,,,
1,Pos Nr Name,Zeit,Kat,KPos,GPos,Nation,Verein,,
2,1. 50067 Maria Esfandiari,00:43:19.7,W Offene Klasse,1,975,,FUN FACTORY,,
3,2. 50184 Corinna Meyer,00:48:11.7,W Offene Klasse,2,1298,,RTC Treuhand GmbH & Co. KG,,
4,3. 4753 Elena Liliane Safta,00:53:28.5,W Offene Klasse,3,1592,,Giloy & Söhne,,


In [43]:
# Column names of the results table, in file order.
col_names = ["Pos", "Nr", "Name", "Zeit", "Kat", "KPos", "GPos", "Nation", "Verein"]


def label_columns(df):
    """Return a copy of ``df`` whose columns are renamed to ``col_names``.

    The input frame is left untouched so that the raw frame keeps its
    positional labels and the cell that calls this stays re-runnable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly ``len(col_names)`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the labelled columns.

    Raises
    ------
    ValueError
        If the number of columns does not match ``col_names``.
    """
    labelled = df.copy()
    # Assign a fresh list so the module-level col_names is never shared with the frame.
    labelled.columns = list(col_names)
    return labelled

In [44]:
import numpy as np
def clean_dashes_to_nans(df):
    """Return a copy of ``df`` in which cells equal to ``"-"`` become NaN.

    Only cells whose whole value is a single dash are replaced; dashes that
    are part of a longer string are kept.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    return df.replace("-", np.nan)

In [45]:
# A race time such as "00:43:19.7": HH:MM:SS, a literal dot, then one digit.
# The dot is escaped so that it cannot match an arbitrary character.
time_pattern = r"\d{2}:\d{2}:\d{2}\.\d"


def clean_squished_name_time(df):
    """Separate a time that was appended to the name in the third column.

    The third column (position 2) is split at its last space. Where the part
    after that space starts with a time, the part before it is written to
    "Name" and the time to "Zeit". Other rows are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Name" and "Zeit" columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    third_column = df.iloc[:, 2]
    # Nothing to split when the column holds no values at all
    # (this also avoids the .str accessor failing on an all-NaN float column).
    if third_column.isna().all():
        return df

    name_time = third_column.str.rsplit(" ", n=1, expand=True)
    # If no value contains a space, the split yields a single column and
    # there is no trailing token that could be a time.
    if 1 not in name_time.columns:
        return df

    # Rows whose trailing token begins with a time.
    mask = name_time[1].str.match(time_pattern, na=False)
    df.loc[mask, "Name"] = name_time[0]
    # Write the extracted times into the existing "Zeit" column for those rows.
    df.loc[mask, "Zeit"] = name_time[1]
    return df

In [46]:
def clean_split_name_time_namecol(df):
    """Move a time embedded anywhere in "Name" into "Zeit".

    For every row whose "Name" contains ``time_pattern``, the first match is
    copied to "Zeit" and all matches are removed from "Name". The remaining
    name is whitespace-normalized so that no stray or doubled spaces are left
    where the time used to be.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Name" and "Zeit" columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    mask = df["Name"].str.contains(time_pattern, regex=True, na=False)
    affected_names = df.loc[mask, "Name"]
    df.loc[mask, "Zeit"] = affected_names.str.extract(fr"({time_pattern})", expand=False)
    df.loc[mask, "Name"] = (
        affected_names.str.replace(time_pattern, "", regex=True)
        # Cutting the time out can leave trailing or doubled spaces behind.
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return df


In [47]:
def clean_time_in_nr(df):
    """Repair rows in which "Pos", "Nr" and "Name" were merged into "Pos".

    Such rows are recognised by a time showing up in the "Nr" column. For
    them, every value from "Nr" onwards is moved two columns to the right,
    and the merged "Pos" text is split on spaces into position, start number
    and name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame carrying the nine labelled result columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    # Cast to str first so that a purely numeric "Nr" column does not break
    # the .str accessor; NaN becomes "nan" and simply does not match.
    mask = df["Nr"].astype(str).str.contains(time_pattern, regex=True, na=False)
    if not mask.any():
        # No shifted rows: nothing to do, and "Pos" may not even hold strings.
        return df

    # Order matters: each target is filled before its own value is needed as
    # a source further down the list (right-most columns first).
    replacements = [
        ("Verein", "GPos"),
        ("Nation", "KPos"),
        ("GPos", "Kat"),
        ("KPos", "Zeit"),
        ("Kat", "Name"),
        ("Zeit", "Nr"),
    ]
    for target, source in replacements:
        df.loc[mask, target] = df.loc[mask, source]

    # "<pos> <nr> <name...>" -> at most three parts; the name keeps its spaces.
    parts = df.loc[mask, "Pos"].str.split(" ", n=2)
    df.loc[mask, "Nr"] = parts.str[1].str.strip()
    df.loc[mask, "Name"] = parts.str[2:].str.join("").str.strip()
    df.loc[mask, "Pos"] = parts.str[0].str.strip()
    return df

In [48]:
def clean_drop_nans_and_label_rows(df):
    """Drop duplicate rows, mostly-empty rows and repeated header rows.

    A row is kept only if it has at least ``len(df.columns) - 5`` non-missing
    values and none of its cells (converted to text) contains "GPos", which
    marks a header line repeated inside the data.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the original index labels of the kept
        rows. Returning a copy lets later steps assign into it without
        writing to a slice of another frame.
    """
    deduped = df.drop_duplicates()
    filled = deduped.dropna(thresh=len(deduped.columns) - 5)

    # Column-wise substring test instead of a Python-level apply per row.
    is_header_row = pd.Series(False, index=filled.index)
    for _, values in filled.items():
        is_header_row |= values.astype(str).str.contains("GPos", regex=False)

    return filled[~is_header_row].copy()

In [49]:
def clean_move_nation_to_verein(df):
    """Copy "Nation" into "Verein" for rows that have a nation but no club.

    The frame is modified in place and returned; "Nation" itself is not changed.
    """
    has_nation = df["Nation"].notna()
    lacks_club = df["Verein"].isna()
    rows_to_fill = has_nation & lacks_club
    df.loc[rows_to_fill, "Verein"] = df.loc[rows_to_fill, "Nation"]
    return df


In [50]:
def clean_convert_pos(df):
    """Turn "Pos", "GPos" and "KPos" into integers, encoding DNF rows as zero.

    "Pos" is reduced to the text before its first dot. Rows whose position
    reads "DNF" get the time "00:00:00.0" and a zero in all three rank
    columns. The frame is modified in place and returned.
    """
    df["Pos"] = df["Pos"].str.split(".").str[0].str.strip()

    did_not_finish = df["Pos"] == "DNF"
    df.loc[did_not_finish, "Zeit"] = "00:00:00.0"
    for rank_column in ("GPos", "KPos", "Pos"):
        df.loc[did_not_finish, rank_column] = 0

    for rank_column in ("Pos", "GPos", "KPos"):
        df[rank_column] = df[rank_column].astype(int)

    return df

In [51]:
def clean_convert_nr(df):
    """Cast the "Nr" column to int, in place, and return the frame."""
    start_numbers = df["Nr"].astype(int)
    df["Nr"] = start_numbers
    return df

In [52]:
def clean_dedash_name(df):
    """Remove a single trailing dash from names that end with one.

    For names ending in "-", the last character is dropped and the remainder
    is stripped of surrounding whitespace. All other names, including missing
    or non-string values, are left exactly as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Name" column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    # na=False keeps missing names out of the mask instead of raising.
    ends_with_dash = df["Name"].str.endswith("-", na=False)
    df.loc[ends_with_dash, "Name"] = (
        df.loc[ends_with_dash, "Name"].str[:-1].str.strip()
    )
    return df

In [53]:
def clean_spliced_names_times(df):
    """Untangle names and times whose characters were interleaved.

    Applies to rows with a non-zero "Pos" and a missing "Zeit". Every numeric
    character, colon and dot found in "Name" is collected, in order, into
    "Zeit"; the remaining characters stay in "Name". The frame is modified in
    place and returned.
    """

    def _belongs_to_time(char):
        return char.isnumeric() or char in (":", ".")

    needs_repair = (df["Pos"] != 0) & df["Zeit"].isna()
    tangled = df.loc[needs_repair, "Name"]

    df.loc[needs_repair, "Zeit"] = tangled.apply(
        lambda text: "".join(filter(_belongs_to_time, text))
    )
    df.loc[needs_repair, "Name"] = tangled.apply(
        lambda text: "".join(ch for ch in text if not _belongs_to_time(ch))
    )
    return df

In [54]:
def clean_empty_nation(df):
    """Set "Nation" to "GER" where it is missing or merely repeats "Verein".

    The frame is modified in place and returned.
    """
    nation = df["Nation"]
    duplicates_club = nation == df["Verein"]
    needs_default = duplicates_club | nation.isna()
    df.loc[needs_default, "Nation"] = "GER"
    return df

In [55]:
def clean_verein_na(df):
    """Replace missing "Verein" values with the literal string "None".

    The frame is modified in place and returned.
    """
    club_missing = df["Verein"].isna()
    df.loc[club_missing, "Verein"] = "None"
    return df

In [56]:
raw_df.sample(5, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6,7,8
763,651.0,4985,Gerhard Zobel 00:32:30.0,,offene Klasse,651,1082,Hydro Extrusion Deutschland GmbH,
2991,921.0,2953,Sarah Appel,00:50:58.4,offene Klasse,921,1447,BKK firmus,
4631,112.0,7212,Benny Rievers 00:34:23.2,,Ü40,112,1570,ZARM Fallturm-Betriebsgesellschaft,
3509,1207.0,5895,Iris Schalk,01:04:52.0,offene Klasse,1207,1903,Nehlsen GmbH & Co. KG,
3384,2213.0,5145,Murat Demir,00:58:14.3,offene Klasse,2213,3592,Joh. Gottfr. Schütte GmbH & Co. KG,


# Cleaning initial data structure

In [57]:
# Apply the cleaning steps in order, then renumber the remaining rows from zero.
clean_df = (
    raw_df.pipe(label_columns)
    .pipe(clean_dashes_to_nans)
    .pipe(clean_squished_name_time)
    .pipe(clean_split_name_time_namecol)
    .pipe(clean_time_in_nr)
    .pipe(clean_drop_nans_and_label_rows)
    .pipe(clean_move_nation_to_verein)
    .pipe(clean_convert_pos)
    .pipe(clean_convert_nr)
    .pipe(clean_dedash_name)
    .pipe(clean_spliced_names_times)
    .pipe(clean_empty_nation)
    .pipe(clean_verein_na)
    .reset_index(drop=True)
)
clean_df.head()

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein
0,1,50067,Maria Esfandiari,00:43:19.7,W Offene Klasse,1,975,GER,FUN FACTORY
1,2,50184,Corinna Meyer,00:48:11.7,W Offene Klasse,2,1298,GER,RTC Treuhand GmbH & Co. KG
2,3,4753,Elena Liliane Safta,00:53:28.5,W Offene Klasse,3,1592,GER,Giloy & Söhne
3,4,50180,Katja Krömer,00:56:44.3,W Offene Klasse,4,1736,GER,ZARM Fab
4,1,50181,Jannes Jacss,00:29:43.7,M Offene Klasse,1,476,GER,Broetje-Automation GmbH


In [58]:
clean_df.sample(5)

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein
1995,1531,5196,Adem Wojcinska,00:40:00.5,offene Klasse,1531,2580,GER,Jürgen Klose Industrietechnik GmbH
4699,167,488,Lars Neddermann,00:38:29.5,Ü40,167,2370,GER,DESMA Schuhmaschinen GmbH
3069,967,6096,Katharina Eigner,00:51:40.5,offene Klasse,967,1502,GER,Open Reply
4040,312,3212,Marco Mann,00:47:36.2,Ü30,312,3210,GER,Bremer Tresor
1477,1206,681,Felix Sonntag,00:36:35.5,offene Klasse,1206,2028,GER,FR. FASSMER GmbH & Co. KG


In [59]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5719 entries, 0 to 5718
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pos     5719 non-null   int64 
 1   Nr      5719 non-null   int64 
 2   Name    5719 non-null   object
 3   Zeit    5719 non-null   object
 4   Kat     5719 non-null   object
 5   KPos    5719 non-null   int64 
 6   GPos    5719 non-null   int64 
 7   Nation  5719 non-null   object
 8   Verein  5719 non-null   object
dtypes: int64(4), object(5)
memory usage: 402.2+ KB


In [60]:
clean_df.describe(include="all")

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein
count,5719.0,5719.0,5719,5719,5719,5719.0,5719.0,5719,5719
unique,,,5240,4618,11,,,1,362
top,,,No Name,00:00:00.0,offene Klasse,,,GER,ArcelorMittal Bremen GmbH
freq,,,460,17,3573,,,5719,150
mean,633.404092,4453.3943,,,,633.404092,1537.228886,,
std,648.469488,6150.007726,,,,648.469488,1016.943,,
min,0.0,1.0,,,,0.0,0.0,,
25%,97.0,1908.5,,,,97.0,693.0,,
50%,342.0,3763.0,,,,342.0,1408.0,,
75%,1055.5,5678.5,,,,1055.5,2267.5,,


# Validation and Probing

In [61]:
schema.validate(clean_df)

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein
0,1,50067,Maria Esfandiari,00:43:19.7,W Offene Klasse,1,975,GER,FUN FACTORY
1,2,50184,Corinna Meyer,00:48:11.7,W Offene Klasse,2,1298,GER,RTC Treuhand GmbH & Co. KG
2,3,4753,Elena Liliane Safta,00:53:28.5,W Offene Klasse,3,1592,GER,Giloy & Söhne
3,4,50180,Katja Krömer,00:56:44.3,W Offene Klasse,4,1736,GER,ZARM Fab
4,1,50181,Jannes Jacss,00:29:43.7,M Offene Klasse,1,476,GER,Broetje-Automation GmbH
...,...,...,...,...,...,...,...,...,...
5714,23,133,Nicola Wißmann-Voß,00:59:48.5,Ü60,23,1820,GER,Allianz Private Krankenversicherung
5715,24,2816,Annette Fischer,01:03:47.8,Ü60,24,1897,GER,AWO Bremen
5716,25,2849,Bozena Smolka,01:04:40.5,Ü60,25,1899,GER,AWO Bremen
5717,26,1282,Gabi Klockgether,01:09:57.0,Ü60,26,1928,GER,Max-Planck-Institut Bremen


# Featurizing for Smarter Data

In [62]:
# Feature columns added by featurize(), with their expected dtypes.
new_columns = {
    column_name: pa.Column(column_dtype)
    for column_name, column_dtype in [
        ("DNF", pa.Bool),
        ("VName", pa.String),
        ("FName", pa.String),
        ("NoName", pa.Bool),
        ("NoFName", pa.Bool),
        ("MTeam", pa.Bool),
    ]
}

# Extend the base schema with the feature columns and re-declare "Zeit" as a
# float limited to the range 0 .. 12 * 3600.
full_schema = schema.add_columns(new_columns).update_column(
    "Zeit", dtype=pa.Float, checks=[pa.Check.between(0, 12*3600)]
)

In [63]:
def convert_time_from_str(df):
    """Return a copy of ``df`` with "Zeit" converted to seconds as float.

    The input frame is not modified. This keeps the cell re-runnable: if the
    column were converted in place, a second run would pass floats to
    ``pd.to_timedelta``, which interprets bare numbers as nanoseconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose "Zeit" column holds values ``pd.to_timedelta`` can parse,
        e.g. "00:43:19.7".

    Returns
    -------
    pandas.DataFrame
        New frame with "Zeit" as total seconds.
    """
    seconds = pd.to_timedelta(df["Zeit"]).dt.total_seconds()
    return df.assign(Zeit=seconds)

In [64]:
def featurize(df, team_name="MERENTIS GmbH"):
    """Derive the feature columns and return a new frame sorted by category and time.

    Added columns, in this order:

    - "DNF":     "Pos" equals 0.
    - "VName":   all whitespace-separated parts of "Name" except the last.
    - "FName":   the last whitespace-separated part of "Name".
    - "NoName":  the name is exactly "No" + "Name".
    - "NoFName": "FName" is "Noname", or "NoName" is set.
    - "MTeam":   "Verein" equals ``team_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Pos", "Name", "Verein", "Kat" and "Zeit" columns. It is
        not modified.
    team_name : str, optional
        Club name flagged in "MTeam".

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ("Kat", "Zeit") with a fresh 0..n-1 index.
    """
    name_parts = df["Name"].str.split()
    # .str.join propagates missing names as NaN instead of raising like ' '.join would.
    first_names = name_parts.str[:-1].str.join(" ").str.strip()
    last_names = name_parts.str[-1].str.strip()
    is_placeholder = (first_names == "No") & (last_names == "Name")

    featured = df.assign(
        DNF=df["Pos"] == 0,
        VName=first_names,
        FName=last_names,
        NoName=is_placeholder,
        NoFName=(last_names == "Noname") | is_placeholder,
        MTeam=df["Verein"] == team_name,
    )
    return featured.sort_values(by=["Kat", "Zeit"]).reset_index(drop=True)

In [65]:
# Convert the times to seconds first, then derive the feature columns.
timed_df = convert_time_from_str(clean_df)
full_df = featurize(timed_df)

In [66]:
full_df.head()

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam
0,1,50181,Jannes Jacss,1783.7,M Offene Klasse,1,476,GER,Broetje-Automation GmbH,False,Jannes,Jacss,False,False,False
1,2,50060,Carsten Smidt,1842.6,M Offene Klasse,2,661,GER,,False,Carsten,Smidt,False,False,False
2,3,50065,Marc Sowinski,1852.7,M Offene Klasse,3,695,GER,DPolG Bremen,False,Marc,Sowinski,False,False,False
3,4,50064,Bernhard Soika,1853.4,M Offene Klasse,4,698,GER,DPolG Bremen,False,Bernhard,Soika,False,False,False
4,5,50066,Jannis Fischer,1888.2,M Offene Klasse,5,837,GER,DPolG Bremen,False,Jannis,Fischer,False,False,False


In [67]:
full_schema.validate(full_df)

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam
0,1,50181,Jannes Jacss,1783.7,M Offene Klasse,1,476,GER,Broetje-Automation GmbH,False,Jannes,Jacss,False,False,False
1,2,50060,Carsten Smidt,1842.6,M Offene Klasse,2,661,GER,,False,Carsten,Smidt,False,False,False
2,3,50065,Marc Sowinski,1852.7,M Offene Klasse,3,695,GER,DPolG Bremen,False,Marc,Sowinski,False,False,False
3,4,50064,Bernhard Soika,1853.4,M Offene Klasse,4,698,GER,DPolG Bremen,False,Bernhard,Soika,False,False,False
4,5,50066,Jannis Fischer,1888.2,M Offene Klasse,5,837,GER,DPolG Bremen,False,Jannis,Fischer,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5714,23,133,Nicola Wißmann-Voß,3588.5,Ü60,23,1820,GER,Allianz Private Krankenversicherung,False,Nicola,Wißmann-Voß,False,False,False
5715,24,2816,Annette Fischer,3827.8,Ü60,24,1897,GER,AWO Bremen,False,Annette,Fischer,False,False,False
5716,25,2849,Bozena Smolka,3880.5,Ü60,25,1899,GER,AWO Bremen,False,Bozena,Smolka,False,False,False
5717,26,1282,Gabi Klockgether,4197.0,Ü60,26,1928,GER,Max-Planck-Institut Bremen,False,Gabi,Klockgether,False,False,False


In [68]:
full_df.describe(include="all")

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam
count,5719.0,5719.0,5719,5719.0,5719,5719.0,5719.0,5719,5719,5719,5719,5719,5719,5719,5719
unique,,,5240,,11,,,1,362,2,1627,3964,2,2,2
top,,,No Name,,offene Klasse,,,GER,ArcelorMittal Bremen GmbH,False,No,Name,False,False,False
freq,,,460,,3573,,,5719,150,5702,460,461,5259,5250,5703
mean,633.404092,4453.3943,,2418.386501,,633.404092,1537.228886,,,,,,,,
std,648.469488,6150.007726,,606.256221,,648.469488,1016.943,,,,,,,,
min,0.0,1.0,,0.0,,0.0,0.0,,,,,,,,
25%,97.0,1908.5,,1992.1,,97.0,693.0,,,,,,,,
50%,342.0,3763.0,,2286.3,,342.0,1408.0,,,,,,,,
75%,1055.5,5678.5,,2752.7,,1055.5,2267.5,,,,,,,,


In [69]:
def _load_class_frame(csv_path, split_squished=True):
    """Read one per-gender results file and clean it far enough to trust "Nr".

    Parameters
    ----------
    csv_path : str
        CSV file laid out like the main results file (nine columns, no header).
    split_squished : bool, optional
        Whether to run ``clean_squished_name_time`` as part of the pipeline.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with an integer "Nr" column.
    """
    frame = (
        pd.read_csv(csv_path, header=None, names=range(9))
        .pipe(label_columns)
        .pipe(clean_dashes_to_nans)
    )
    if split_squished:
        frame = frame.pipe(clean_squished_name_time)
    return (
        frame.pipe(clean_split_name_time_namecol)
        .pipe(clean_time_in_nr)
        .pipe(clean_drop_nans_and_label_rows)
        .pipe(clean_convert_nr)
    )


men_df = _load_class_frame("classes_m.csv")
men_nr = men_df["Nr"]

# NOTE(review): the squished name/time step was commented out for this file in
# the original cell; the reason is not recorded - confirm whether it is still needed.
women_df = _load_class_frame("classes_w.csv", split_squished=False)
women_nr = women_df["Nr"]

In [70]:
# Start numbers found in the men's file map to "M", those in the women's file
# to "W"; np.select takes the first matching condition, anything else is "U".
in_men_file = full_df["Nr"].isin(men_df["Nr"])
in_women_file = full_df["Nr"].isin(women_df["Nr"])

conditions = [in_men_file, in_women_file]
choices = ["M", "W"]

full_df["Geschlecht"] = np.select(conditions, choices, default="U")
full_df.to_parquet(OUTPUT_PATH)

In [71]:
full_df.head(20)

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam,Geschlecht
0,1,50181,Jannes Jacss,1783.7,M Offene Klasse,1,476,GER,Broetje-Automation GmbH,False,Jannes,Jacss,False,False,False,M
1,2,50060,Carsten Smidt,1842.6,M Offene Klasse,2,661,GER,,False,Carsten,Smidt,False,False,False,M
2,3,50065,Marc Sowinski,1852.7,M Offene Klasse,3,695,GER,DPolG Bremen,False,Marc,Sowinski,False,False,False,M
3,4,50064,Bernhard Soika,1853.4,M Offene Klasse,4,698,GER,DPolG Bremen,False,Bernhard,Soika,False,False,False,M
4,5,50066,Jannis Fischer,1888.2,M Offene Klasse,5,837,GER,DPolG Bremen,False,Jannis,Fischer,False,False,False,M
5,6,50049,Bjoern Bischoff,2303.4,M Offene Klasse,6,2349,GER,,False,Bjoern,Bischoff,False,False,False,M
6,7,50178,Fabian Skok,2306.5,M Offene Klasse,7,2361,GER,DCON,False,Fabian,Skok,False,False,False,M
7,8,50182,Thomas Dennis,2496.0,M Offene Klasse,8,2779,GER,Broetje-Automation GmbH,False,Thomas,Dennis,False,False,False,M
8,9,50185,Thomas Blome,2749.2,M Offene Klasse,9,3098,GER,NORD/LB,False,Thomas,Blome,False,False,False,M
9,10,50186,Dennis Thomas,3199.2,M Offene Klasse,10,3458,GER,Broetje-Automation GmbH,False,Dennis,Thomas,False,False,False,M


In [72]:
full_df[(full_df["Geschlecht"] == "U") & (~full_df["Verein"].str.contains("Fruitful")) & (~full_df["DNF"])]

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam,Geschlecht
1345,3,1291,Andrea Nebel,2150.0,offene Klasse,3,3,GER,Max-Planck-Institut Bremen,False,Andrea,Nebel,False,False,False,U
1500,4,7178,Helge Deeg,2203.0,offene Klasse,4,4,GER,#OneXylem,False,Helge,Deeg,False,False,False,U
2259,10,7180,Jochen Meyer,2526.2,offene Klasse,10,11,GER,#OneXylem,False,Jochen,Meyer,False,False,False,U
2717,11,50183,Schapn Noname,2811.4,offene Klasse,11,13,GER,ISW-MBH,False,Schapn,Noname,False,True,False,U
2741,12,2083,Mats Thieme,2824.6,offene Klasse,12,14,GER,adesso SE,False,Mats,Thieme,False,False,False,U
2765,13,2958,Dmitrij Gridunov,2849.1,offene Klasse,13,15,GER,BKK firmus,False,Dmitrij,Gridunov,False,False,False,U
3190,20,7179,Thorben Lange,3209.5,offene Klasse,20,24,GER,#OneXylem,False,Thorben,Lange,False,False,False,U
3192,21,7181,Sören Mons,3210.2,offene Klasse,21,25,GER,#OneXylem,False,Sören,Mons,False,False,False,U
3255,22,1670,Felicia Weiß,3301.8,offene Klasse,22,26,GER,SEGHORN GmbH,False,Felicia,Weiß,False,False,False,U
3983,1,1275,Jana Geuer,2509.5,Ü30,1,10,GER,Max-Planck-Institut Bremen,False,Jana,Geuer,False,False,False,U


In [75]:
# Women in the open class, ordered by overall rank.
in_open_class = full_df["Kat"] == "offene Klasse"
is_woman = full_df["Geschlecht"] == "W"
ok_df = (
    full_df[in_open_class & is_woman]
    .sort_values(by='GPos', ascending=True)
    .reset_index(drop=True)
)

# Rows whose time is lower than that of the row ranked directly before them.
previous_time = ok_df['Zeit'].shift(1)
jumps_ok = ok_df[ok_df['Zeit'] < previous_time]
jumps_ok

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam,Geschlecht
365,358,3936,Katharina Sprick,1817.3,offene Klasse,358,570,GER,Deutsche Factoring Bank,False,Katharina,Sprick,False,False,False,W
757,726,3919,Alke Hegeler,1981.7,offene Klasse,726,1208,GER,Deutsche Factoring Bank,False,Alke,Hegeler,False,False,False,W


In [81]:
ok_df[(ok_df['GPos'].isin(jumps_ok["GPos"]- 1))].sort_values('GPos')

Unnamed: 0,Pos,Nr,Name,Zeit,Kat,KPos,GPos,Nation,Verein,DNF,VName,FName,NoName,NoFName,MTeam,Geschlecht
364,365,1133,Svetlana Kotelnikova,2304.2,offene Klasse,365,569,GER,MAERSK,False,Svetlana,Kotelnikova,False,False,False,W
756,756,267,Neele Reiners,2801.7,offene Klasse,756,1207,GER,ATLAS ELEKTRONIK GmbH,False,Neele,Reiners,False,False,False,W
