In [None]:
import re

import numpy as np
import pandas as pd
import pandera as pa

# Source CSV read into raw_df, and the Parquet file the final frame is written to.
INPUT_PATH = "./data/full_classes.csv"
OUTPUT_PATH = "./data/full_classes.parquet"

In [None]:
# Every value the "Kat" (category) column may take; the schema rejects any other.
Kats = [
    "offene Klasse",
    "M Offene Klasse",
    "W Offene Klasse",
    "M Ü60",
    "Ü30",
    "Ü35",
    "Ü40",
    "Ü45",
    "Ü50",
    "Ü55",
    "Ü60",
]


def _at_least_zero():
    """Build a fresh check that accepts only values >= 0."""
    return pa.Check.greater_than_or_equal_to(0)


# Start-anchored HH:MM:SS.f shape required of every "Zeit" string; characters
# after the first fractional digit are not inspected.
_zeit_shape = re.compile(r"\d{2}:\d{2}:\d{2}\.\d")

# Schema for the cleaned frame, before time conversion and feature columns.
schema = pa.DataFrameSchema(
    {
        "Pos": pa.Column(pa.Int, checks=_at_least_zero()),
        "Nr": pa.Column(pa.Int, checks=[_at_least_zero()]),
        "Name": pa.Column(
            pa.String,
            checks=pa.Check(
                lambda s: not s.isnumeric(),
                element_wise=True,
                error="Name is numeric",
            ),
        ),
        "Zeit": pa.Column(
            pa.String,
            checks=pa.Check(
                lambda x: _zeit_shape.match(x) is not None,
                element_wise=True,
            ),
        ),
        "Kat": pa.Column(pa.String, checks=pa.Check.isin(Kats)),
        "KPos": pa.Column(pa.Int, checks=_at_least_zero()),
        "GPos": pa.Column(pa.Int, checks=_at_least_zero()),
        "Nation": pa.Column(
            pa.String, checks=pa.Check.str_length(min_value=3, max_value=3)
        ),
        "Verein": pa.Column(pa.String),
    },
    # Start numbers must not repeat.
    unique=["Nr"],
)

In [None]:
# Read the CSV with no header row and force nine positional columns (0-8);
# the real labels are attached later by label_columns.
raw_df = pd.read_csv(INPUT_PATH, header=None, names=range(9))
raw_df.head()

In [None]:
# Labels, in positional order, for the nine columns of the raw CSV.
col_names = ["Pos", "Nr", "Name", "Zeit", "Kat", "KPos", "GPos", "Nation", "Verein"]


def label_columns(df):
    """Return a copy of ``df`` whose columns are labelled with ``col_names``.

    The input frame is left untouched, so the raw frame keeps its positional
    column labels and the cell calling this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly ``len(col_names)`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the labels from ``col_names``.

    Raises
    ------
    ValueError
        If the number of columns does not match ``len(col_names)``.
    """
    labelled = df.copy()
    # Assign a fresh list so the frame never shares the module-level list.
    labelled.columns = list(col_names)
    return labelled

In [None]:
def clean_dashes_to_nans(df):
    """Return a new frame in which every cell equal to "-" is replaced by NaN.

    Only cells whose whole value is the single character "-" are replaced;
    dashes inside longer strings are kept.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return df.replace("-", np.nan)

In [None]:
# HH:MM:SS.f time. The dot is escaped so that only a literal "." separates the
# seconds from the fraction, in line with the regex used by the "Zeit" schema check.
time_pattern = r"\d{2}:\d{2}:\d{2}\.\d"


def clean_squished_name_time(df):
    """Split "<name> <time>" cells of the third column into "Name" and "Zeit".

    The third column (position 2) is split once at its last space. Where the
    trailing token starts with a time matching ``time_pattern``, the leading
    part is written to "Name" and the trailing token to "Zeit". All other rows
    are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Name" and "Zeit" columns and string values in the column
        at position 2. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for use with ``DataFrame.pipe``.
    """
    # Split once from the right: everything before the last space is the
    # candidate name, the last token is the candidate time.
    name_time = df.iloc[:, 2].str.rsplit(" ", n=1, expand=True)
    if name_time.shape[1] < 2:
        # No cell contained a space, so there is no trailing token to inspect.
        return df
    mask = name_time[1].str.match(time_pattern, na=False)
    df.loc[mask, "Name"] = name_time.loc[mask, 0]
    df.loc[mask, "Zeit"] = name_time.loc[mask, 1]
    return df

In [None]:
def clean_split_name_time_namecol(df):
    """Move a time found anywhere inside "Name" into the "Zeit" column.

    For rows whose "Name" contains ``time_pattern``, the first match is stored
    in "Zeit" and every match is removed from "Name". Other rows are left
    unchanged. The frame is modified in place and returned.
    """
    has_time = df["Name"].str.contains(time_pattern, regex=True, na=False)
    affected_names = df.loc[has_time, "Name"]

    # Compute both pieces from the untouched names before writing anything back.
    extracted_time = affected_names.str.extract(rf"({time_pattern})", expand=False)
    remaining_name = affected_names.str.replace(time_pattern, "", regex=True)

    df.loc[has_time, "Zeit"] = extracted_time
    df.loc[has_time, "Name"] = remaining_name
    return df

In [None]:
def clean_time_in_nr(df):
    """Repair rows whose "Nr" cell holds a time, i.e. rows shifted two columns left.

    For rows where "Nr" contains ``time_pattern``, every value from "Nr" to
    "GPos" is copied two columns to the right, and the "Pos" cell is split on
    spaces into position, start number and name. The frame is modified in
    place and returned.
    """
    mask = df["Nr"].str.contains(time_pattern, regex=True, na=False)
    # (target, source) pairs, listed from the rightmost target backwards so
    # each source column is read before it is itself overwritten.
    replacements = [
        ("Verein", "GPos"),
        ("Nation", "KPos"),
        ("GPos", "Kat"),
        ("KPos", "Zeit"),
        ("Kat", "Name"),
        ("Zeit", "Nr"),
    ]
    for col1, col2 in replacements:
        df.loc[mask, col1] = df.loc[mask, col2]

    # "Pos" still holds the combined "<pos> <nr> <name...>" text here:
    # second token -> "Nr", everything after the second space -> "Name".
    df.loc[mask, "Nr"] = df.loc[mask, "Pos"].str.split(" ", n=2).str[1].str.strip()
    df.loc[mask, "Name"] = (
        df.loc[mask, "Pos"].str.split(" ", n=2).str[2:].str.join("").str.strip()
    )
    # Overwrite "Pos" last, because the two assignments above read from it.
    df.loc[mask, "Pos"] = df.loc[mask, "Pos"].str.split(" ", n=1).str[0].str.strip()
    return df

In [None]:
def clean_drop_nans_and_label_rows(df):
    """Drop duplicate rows, mostly-empty rows and repeated header rows.

    A row is kept only if it has at least ``len(df.columns) - 5`` non-missing
    values and none of its cells, rendered as text, contains "GPos" (the
    marker for a header line repeated inside the data).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A filtered frame that keeps the original index labels; ``df`` itself
        is not modified.
    """
    df = df.drop_duplicates()
    df = df.dropna(thresh=len(df.columns) - 5)

    # Build the header-row mask column by column. This avoids a Python-level
    # call per row and also yields a proper boolean mask for an empty frame.
    is_label_row = np.zeros(len(df), dtype=bool)
    for _, cells in df.items():
        hits = cells.astype(str).str.contains("GPos", regex=False)
        is_label_row |= hits.to_numpy(dtype=bool)
    return df[~is_label_row]

In [None]:
def clean_move_nation_to_verein(df):
    """Copy "Nation" into "Verein" for rows that have a nation but no club.

    The "Nation" value itself is left in place. The frame is modified in
    place and returned.
    """
    nation_without_club = df["Nation"].notna() & df["Verein"].isna()
    df.loc[nation_without_club, "Verein"] = df.loc[nation_without_club, "Nation"]
    return df

In [None]:
def clean_convert_pos(df):
    """Normalise "Pos" and cast "Pos", "GPos" and "KPos" to integers.

    "Pos" keeps only the text before its first "." (whitespace stripped).
    Rows whose "Pos" is then "DNF" get the time "00:00:00.0" and 0 in all
    three position columns. The frame is modified in place and returned.
    """
    pos_text = df["Pos"].str.split(".").str[0]
    df["Pos"] = pos_text.str.strip()

    # Capture the DNF rows before "Pos" is overwritten with 0 below.
    did_not_finish = df["Pos"] == "DNF"
    df.loc[did_not_finish, "Zeit"] = "00:00:00.0"
    for column in ("GPos", "KPos", "Pos"):
        df.loc[did_not_finish, column] = 0

    for column in ("Pos", "GPos", "KPos"):
        df[column] = df[column].astype(int)

    return df

In [None]:
def clean_convert_nr(df):
    """Cast the "Nr" column to integers, in place, and return the frame."""
    nr_as_int = df["Nr"].astype(int)
    df["Nr"] = nr_as_int
    return df

In [None]:
def clean_dedash_name(df):
    """Remove a single trailing "-" (and the whitespace before it) from "Name".

    Names that do not end in "-" are kept exactly as they are. Missing or
    other non-string values are passed through unchanged rather than raising.
    The frame is modified in place and returned.
    """

    def strip_trailing_dash(text):
        # Non-string cells (e.g. NaN) have no .endswith; leave them alone.
        if isinstance(text, str) and text.endswith("-"):
            return text[:-1].strip()
        return text

    df["Name"] = df["Name"].apply(strip_trailing_dash)
    return df

In [None]:
def clean_spliced_names_times(df):
    """Pull time characters out of "Name" for ranked rows that lack a time.

    For rows with a non-zero "Pos" and a missing "Zeit", every numeric
    character, ":" and "." in "Name" is concatenated into "Zeit", and the
    remaining characters form the new "Name". The frame is modified in place
    and returned.
    """

    def is_time_char(ch):
        return ch.isnumeric() or ch in (":", ".")

    def time_part(text):
        return "".join([ch for ch in text if is_time_char(ch)])

    def name_part(text):
        return "".join([ch for ch in text if not is_time_char(ch)])

    needs_time = (df["Pos"] != 0) & df["Zeit"].isna()
    # Fill "Zeit" first: both parts are derived from the still-spliced "Name".
    df.loc[needs_time, "Zeit"] = df.loc[needs_time, "Name"].apply(time_part)
    df.loc[needs_time, "Name"] = df.loc[needs_time, "Name"].apply(name_part)
    return df

In [None]:
def clean_empty_nation(df):
    """Set "Nation" to "GER" where it is missing or merely repeats "Verein".

    The frame is modified in place and returned.
    """
    repeats_club = df["Nation"].eq(df["Verein"])
    is_missing = df["Nation"].isna()
    df.loc[repeats_club | is_missing, "Nation"] = "GER"
    return df

In [None]:
def clean_verein_na(df):
    """Replace missing "Verein" values with the literal string "None".

    The frame is modified in place and returned.
    """
    club_missing = df["Verein"].isna()
    df.loc[club_missing, "Verein"] = "None"
    return df

In [None]:
# Show five rows of the raw frame; the fixed seed keeps the selection stable.
raw_df.sample(5, random_state=42)

# Cleaning initial data structure

In [None]:
# Run the cleaning steps in order. Two ordering constraints are visible in the
# steps themselves: clean_convert_pos must precede clean_spliced_names_times
# (which compares "Pos" with the integer 0), and clean_move_nation_to_verein
# must precede clean_empty_nation (which treats Nation == Verein as "no nation").
clean_df = (
    raw_df.pipe(label_columns)
    .pipe(clean_dashes_to_nans)
    .pipe(clean_squished_name_time)
    .pipe(clean_split_name_time_namecol)
    .pipe(clean_time_in_nr)
    .pipe(clean_drop_nans_and_label_rows)
    .pipe(clean_move_nation_to_verein)
    .pipe(clean_convert_pos)
    .pipe(clean_convert_nr)
    .pipe(clean_dedash_name)
    .pipe(clean_spliced_names_times)
    .pipe(clean_empty_nation)
    .pipe(clean_verein_na)
    # Rows were dropped along the way, so renumber the index from 0.
    .reset_index(drop=True)
)
clean_df.head()

In [None]:
# Seeded like the raw-data sample so the displayed rows are the same on every run.
clean_df.sample(5, random_state=42)

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
clean_df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones.
clean_df.describe(include="all")

# Validation and Probing

In [None]:
# Check the cleaned frame against `schema` (dtypes, value checks, unique "Nr").
schema.validate(clean_df)

# Featurizing for Smarter Data

In [None]:
# Allowed "Geschlecht" codes: M = male, W = female, U = unknown.
Geschlechter = ["M", "W", "U"]


def _non_numeric_text():
    """Build a fresh element-wise check that rejects purely numeric strings."""
    return pa.Check(
        lambda s: not s.isnumeric(),
        element_wise=True,
        error="Name is numeric",
    )


# Columns added by the featurizing step, on top of the base schema.
new_columns = {
    "DNF": pa.Column(pa.Bool),
    "NameLen": pa.Column(pa.Int, pa.Check.greater_than_or_equal_to(0)),
    "FirstLetterRank": pa.Column(pa.Int, pa.Check.between(0, 26)),
    "VName": pa.Column(pa.String, checks=_non_numeric_text()),
    "FName": pa.Column(pa.String, checks=_non_numeric_text()),
    "NoName": pa.Column(pa.Bool),
    "NoFName": pa.Column(pa.Bool),
    "MTeam": pa.Column(pa.Bool),
    "Geschlecht": pa.Column(pa.String, pa.Check.isin(Geschlechter)),
}

# Extend the base schema, then redefine "Zeit" as a float limited to
# 0 .. 12 * 3600 (the string-format check no longer applies to it).
full_schema = schema.add_columns(new_columns).update_column(
    "Zeit", dtype=pa.Float, checks=[pa.Check.between(0, 12 * 3600)]
)

In [None]:
def convert_time_from_str(df):
    """Return a new frame whose "Zeit" column holds durations in seconds.

    The input frame is not modified. Overwriting "Zeit" on the caller's frame
    would make a second run parse the float seconds again, and
    ``pd.to_timedelta`` interprets bare numbers as nanoseconds by default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Zeit" column of time strings such as "00:12:34.5".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with "Zeit" replaced by float seconds.
    """
    seconds = pd.to_timedelta(df["Zeit"]).dt.total_seconds()
    return df.assign(Zeit=seconds)

In [None]:
# 1-based alphabet position of each ASCII lowercase letter ("a" -> 1 ... "z" -> 26).
letter_rank = {
    letter: rank for rank, letter in enumerate("abcdefghijklmnopqrstuvwxyz", start=1)
}


def get_first_letter_rank(name):
    """Return the alphabet rank (1-26) of the first character of ``name``.

    The lookup is case-insensitive. 0 is returned when the first character is
    not one of a-z, and also when ``name`` is empty or not a string.
    """
    if not isinstance(name, str) or not name:
        return 0
    return letter_rank.get(name[0].lower(), 0)


def featurize(df):
    """Return a copy of ``df`` with derived feature columns, sorted by class and time.

    Added columns:

    - "DNF": "Pos" equals 0.
    - "NameLen": number of characters in "Name".
    - "FirstLetterRank": see ``get_first_letter_rank``.
    - "VName" / "FName": all whitespace-separated words of "Name" except the
      last, and the last word.
    - "NoName": the name is exactly "No Name".
    - "NoFName": the last word is "Noname", or "NoName" is true.
    - "MTeam": "Verein" equals "MERENTIS GmbH".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Pos", "Name", "Verein", "Kat" and "Zeit" columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by "Kat" then "Zeit", with a fresh 0-based index.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    df["DNF"] = df["Pos"] == 0
    df["NameLen"] = df["Name"].str.len()
    df["FirstLetterRank"] = df["Name"].apply(get_first_letter_rank)

    name_parts = df["Name"].str.split()
    df["VName"] = name_parts.str[:-1].apply(" ".join).str.strip()
    df["FName"] = name_parts.str[-1].str.strip()

    df["NoName"] = (df["VName"] == "No") & (df["FName"] == "Name")
    df["NoFName"] = (df["FName"] == "Noname") | (df["NoName"])
    df["MTeam"] = (df["Verein"] == "MERENTIS GmbH").astype(bool)
    return df.sort_values(by=["Kat", "Zeit"]).reset_index(drop=True)

In [None]:
# Convert "Zeit" to seconds first, then derive the feature columns from the result.
full_df = clean_df.pipe(convert_time_from_str).pipe(featurize)

In [None]:
def _load_class_frame(csv_path, split_squished_names):
    """Read one per-gender CSV and clean it far enough to trust its "Nr" column.

    ``split_squished_names`` controls whether clean_squished_name_time is
    part of the chain; every other step is the same for both files.
    """
    frame = (
        pd.read_csv(csv_path, header=None, names=range(9))
        .pipe(label_columns)
        .pipe(clean_dashes_to_nans)
    )
    if split_squished_names:
        frame = frame.pipe(clean_squished_name_time)
    return (
        frame.pipe(clean_split_name_time_namecol)
        .pipe(clean_time_in_nr)
        .pipe(clean_drop_nans_and_label_rows)
        .pipe(clean_convert_nr)
    )


men_df = _load_class_frame("./data/classes_m.csv", split_squished_names=True)
men_nr = men_df["Nr"]

# The women's file skips the squished-name step; it was disabled in the
# original pipeline and the reason is not recorded here.
women_df = _load_class_frame("./data/classes_w.csv", split_squished_names=False)
women_nr = women_df["Nr"]

In [None]:
# Tag each start number by the list it appears in. np.select takes the first
# matching condition, so a number present in both lists is labelled "M";
# numbers in neither list get "U".
in_men_list = full_df["Nr"].isin(men_df["Nr"])
in_women_list = full_df["Nr"].isin(women_df["Nr"])

conditions = [in_men_list, in_women_list]
choices = ["M", "W"]

full_df["Geschlecht"] = np.select(conditions, choices, default="U")

In [None]:
# Check the featurized frame against the extended schema.
full_schema.validate(full_df)

In [None]:
# First rows of the featurized frame.
full_df.head()

In [None]:
# Summary statistics for every column, not only the numeric ones.
full_df.describe(include="all")

In [None]:
# Write the featurized frame to OUTPUT_PATH as Parquet.
full_df.to_parquet(OUTPUT_PATH)

In [None]:
# Finishers whose gender is still unknown and whose club name does not
# contain "Fruitful".
full_df.loc[
    full_df["Geschlecht"].eq("U")
    & ~full_df["Verein"].str.contains("Fruitful")
    & ~full_df["DNF"]
]

In [None]:
# Women in the "offene Klasse" category, ordered by overall position.
ok_df = (
    full_df.loc[
        lambda d: d["Kat"].eq("offene Klasse") & d["Geschlecht"].eq("W")
    ]
    .sort_values(by="GPos", ascending=True)
    .reset_index(drop=True)
)
# Rows whose time is lower than that of the row ranked directly before them.
previous_time = ok_df["Zeit"].shift(1)
jumps_ok = ok_df.loc[ok_df["Zeit"] < previous_time]
jumps_ok

In [None]:
# Rows whose "GPos" is exactly one less than the "GPos" of a row in jumps_ok.
ok_df.loc[ok_df["GPos"].isin(jumps_ok["GPos"] - 1)].sort_values("GPos")