In [None]:
import pandas as pd
import tabula

# Locations of the three result PDFs that load_tables() reads.
# NOTE(review): these are absolute paths — presumably a container layout; confirm
# before running elsewhere.
MEN_PATH = "/app/fixed_results_m.pdf"
WOMEN_PATH = "/app/results_w.pdf"
MIXED_PATH = "/app/results_mixed.pdf"


def load_tables(path):
    """Read the tables found in the PDF at ``path`` with tabula.

    All pages are requested and tabula's ``stream`` mode is switched on. The
    result is returned exactly as tabula produces it; the rest of this notebook
    indexes and iterates it as a sequence of DataFrames.
    """
    extracted = tabula.read_pdf(path, pages="all", stream=True)
    return extracted

In [None]:
# Extract the raw tables from each of the three result PDFs.
men_tables = load_tables(MEN_PATH)
women_tables = load_tables(WOMEN_PATH)
mixed_tables = load_tables(MIXED_PATH)

In [None]:
def clean_split_name_from_time(df):
    """Split a fused "name + time" column into a name column and a time column.

    Meant for tables in which the time was glued onto the name, e.g.
    ``'Firstname-Lastname22:31:02.3'``, so the table is one column short.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column at position 2 holds the fused text. The frame is
        left untouched, so the function can safely be called again on the
        same input.

    Returns
    -------
    pandas.DataFrame
        A new frame in which column 2 holds the text with the time removed and
        surrounding whitespace stripped, and the extracted time (NaN where none
        was found) sits at position 3. Columns that were at position 3 and
        later move one place to the right.
    """
    # Time in hh:mm:ss.x format.
    time_pattern = r"\d{2}:\d{2}:\d{2}\.\d"

    # Work on a copy: inserting into the caller's frame would make a second
    # call fail on the already-present column label.
    cleaned = df.copy()
    fused = cleaned.iloc[:, 2]
    times = fused.str.extract(f"({time_pattern})", expand=False)
    names = fused.str.replace(time_pattern, "", regex=True).str.strip()

    # isetitem swaps the whole column by position without an in-place write.
    cleaned.isetitem(2, names)
    # Label the new column with the current column count (5 for a five-column
    # table) and place it directly after the name column.
    cleaned.insert(3, cleaned.shape[1], times)
    return cleaned


def clean_move_name_from_time(df):
    """Move a time that is glued onto the name into the existing time column.

    Meant for tables in which column 2 looks like
    ``'Firstname-Lastname22:31:02.3'`` while column 3 already exists but does
    not hold a usable time.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column at position 2 holds the fused text and whose column
        at position 3 is to be overwritten. The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels. Column 3 holds the extracted
        time (NaN where none was found); column 2 holds the text with the time
        removed and surrounding whitespace stripped.
    """
    # Time in hh:mm:ss.x format.
    time_pattern = r"\d{2}:\d{2}:\d{2}\.\d"

    # Work on a copy so the caller's frame is not modified behind its back.
    cleaned = df.copy()
    fused = cleaned.iloc[:, 2]
    # expand=False yields a Series, which is what a single column needs.
    times = fused.str.extract(f"({time_pattern})", expand=False)
    names = fused.str.replace(time_pattern, "", regex=True).str.strip()

    # isetitem swaps whole columns by position, so it does not matter what
    # dtype the overwritten time column had (e.g. all-NaN floats).
    cleaned.isetitem(3, times)
    cleaned.isetitem(2, names)
    return cleaned


def load_run_df(tables):
    """Combine the tables extracted from one results PDF into a single frame.

    The column labels of the first table are used for every table. For each
    table the existing header is pushed down into the data as row 0, the table
    is repaired if name and time were fused, the shared labels are applied, and
    rows that merely repeat those labels are removed.

    Parameters
    ----------
    tables : sequence of pandas.DataFrame
        Extracted tables; must contain at least one element, and the first one
        must include the columns ``"GPos"`` and ``"Zeit"``.

    Returns
    -------
    pandas.DataFrame
        All rows with a fresh 0..n-1 index, without the ``"GPos"`` column and
        without rows whose ``"Zeit"`` is missing.

    Side effects
    ------------
    Prints two summary lines with row and NaN counts.
    """
    columns = tables[0].columns
    # Time in hh:mm:ss.x format, as it appears when glued onto a name.
    fused_time = r"\d{2}:\d{2}:\d{2}\.\d"

    pages = []
    for df in tables:
        # Turn the header into the first data row; columns become 0..n-1.
        df = df.T.reset_index().T.reset_index(drop=True)
        # Name and time shoved together, missing time column
        if len(df.columns) != len(columns):
            df = clean_split_name_from_time(df)
        # Name and time shoved together, time column NaN or garbage data.
        # Look for an actual time in the first row rather than any ":" —
        # an auto-generated header such as "Unnamed: 2" also contains a colon
        # and would otherwise get a valid time column overwritten. Casting to
        # str keeps a non-string cell (e.g. NaN) from raising.
        elif df.iloc[:1, 2].astype(str).str.contains(fused_time, regex=True).any():
            df = clean_move_name_from_time(df)
        df.columns = columns
        # Keep a row only if at least one cell differs from its column label.
        df = df.loc[df.ne(df.columns).any(axis=1)]
        pages.append(df)

    # Concatenate once instead of growing a frame inside the loop.
    run_df = pd.concat(pages, ignore_index=True).drop(columns=["GPos"])
    rows_with_nan = int(run_df.isna().any(axis=1).sum())
    print(f"Processed {len(run_df)} total entries, {rows_with_nan} have NaNs")

    run_df = run_df.dropna(subset=["Zeit"])
    rows_with_nan = int(run_df.isna().any(axis=1).sum())
    print(f"After dropping time NaNs {rows_with_nan} NaNs left")
    return run_df

In [None]:
# Build one combined frame per results PDF; each call prints its own summary.
men_run_df = load_run_df(men_tables)
women_run_df = load_run_df(women_tables)
mixed_run_df = load_run_df(mixed_tables)

In [None]:
# Show the men's rows that still contain a NaN in at least one column.
men_run_df[men_run_df.isna().any(axis=1)]

In [None]:
# Preview the first rows of the mixed results.
mixed_run_df.head()

In [None]:
# Preview the first rows of the men's results.
men_run_df.head()

In [None]:
# Preview the first rows of the women's results.
women_run_df.head()

In [None]:
def clean_int_col(df, colname):
    """Convert a column of integer-like values to ``int``, in place.

    Values are turned into text, surrounding whitespace and trailing dots are
    removed, and a trailing all-zero decimal part (``"12.0"``, ``"12.00"``) is
    dropped before the conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[colname]`` is overwritten.
    colname : str
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.

    Raises
    ------
    ValueError
        If a value still is not a valid integer literal after cleaning
        (for example ``"nan"`` or ``"1.5"``).
    """
    as_text = df[colname].astype(str).str.strip().str.rstrip(".")
    # Escape the dot and anchor to the end of the string. A bare ".0" is
    # removed anywhere in the value when taken literally ("10.00" -> "100"),
    # and matches "any character followed by 0" when taken as a regex
    # ("100" -> "0"); which of the two applies depends on the pandas version.
    df[colname] = as_text.str.replace(r"\.0+$", "", regex=True).astype(int)
    return df


def clean_time(df):
    """Replace ``df["Zeit"]`` with its duration expressed in seconds.

    The column is parsed with ``pd.to_timedelta`` and overwritten in place with
    the total number of seconds as floats. The same ``df`` object is returned.
    """
    durations = pd.to_timedelta(df["Zeit"])
    seconds = durations.dt.total_seconds()
    df["Zeit"] = seconds
    return df


def clean_run_df(df):
    """Type-clean a run frame and drop every row that contains a NaN.

    ``"Pos"`` and ``"Nr"`` are converted to integers, ``"Zeit"`` is converted
    to seconds, and finally rows with any missing value are removed.
    """
    # Keep this order: the integer columns first, the time column afterwards.
    # The original author saw errors in other columns when the time was
    # cleaned first.
    for int_col in ("Pos", "Nr"):
        df = clean_int_col(df, int_col)
    timed = clean_time(df)
    return timed.dropna()

In [None]:
# Clean both frames, then tag each with its gender as a categorical column.
# NOTE(review): re-running this cell feeds the already-cleaned frames back into
# clean_run_df — confirm that is safe before executing it twice.
men_run_df = clean_run_df(men_run_df)
women_run_df = clean_run_df(women_run_df)
for frame, sex in ((men_run_df, "M"), (women_run_df, "W")):
    frame["Geschlecht"] = sex
    frame["Geschlecht"] = frame["Geschlecht"].astype("category")

In [None]:
# Preview the men's results after cleaning and gender tagging.
men_run_df.head()

In [None]:
# Preview the women's results after cleaning and gender tagging.
women_run_df.head()

In [None]:
# Stack men and women into one frame with a fresh index, then make the gender
# column categorical again on the combined frame.
gendered_frames = [men_run_df, women_run_df]
combined_run_df = pd.concat(gendered_frames, ignore_index=True)
gender_column = combined_run_df["Geschlecht"]
combined_run_df["Geschlecht"] = gender_column.astype("category")

In [None]:
# Split "Name" on whitespace: everything but the last token becomes "VName",
# the last token becomes "FName".
name_parts = combined_run_df["Name"].str.split()
leading_tokens = name_parts.str[:-1].apply(" ".join).str.strip()
last_token = name_parts.str[-1].str.strip()

combined_run_df["VName"] = leading_tokens
combined_run_df["FName"] = last_token

# Flag the placeholder "No Name", and any entry whose last token is "Noname".
is_no_name = (combined_run_df["VName"] == "No") & (combined_run_df["FName"] == "Name")
combined_run_df["NoName"] = is_no_name
combined_run_df["NoFName"] = (combined_run_df["FName"] == "Noname") | (
    combined_run_df["NoName"]
)

# Flag rows whose company is exactly "MERENTIS GmbH".
is_merentis = combined_run_df["Firma"] == "MERENTIS GmbH"
combined_run_df["MTeam"] = is_merentis
combined_run_df["MTeam"] = combined_run_df["MTeam"].astype(bool)

# Order by time, smallest first, and renumber the index.
combined_run_df = combined_run_df.sort_values(by="Zeit").reset_index(drop=True)

In [None]:
# Preview the first 20 rows of the combined frame.
combined_run_df.head(20)

In [None]:
# Men only, ordered by position. A "jump" is a row whose time is smaller than
# the time of the row directly before it.
is_male = combined_run_df["Geschlecht"] == "M"
men_df = (
    combined_run_df.loc[is_male]
    .sort_values(by="Pos", ascending=True)
    .reset_index(drop=True)
)
predecessor_slower = men_df["Zeit"].shift(1) > men_df["Zeit"]
jumps_men = men_df.loc[predecessor_slower].set_index("Pos")
jumps_men

In [None]:
# Time of the row directly before each jump, taken from the same row-to-row
# comparison that defines a jump. Looking the predecessor up via "Pos - 1"
# instead silently misaligns the subtraction below whenever a position is
# missing (rows were dropped earlier) or occurs more than once.
previous_times = men_df["Zeit"].shift(1)
is_jump = men_df["Zeit"] < previous_times
prejump_times = previous_times[is_jump].reset_index(drop=True)
jump_times = jumps_men["Zeit"].reset_index(drop=True)
# One row; each column is the predecessor's time minus the jump row's time.
prejump_times.subtract(jump_times).to_frame().T

In [None]:
# Women only, ordered by position. A "jump" is a row whose time is smaller than
# the time of the row directly before it.
is_female = combined_run_df["Geschlecht"] == "W"
women_df = (
    combined_run_df.loc[is_female]
    .sort_values(by="Pos", ascending=True)
    .reset_index(drop=True)
)
predecessor_slower = women_df["Zeit"].shift(1) > women_df["Zeit"]
jumps_women = women_df.loc[predecessor_slower].set_index("Pos")
jumps_women

In [None]:
# Time of the row directly before each jump, taken from the same row-to-row
# comparison that defines a jump. Looking the predecessor up via "Pos - 1"
# instead silently misaligns the subtraction below whenever a position is
# missing (rows were dropped earlier) or occurs more than once.
previous_times = women_df["Zeit"].shift(1)
is_jump = women_df["Zeit"] < previous_times
prejump_times = previous_times[is_jump].reset_index(drop=True)
jump_times = jumps_women["Zeit"].reset_index(drop=True)
# One row; each column is the predecessor's time minus the jump row's time.
prejump_times.subtract(jump_times).to_frame().T

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
combined_run_df.info()

In [None]:
# Rows that still contain at least one NaN, for each of the three frames.
# Grouped into a single tuple so the cell output covers all three; written as
# three separate statements, only the last one would be displayed.
(
    combined_run_df[combined_run_df.isna().any(axis=1)],
    men_run_df[men_run_df.isna().any(axis=1)],
    women_run_df[women_run_df.isna().any(axis=1)],
)

In [None]:
# Persist the combined frame as Parquet, without writing the index.
combined_run_df.to_parquet("/app/combined.parquet", index=False)