In [None]:
from datetime import timedelta

import hiplot as hip
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

# Input locations. The file behind combined_path is read here and again further
# down (as full_classes_df).
combined_path = "./data/full_classes.parquet"
# NOTE(review): mixed_path is an absolute path and is not referenced by any
# visible cell — confirm it is still needed.
mixed_path = "/app/mixed.parquet"

# Main runner table used throughout the notebook.
combined_run_df = pd.read_parquet(combined_path)

In [None]:
# Preview the first rows to check column names and value formats.
combined_run_df.head()

In [None]:
# Column dtypes and non-null counts.
combined_run_df.info()

In [None]:
# Summary statistics of the full runner table.
combined_run_df.describe()

In [None]:
# Split the runners by their "Geschlecht" (gender) code. The closing expression
# shows whether the "M" and "W" subsets together account for every row.
men_df = combined_run_df.loc[combined_run_df["Geschlecht"].eq("M")]
women_df = combined_run_df.loc[combined_run_df["Geschlecht"].eq("W")]
len(men_df) + len(women_df) == len(combined_run_df)

In [None]:
# Summary statistics for the "M" subset only.
men_df.describe()

In [None]:
# Summary statistics for the "W" subset only.
women_df.describe()

In [None]:
# Raise the column display limit so the wide, transposed tables shown below are
# not truncated.
pd.set_option("display.max_columns", 1000)

In [None]:
def _distinct_name_counts(runners, key):
    """Count distinct "Name" values per `key` value, largest counts first.

    The result has two columns, `key` and "Name" (the count), sorted
    descending by count and then by `key`.
    """
    counts = runners.groupby(key)["Name"].nunique().to_frame()
    return counts.sort_values(by=["Name", key], ascending=False).reset_index()


vnames = _distinct_name_counts(combined_run_df, "VName")
fnames = _distinct_name_counts(combined_run_df, "FName")

# FName values that occur with more than one distinct "Name", one per column.
fnames[fnames["Name"] > 1].set_index("FName").T

In [None]:
# VName values that occur with more than one distinct "Name", one per column.
vnames[vnames["Name"] > 1].set_index("VName").T

In [None]:
# Histogram restricted to FName values shared by more than 4 distinct names.
# NOTE(review): the whole frame (including the FName column) is handed to
# histplot — confirm that only the numeric "Name" counts end up in the plot.
sns.histplot(fnames[fnames["Name"] > 4])

In [None]:
# Histogram restricted to VName values shared by more than 5 distinct names.
# NOTE(review): the FName histogram uses a threshold of 4 — confirm the
# difference is deliberate.
sns.histplot(vnames[vnames["Name"] > 5])

In [None]:
# Rows whose VName and FName are identical.
combined_run_df[combined_run_df["VName"] == combined_run_df["FName"]]

In [None]:
# Rows with VName "No" whose FName is not "Name".
# NOTE(review): presumably checks that the "No" / "Name" placeholder pair always
# appears together — confirm.
combined_run_df[
    (combined_run_df["VName"] == "No") & (combined_run_df["FName"] != "Name")
]

In [None]:
# The reverse check: rows with FName "Name" whose VName is not "No".
combined_run_df[
    (combined_run_df["VName"] != "No") & (combined_run_df["FName"] == "Name")
]

In [None]:
# Rows holding position 1 ("Pos" == 1).
combined_run_df[combined_run_df["Pos"] == 1]

In [None]:
# Ten smallest "Zeit" values among rows not flagged "DNF".
top_10_df = combined_run_df[~combined_run_df["DNF"]].nsmallest(10, "Zeit")
# Bar label: runner name with the "Verein" value in parentheses on a second line.
top_10_df["name_and_firma"] = top_10_df.apply(
    lambda x: f"{x['Name']}\n({x['Verein']})", axis=1
)

# Bar chart for top 10 finishers' times
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x="Zeit", y="name_and_firma", data=top_10_df, palette="Blues_d", errwidth=0
)
# Zoom in on the occupied range (30 s margin) so the small gaps are visible.
ax.set(xlim=(top_10_df["Zeit"].min() - 30, top_10_df["Zeit"].max() + 30))
plt.xlabel("Time (hh:mm:ss)")
plt.ylabel("Runner")
plt.title("Top 10 Finishers' Times")

# Render ticks as h:mm:ss through a formatter instead of set_xticklabels():
# fixed label strings are not tied to tick positions, so they go stale (and
# matplotlib warns) as soon as the locator picks different ticks.
ax.xaxis.set_major_formatter(
    FuncFormatter(lambda seconds, _pos: str(timedelta(seconds=int(seconds))))
)

plt.show()

In [None]:
# Number of runners per "Geschlecht" (gender) code, most frequent first.
gender_df = combined_run_df["Geschlecht"].value_counts()

# Bind colours to the gender code rather than to the slice order:
# value_counts() orders by frequency, so a positional colour list would swap
# colours whenever the larger group changes. Unknown codes fall back to gray.
gender_colors = {"M": "lightblue", "W": "lightpink"}

# Pie chart for gender distribution
plt.figure(figsize=(6, 6))
plt.pie(
    gender_df,
    labels=gender_df.index,
    autopct="%1.1f%%",
    startangle=120,
    colors=[gender_colors.get(code, "gray") for code in gender_df.index],
)
plt.axis("equal")
plt.title("Gender Distribution")
plt.show()

In [None]:
# Fresh, unmodified copy of the runner table. Reuses combined_path (same file as
# the previously hard-coded literal) so the location is defined in one place.
full_classes_df = pd.read_parquet(combined_path)

In [None]:
# Column dtypes and non-null counts of the freshly loaded table.
full_classes_df.info()

In [None]:
# Finish time per category ("Kat"), one strip-plot layer per gender drawn on the
# same axes: "M" in orange, "W" in a more transparent blue.
for gender_code, point_alpha, point_color in (
    ("M", 0.5, "orange"),
    ("W", 0.3, "blue"),
):
    sns.stripplot(
        x="Zeit",
        y="Kat",
        data=full_classes_df[full_classes_df["Geschlecht"] == gender_code],
        jitter=0.2,
        size=2,
        alpha=point_alpha,
        color=point_color,
    )

plt.xlabel("Finish Time (seconds)")
plt.ylabel("Category")
plt.title("B2Run Bremen - Finish Times")
plt.show()

In [None]:
# Rows holding position 1 ("Pos" == 1) in the freshly loaded table.
full_classes_df[full_classes_df["Pos"] == 1]

In [None]:
# "M" runners in category "Ü30", smallest "Zeit" first, with a fresh 0-based index.
full_classes_df[
    (full_classes_df["Geschlecht"] == "M") & (full_classes_df["Kat"] == "Ü30")
].sort_values(by="Zeit").reset_index(drop=True)

In [None]:
# Finish times per gender. Rows flagged "MTeam" are drawn as large blue markers,
# all other rows as small orange markers with wider jitter.
plot1 = sns.stripplot(
    x="Zeit",
    y="Geschlecht",
    data=combined_run_df[combined_run_df["MTeam"]],
    jitter=0.2,
    size=5,
    alpha=1,
    color="blue",
)
plot2 = sns.stripplot(
    x="Zeit",
    y="Geschlecht",
    data=combined_run_df[~combined_run_df["MTeam"]],
    jitter=0.45,
    size=1,
    alpha=1,
    color="orange",
)

plt.xlabel("Finish Time (seconds)")
plt.ylabel("Gender")
# Use an explicit proxy handle for the legend. Passing only a label list
# attaches "Merentis" to whichever artist happens to be first on the axes.
merentis_patch = mpatches.Patch(color="blue", label="Merentis")
plt.legend(handles=[merentis_patch])
plt.title("B2Run Bremen - Finish Times")
# A secondary "Position" axis built from "Pos" was tried here and dropped: per
# the original note it mixed the rankings of W and M on a single axis.
plt.show()

In [None]:
# Single strip of all finish times. Non-"MTeam" rows are drawn first as small
# orange markers so the large blue "MTeam" markers end up on top.
plot1 = sns.stripplot(
    x="Zeit",
    data=combined_run_df[~combined_run_df["MTeam"]],
    jitter=0.45,
    size=1,
    alpha=1,
    color="orange",
)
plot2 = sns.stripplot(
    x="Zeit",
    data=combined_run_df[combined_run_df["MTeam"]],
    jitter=0.07,
    size=5,
    alpha=1,
    color="blue",
)

plt.xlabel("Finish Time (seconds)")
# Proxy patch so the legend shows exactly one blue "Merentis" entry.
blue_patch = mpatches.Patch(color="blue", label="Merentis")
plt.legend(handles=[blue_patch])
plt.title("B2Run Bremen - Finish Times")
plt.show()

In [None]:
# Drop rows flagged "DNF". The .copy() makes the result an independent frame,
# so adding a column below does not write into a slice of the original
# (SettingWithCopyWarning).
combined_run_df = combined_run_df[~combined_run_df["DNF"]].copy()
# Rank across all remaining runners by time; ties share the lowest rank.
combined_run_df["Overall_Pos"] = (
    combined_run_df["Zeit"].rank(method="min", ascending=True).astype(int)
)
combined_run_df = combined_run_df.sort_values(by="Zeit").reset_index(drop=True)
# Independent copy as well, because later cells add columns to merentis_df.
merentis_df = combined_run_df[combined_run_df["MTeam"]].copy()
merentis_df.sort_values(by="Zeit")

In [None]:
# Time, overall position and name, transposed so each runner is one column.
combined_run_df[["Zeit", "Overall_Pos", "Name"]].T

In [None]:
# Build the bar label ("<Name> (Pos. <overall position>)") with .assign(), which
# returns a new frame; assigning the column in place would write into a
# filtered slice of combined_run_df and raise SettingWithCopyWarning.
merentis_df = merentis_df.assign(
    Name_Pos=merentis_df.apply(
        lambda x: f"{x['Name']} (Pos. {int(x['Overall_Pos'])})", axis=1
    )
)
merentis_df = merentis_df.sort_values(by="Zeit").reset_index(drop=True)

ax = sns.barplot(x="Zeit", y="Name_Pos", data=merentis_df, errwidth=0)
# Zoom in on the occupied range (30 s margin on both sides).
ax.set(xlim=(merentis_df["Zeit"].min() - 30, merentis_df["Zeit"].max() + 30))
plt.xlabel("Finish Time")
plt.ylabel("Name and Position")
plt.title("Race Finish Times")
plt.show()

In [None]:
# Overall position against race time, drawn on an explicitly created Axes.
pos_fig, pos_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=combined_run_df,
    x="Overall_Pos",
    y="Zeit",
    color="blue",
    linewidth=0,
    ax=pos_ax,
)
pos_ax.set_xlabel("Overall Position")
pos_ax.set_ylabel("Race Time (seconds)")
pos_ax.set_title("Overall Position vs Race Times")
plt.show()

In [None]:
# Default data for the curve fits: overall position (x) against time (y).
x_data = combined_run_df["Overall_Pos"].values
y_data = combined_run_df["Zeit"].values


def fit_pos_vs_time_curve(model_func, initial_guess, x=None, y=None):
    """Fit ``model_func`` to position/time data, plot the fit and print R^2.

    Parameters
    ----------
    model_func : callable
        Model of the form ``f(x, *params)``, as expected by
        ``scipy.optimize.curve_fit``.
    initial_guess : sequence of float
        Starting values for the parameters (passed as ``p0``). Every parameter
        is constrained to ``[0, inf)``, so the guess must be non-negative.
    x, y : array-like, optional
        Data to fit. When omitted, the module-level ``x_data`` / ``y_data``
        are used, which is the original behaviour.

    Returns
    -------
    None
        The function only produces a figure and prints the R^2 score.
    """
    x = x_data if x is None else np.asarray(x)
    y = y_data if y is None else np.asarray(y)

    optimized_params, _ = curve_fit(
        model_func, x, y, p0=initial_guess, bounds=(0, np.inf)
    )
    # Evaluate the fitted model once; it feeds both the plot and the score.
    fitted_curve = model_func(x, *optimized_params)

    # Draw the line in x order so unsorted input does not zig-zag. For input
    # that is already sorted the stable argsort is the identity.
    order = np.argsort(x, kind="stable")

    plt.scatter(x, y, label="Original Data", s=1)
    plt.plot(x[order], fitted_curve[order], color="red", label="Fitted Curve")
    plt.xlabel("Overall Position")
    plt.ylabel("Time")
    plt.title("Fitted Curve vs. Original Data")
    plt.legend()
    plt.show()

    r2 = r2_score(y, fitted_curve)
    print(f"R^2: {r2}")

In [None]:
def model_func(x, A, B, C, D, E):
    """Arcsine model: ``(A - B) * (arcsin(2x/C - 1) * D + E) + B``.

    The arcsine argument is clipped to [-1, 1], so values of ``x`` outside
    ``[0, C]`` evaluate like the nearest end of that interval.
    """
    clipped = np.clip(2 * x / C - 1, -1, 1)
    shape = np.arcsin(clipped) * D + E
    return (A - B) * shape + B


# Starting values for (A, B, C, D, E). With D = 1/pi and E = 0.5 the arcsine
# term spans [0, 1], so the model runs from B to A.
# NOTE(review): 6500 / 1150 / 5702 look like the maximum time, minimum time and
# number of finishers — confirm against the data.
initial_guess = [6500, 1150, 5702, 1 / np.pi, 0.5]
fit_pos_vs_time_curve(model_func=model_func, initial_guess=initial_guess)

In [None]:
def model_func(x, A, B, C, D, E):
    """Log-arcsine model: ``-A * log10(arcsin(-(x - C) / B) + D) + E``.

    The arcsine argument is clipped to [-1, 1]. This definition replaces the
    ``model_func`` defined in the previous cell.
    """
    clipped = np.clip(-1 / B * (x - C), -1, 1)
    log_term = np.log10(np.arcsin(clipped) + D)
    return -A * log_term + E


# Starting values for (A, B, C, D, E) of the log-arcsine model defined in this
# cell. All are positive, as the fit's lower bound of 0 requires.
initial_guess = [5640, 1870, 1870, 2.3, 4000]  # Initial parameter guess
fit_pos_vs_time_curve(model_func=model_func, initial_guess=initial_guess)

In [None]:
# Observed ranges as (min time, max time, min position, max position).
(
    combined_run_df["Zeit"].min(),
    combined_run_df["Zeit"].max(),
    combined_run_df["Overall_Pos"].min(),
    combined_run_df["Overall_Pos"].max(),
)

In [None]:
# Per-organization table used by the remaining cells.
sorted_companies = pd.read_parquet("./data/companies.parquet")

In [None]:
# Organizations with exactly one runner ("Läufer" == 1), transposed so each
# organization becomes a column.
sorted_companies[sorted_companies["Läufer"] == 1].loc[
    :, ["Firma", "Ds_Zeit", "Ds_Tempo", "Anzahl_Frauen"]
].set_index("Firma").T

In [None]:
# Distribution of the number of runners per organization.
sns.histplot(data=sorted_companies["Läufer"])

In [None]:
# Data for the stacked bar chart: among organizations with more than 10
# runners, sort ascending by team size and keep 14 rows from the large end.
# NOTE(review): the slice stops at -1, so the single largest organization is
# left out — confirm this is intended.
stacked_df = (
    sorted_companies[sorted_companies["Läufer"] > 10]
    .sort_values(by="Läufer")
    .iloc[-15:-1, :]
    .reset_index(drop=True)[["Firma", "Anzahl_Männer", "Anzahl_Frauen"]]
)

# Plot on an explicitly created Axes. DataFrame.plot opens its own figure
# unless `ax` is passed, which previously left the 12x6 figure empty and the
# requested size unused.
stacked_fig, stacked_ax = plt.subplots(figsize=(12, 6))
stacked_df.plot(kind="bar", x="Firma", stacked=True, ax=stacked_ax)
plt.xlabel("Organization")
plt.ylabel("Number of Participants")
plt.title("Number of Male and Female Participants by Organization")
plt.legend()
plt.xticks(rotation=90)
plt.show()

In [None]:
# The ten organizations with the highest average time ("Ds_Zeit"), highest
# first. nlargest() does its own ordering, so the earlier ascending sort and
# index reset had no effect on the selection and were removed.
bar_df = (
    sorted_companies[["Firma", "Ds_Zeit", "std"]]
    .nlargest(10, "Ds_Zeit")
    .reset_index(drop=True)
)

# Plot bar chart with error bars ("std" column as the error-bar length)
plt.figure(figsize=(12, 6))
plt.bar(bar_df["Firma"], bar_df["Ds_Zeit"], yerr=bar_df["std"], capsize=4)
plt.xlabel("Organization")
plt.ylabel("Average Race Time")
plt.title("Average Race Times with Error Bars by Organization")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Team size and average time per organization.
scatter_df = sorted_companies[["Läufer", "Ds_Zeit"]].copy()

# Plain scatter plot of team size against average time (no fitted line is drawn)
plt.figure(figsize=(10, 6))
sns.scatterplot(x="Läufer", y="Ds_Zeit", data=scatter_df)
plt.xlabel("Team Size")
plt.ylabel("Average Race Time")
plt.title("Scatter Plot: Number of People vs. Average Race Time")
plt.show()

In [None]:
# Organizations with more than 3 runners, ordered by "Anteil_Frauen" and then
# by team size, both descending. The reset index is the 0-based leaderboard rank.
women_perc_leaderboard = (
    sorted_companies[sorted_companies["Läufer"] > 3]
    .sort_values(by=["Anteil_Frauen", "Läufer"], ascending=False)
    .reset_index(drop=True)
)
women_perc_leaderboard.head()

In [None]:
# Where "MERENTIS GmbH" sits on that leaderboard (the index is its 0-based rank).
women_perc_leaderboard[women_perc_leaderboard["Firma"] == "MERENTIS GmbH"]

In [None]:
# Same leaderboard construction, ordered by "Anteil_Männer" and then by team
# size, both descending.
men_perc_leaderboard = (
    sorted_companies[sorted_companies["Läufer"] > 3]
    .sort_values(by=["Anteil_Männer", "Läufer"], ascending=False)
    .reset_index(drop=True)
)
men_perc_leaderboard.head()

In [None]:
# Where "MERENTIS GmbH" sits on that leaderboard (the index is its 0-based rank).
men_perc_leaderboard[men_perc_leaderboard["Firma"] == "MERENTIS GmbH"]

In [None]:
# All organizations ordered by "Ds_Tempo", smallest value first.
# NOTE(review): unlike the share leaderboards, no minimum team size is applied
# here — confirm that is intended.
tempo_lb = sorted_companies.sort_values(by="Ds_Tempo").reset_index(drop=True)
tempo_lb.head()

In [None]:
# Where "MERENTIS GmbH" sits in the "Ds_Tempo" ordering (index is its 0-based rank).
tempo_lb[tempo_lb["Firma"] == "MERENTIS GmbH"]

In [None]:
# Preview of the runner table in its current state (by now filtered, ranked and
# sorted by the cells above).
combined_run_df.head()

In [None]:
# Rows in which either name field holds the literal "Noname".
noname_mask = combined_run_df[["VName", "FName"]].eq("Noname").any(axis=1)
nonames = combined_run_df[noname_mask]
nonames.shape

In [None]:
# Rows whose VName is not purely alphabetic. `.ne(True)` is used instead of `~`
# because, for object-dtype strings, .str.isalpha() yields NaN for missing
# values and inverting that with `~` raises TypeError. With `.ne(True)` such
# rows are simply listed as non-alphabetic.
combined_run_df[combined_run_df["VName"].str.isalpha().ne(True)].head()

In [None]:
# Rows whose FName is not purely alphabetic. `.ne(True)` is used instead of `~`
# because, for object-dtype strings, .str.isalpha() yields NaN for missing
# values and inverting that with `~` raises TypeError. With `.ne(True)` such
# rows are simply listed as non-alphabetic.
combined_run_df[combined_run_df["FName"].str.isalpha().ne(True)].head()

In [None]:
# Runner table without the "Noname" rows. Dropping by index label relies on
# combined_run_df and nonames sharing the same index labels.
combined_names = combined_run_df.drop(nonames.index)
# Shapes after and before the drop, for comparison.
combined_names.shape, combined_run_df.shape

In [None]:
# Preview of the per-organization table.
sorted_companies.head()

In [None]:
# Interactive HiPlot view of the runner table, fed one dict per row.
hip.Experiment.from_iterable(combined_run_df.to_dict(orient="records")).display()

In [None]:
# Interactive HiPlot view of the per-organization table, fed one dict per row.
hip.Experiment.from_iterable(sorted_companies.to_dict(orient="records")).display()