In [None]:
import pandas as pd

# Location of the combined results CSV that every later cell works from.
combined_csv = "/app/combined.csv"
# Load the CSV with pandas' default parsing options.
combined_run_df = pd.read_csv(filepath_or_buffer=combined_csv)

In [None]:
# Number of distinct "Nr" values per "Firma", used as the runner count of each
# company. reset_index() turns "Firma" back into a regular column.
company_counts = (
    combined_run_df.groupby("Firma")["Nr"]
    .nunique()
    .reset_index()
)
# Order companies from most to fewest runners, label the count column "Läufer",
# and renumber the rows 0..n-1 in that order.
sorted_companies = (
    company_counts.sort_values(by="Nr", ascending=False)
    .rename(columns={"Nr": "Läufer"})
    .reset_index(drop=True)
)

In [None]:
# Per-company statistics of "Zeit": minimum, maximum, mean and standard
# deviation. Only the first three are renamed; "std" keeps its pandas name.
summary_stats = (
    combined_run_df.groupby("Firma")["Zeit"]
    .agg(["min", "max", "mean", "std"])
    .rename(columns={"min": "Min_Zeit", "max": "Max_Zeit", "mean": "Ds_Zeit"})
)

# Row counts per company and "Geschlecht" value, pivoted to one column per
# value (companies without a given value get 0).
gender_counts = (
    combined_run_df.groupby("Firma")["Geschlecht"]
    .value_counts()
    .unstack(fill_value=0)
)
# unstack() only creates columns for values that actually occur. Make sure both
# expected columns exist so the share computation below cannot raise KeyError.
for gender_code in ("W", "M"):
    if gender_code not in gender_counts.columns:
        gender_counts[gender_code] = 0
gender_counts = gender_counts.rename(
    columns={"W": "Anzahl_Frauen", "M": "Anzahl_Männer"}
)

# Start from the two key columns only. This cell rebinds `sorted_companies`, so
# merging into the already-enriched frame on a re-run would produce duplicated
# "_x"/"_y" columns and break the "Ds_Zeit" lookup below. Selecting the key
# columns is a no-op on the first run and makes every later run idempotent.
sorted_companies = (
    sorted_companies[["Firma", "Läufer"]]
    .merge(summary_stats, on="Firma")
    .merge(gender_counts, on="Firma")
)

# NOTE(review): presumably a 6.3 km course with "Zeit" in seconds, which would
# make this an average speed in km/h - confirm against the data source.
sorted_companies["Ds_Tempo"] = 6.3 * 3600 / sorted_companies["Ds_Zeit"]

# Share of each gender relative to the company's runner count.
# NOTE(review): the Anzahl_* columns count rows while "Läufer" counts distinct
# "Nr" values, so a share can exceed 1 if one "Nr" appears in several rows.
sorted_companies["Anteil_Männer"] = (
    sorted_companies["Anzahl_Männer"] / sorted_companies["Läufer"]
)
sorted_companies["Anteil_Frauen"] = (
    sorted_companies["Anzahl_Frauen"] / sorted_companies["Läufer"]
)

In [None]:
# Preview the first rows of the per-company summary.
sorted_companies.head()

In [None]:
# Print column names, non-null counts and dtypes of the per-company summary.
sorted_companies.info()

In [None]:
# Show companies with a missing value in any column other than "std".
# "std" is presumably left out because the sample standard deviation is NaN
# for a company with a single "Zeit" value, which is expected rather than a
# data problem.
sorted_companies.loc[
    lambda frame: frame.drop(columns="std").isna().any(axis="columns")
]

In [None]:
# Write the per-company summary to a Parquet file; index=False leaves the
# DataFrame index out of the written file.
sorted_companies.to_parquet("/app/companies.parquet", index=False)