In [1]:
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.graph_objects import Figure
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

**Preprocessing**

In [3]:
from typing import Tuple

import pandas as pd
import plotly.graph_objects as go
import streamlit as st


def load_gps(
    file_path: str = "data/players_data/marc_cucurella/CFC GPS Data.csv",
    encoding: str = "ISO-8859-1",
    season: str = "2023/2024",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load and preprocess GPS tracking data for training and matches.

    Args:
        file_path: Path to the GPS data CSV file
        encoding: Character encoding of the CSV file

    Returns:
        Tuple containing:
            - Complete DataFrame with all GPS data
            - Filtered DataFrame with only active training/match days (distance > 0)
    """
    df = pd.read_csv(file_path, encoding=encoding)

    df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
    df = df[df["season"] == season]
    # Define heart rate zone columns
    hr_columns = [
        "hr_zone_1_hms",
        "hr_zone_2_hms",
        "hr_zone_3_hms",
        "hr_zone_4_hms",
        "hr_zone_5_hms",
    ]

    for col in hr_columns:
        df[f"{col}_seconds"] = df[col].apply(
            lambda x: (
                sum(
                    int(part) * (60**i)
                    for i, part in enumerate(reversed(str(x).split(":")))
                )
                if pd.notna(x) and x != "00:00:00"
                else 0
            )
        )

    # Add useful derived columns for analysis
    df["is_match_day"] = df["md_plus_code"] == 0
    df["week_num"] = ((df["date"] - df["date"].min()).dt.days // 7) + 1
    df["day_name"] = df["date"].dt.day_name()

    df_active = df[df["distance"] > 0].copy()

    return df, df_active


def load_physical_capabilities(
    file_path: str = "data/players_data/marc_cucurella/CFC Physical Capability Data.csv",
    season: str = "2023/2024",
) -> pd.DataFrame:
    """
    Load and preprocess physical capabilities assessment data.

    Args:
        file_path: Path to the physical capabilities CSV file
        preview: If True, display a preview of the data in the Streamlit app

    Returns:
        DataFrame with preprocessed physical capabilities data
    """
    df = pd.read_csv(file_path)

    df["testDate"] = pd.to_datetime(df["testDate"], format="%d/%m/%Y")
    df["benchmarkPct"] = pd.to_numeric(df["benchmarkPct"], errors="coerce")

    if season == "2023/2024":
        df = df.loc[
            (df["testDate"] >= pd.Timestamp("01/07/2023"))
            & (df["testDate"] <= pd.Timestamp("30/06/2024"))
        ]
    elif season == "2024/2025":
        df = df.loc[df["testDate"] >= pd.Timestamp("01/07/2024")]
    df = df.sort_values("testDate")

    return df


def load_recovery_status(
    file_path: str = "data/players_data/marc_cucurella/CFC Recovery status Data.csv",
    season: str = "2023/2024",
) -> pd.DataFrame:
    """
    Load and preprocess player recovery status data.

    Args:
        file_path: Path to the recovery status CSV file

    Returns:
        DataFrame with preprocessed recovery status data
    """
    df = pd.read_csv(file_path)
    df = df[df["seasonName"] == season]

    # Convert date strings to datetime objects
    df["sessionDate"] = pd.to_datetime(df["sessionDate"], format="%d/%m/%Y")

    df = df.sort_values("sessionDate")

    df = df.dropna(subset=["value"])

    df["value"] = pd.to_numeric(df["value"], errors="coerce")

    # Add temporal grouping columns for analysis
    df["week"] = df["sessionDate"].dt.isocalendar().week
    df["month"] = df["sessionDate"].dt.month_name()

    # Extract and categorize different metric types
    df["metric_type"] = df["metric"].apply(
        lambda x: (
            "completeness"
            if "completeness" in x
            else ("composite" if "composite" in x else "score")
        )
    )

    # Clean up metric names by removing type suffixes
    df["base_metric"] = df["metric"].apply(
        lambda x: x.replace("_baseline_completeness", "")
        .replace("_baseline_composite", "")
        .replace("_baseline_score", "")
    )

    return df


def load_priority(path: str, encoding: str = "ISO-8859-1") -> pd.DataFrame:
    """
    Loads a CSV file into a Pandas DataFrame.

    :param path: Path to the CSV file.
    :param encoding: Encoding format for reading the file (default: "ISO-8859-1").
    :return: DataFrame containing the loaded data.
    """
    df = pd.read_csv(path, encoding=encoding)
    return df


In [5]:

# Consistent color palette
COLORS = {
    "primary": "#1A237E",  # Dark blue
    "secondary": "#004D40",  # Dark green
    "accent1": "#311B92",  # Deep purple
    "accent2": "#01579B",  # Dark cyan
    "accent3": "#33691E",  # Dark lime
    "text": "#212121",  # Almost black text
}
QUALITATIVE_PALETTE = [
    COLORS["primary"],
    COLORS["secondary"],
    COLORS["accent1"],
    COLORS["accent2"],
    COLORS["accent3"],
]
TEMPLATE = "plotly_white"
COMMON_MARGINS = dict(l=50, r=50, t=80, b=50)



In [6]:
df, df_active = load_gps(
    "data/players_data/marc_cucurella/CFC GPS Data.csv",
    season=st.session_state.selected_season,
    )

df_filtered = df[df["distance"] > 0]
df_matches = df_filtered[df_filtered["opposition_code"].notna()]
df_trainings = df_filtered[df_filtered["opposition_code"].isna()]



AttributeError: st.session_state has no attribute "selected_season". Did you forget to initialize it? More info: https://docs.streamlit.io/develop/concepts/architecture/session-state#initialization