In [None]:
import pandas as pd
from pathlib import Path

In [None]:
def split_time_series(
        csv_path,
        date_col,
        uid_col,
        y_col,
        train_end,
        val_end,
        test_end,
        start_date=None,
        end_date=None):
    """
    Load a long-format time-series CSV and split it per entity into
    train / val / test windows.

    The CSV is read inside the function so the same call can be reused
    across notebooks without sharing an in-memory DataFrame.

    Parameters
    ----------
    csv_path : str or Path
        Path to a CSV file containing the long-format time-series table.

    date_col : str
        Name of the timestamp column. Values are parsed with
        ``pd.to_datetime(..., errors="coerce")``; unparseable values become
        ``NaT`` and never match any window, so those rows are dropped from
        every split.

    uid_col : str
        Name of the entity ID column used to group the rows.

    y_col : str
        Name of the target variable column.

    train_end : str or Timestamp
        Last date (inclusive) of the training window.

    val_end : str or Timestamp
        Last date (inclusive) of the validation window.

    test_end : str or Timestamp
        Last date (inclusive) of the test window.

    start_date : optional str or Timestamp
        If given, keep only rows dated on or after this date.
        ``None`` (default) applies no lower bound.

    end_date : optional str or Timestamp
        If given, keep only rows dated on or before this date.
        ``None`` (default) applies no upper bound.

    Returns
    -------
    dict
        {
            "train": { uid : DataFrame },
            "val":   { uid : DataFrame },
            "test":  { uid : DataFrame }
        }
        Each DataFrame holds the columns ``[date_col, y_col]`` sorted by
        date. An entity appears only if it has at least one row in all
        three windows.

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` does not exist.

    Notes
    -----
    The validation window starts at the first month-begin strictly after
    ``train_end`` and the test window at the first month-begin strictly
    after ``val_end`` (``pd.offsets.MonthBegin(1)``). Rows dated between a
    window end and the following month start therefore belong to no split.
    """

    # --------------------------
    # Load CSV
    # --------------------------
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    df = pd.read_csv(csv_path)

    # Ensure datetime format
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    # --------------------------
    # Trimming
    # --------------------------
    # Only trim when a bound is supplied: pd.Timestamp(None) is NaT and any
    # comparison against NaT is False, which would silently drop every row.
    if start_date is not None:
        df = df[df[date_col] >= pd.Timestamp(start_date)]
    if end_date is not None:
        df = df[df[date_col] <= pd.Timestamp(end_date)]

    # --------------------------
    # Define Boundaries
    # --------------------------
    train_end = pd.Timestamp(train_end)
    val_end = pd.Timestamp(val_end)
    test_end = pd.Timestamp(test_end)

    # Each later window opens on the first month start after the previous
    # window's end date.
    val_start = train_end + pd.offsets.MonthBegin(1)
    test_start = val_end + pd.offsets.MonthBegin(1)

    # --------------------------
    # Splitting per UID
    # --------------------------
    splits = {"train": {}, "val": {}, "test": {}}
    keep_cols = [date_col, y_col]

    for uid, g in df.groupby(uid_col):
        g = g.sort_values(date_col).reset_index(drop=True)
        dates = g[date_col]

        g_train = g[dates <= train_end]
        g_val = g[(dates >= val_start) & (dates <= val_end)]
        g_test = g[(dates >= test_start) & (dates <= test_end)]

        # Skip entities that are missing from any of the three windows.
        if g_train.empty or g_val.empty or g_test.empty:
            continue

        splits["train"][uid] = g_train[keep_cols]
        splits["val"][uid] = g_val[keep_cols]
        splits["test"][uid] = g_test[keep_cols]

    return splits
