# Many Model Training with Ray Tune

This template is a quickstart to using [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for many model training. Ray Tune is one of many libraries under the [Ray AI Runtime](air). See [this blog post](https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray) for more information on the benefits of performing many model training with Ray!

This template walks through time-series forecasting using `statsforecast` (with an `sklearn` metric for evaluation), but the framework and data format can be swapped out easily -- they are there just to help you build your own application!

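At a high level, this template will:

1. Define a training function that loads one partition of the data and cross-validates several forecasting models on it.
2. Define a Ray Tune search space with one entry per data partition.
3. Run the trials with Ray Tune and report the best model for each partition.
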
> Slot in your own code below with the ✂️ icon to build a many model training Ray application of your own!

In [13]:
import time
import itertools
import pandas as pd
import numpy as np
from collections import defaultdict
from statsforecast import StatsForecast
from statsforecast.models import ETS, AutoARIMA, _TS
from pyarrow import parquet as pq
from sklearn.metrics import mean_squared_error

import ray
from ray import air, tune

ModuleNotFoundError: No module named 'statsforecast'

In [None]:
def get_m5_partition(unique_id: str) -> pd.DataFrame:
    """Load the rows for a single item from the M5 training target table.

    Only the ``item_id``, ``timestamp`` and ``demand`` columns are read, and
    the read is filtered to rows whose ``item_id`` equals ``unique_id``. The
    columns are then renamed to ``unique_id``, ``ds`` and ``y``.

    Args:
        unique_id: The ``item_id`` value selecting the partition to load.

    Returns:
        A DataFrame with columns ``unique_id`` (as ``str``), ``ds`` (converted
        with ``pd.to_datetime``) and ``y``, with rows containing missing
        values dropped.
    """
    table = pq.read_table(
        "s3://anonymous@m5-benchmarks/data/train/target.parquet",
        columns=["item_id", "timestamp", "demand"],
        filters=[("item_id", "=", unique_id)],
    )
    renamed = table.to_pandas().rename(
        columns={"item_id": "unique_id", "timestamp": "ds", "demand": "y"}
    )
    # Normalise the id and timestamp columns before discarding incomplete rows.
    normalised = renamed.assign(
        unique_id=renamed["unique_id"].astype(str),
        ds=pd.to_datetime(renamed["ds"]),
    )
    return normalised.dropna()

# Load one item's partition as a quick check of the loader.
train_df = get_m5_partition("FOODS_1_001_CA_1")
# Bare expression: displayed as the cell output when run in a notebook.
train_df

In [None]:
from ray.air import Checkpoint, session

def evaluate_cross_validation(df, metric):
    """Score each model column of a cross-validation frame and pick the best.

    Every column of ``df`` other than ``ds``, ``cutoff`` and ``y`` is treated
    as a model's predictions. For each ``(unique_id, cutoff)`` group the
    metric is computed between ``y`` and the model column; the per-group
    scores are then averaged per ``unique_id``.

    Args:
        df: Cross-validation results containing ``ds``, ``cutoff`` and ``y``
            plus one column per model; ``unique_id`` must be usable as a
            groupby key.
        metric: Callable ``metric(y_true, y_pred)`` applied to the underlying
            arrays of each group.

    Returns:
        A DataFrame indexed by ``unique_id`` with one mean-score column per
        model and a ``best_model`` column naming the column with the lowest
        mean score.
    """
    model_names = list(df.drop(columns=['ds', 'cutoff', 'y']).columns)
    windows = df.groupby(['unique_id', 'cutoff'])

    # One single-column frame of per-window scores for each model.
    per_model_scores = [
        windows.apply(
            lambda window: metric(window['y'].values, window[name].values)
        ).to_frame(name)
        for name in model_names
    ]

    scores = pd.concat(per_model_scores, axis=1)
    scores = scores.groupby(['unique_id']).mean(numeric_only=True)
    # Lower is better: idxmin picks the column with the smallest mean score.
    scores['best_model'] = scores.idxmin(axis=1)
    return scores

def cross_validation(config: dict):
    """Cross-validate candidate forecasting models on one data partition.

    Loads the partition named by ``config["data_partition_id"]``, runs
    ``StatsForecast.cross_validation`` over a fixed list of candidate models,
    scores the results with mean squared error, and reports the best model
    through ``session.report``.

    Args:
        config: Trial configuration; must contain the key
            ``"data_partition_id"``, which is passed to ``get_m5_partition``.

    Raises:
        KeyError: If ``"data_partition_id"`` is missing from ``config``.
    """
    data_partition_id = config["data_partition_id"]
    train_df = get_m5_partition(data_partition_id)

    # Candidate models; the ETS variants differ in season length and model spec.
    models = [
        AutoARIMA(),
        ETS(season_length=6, model="ZNA"),
        ETS(season_length=7, model="ZNA"),
        ETS(season_length=6, model="ZZZ"),
        ETS(season_length=7, model="ZZZ"),
    ]
    forecast_horizon = 4
    n_windows = 2

    sf = StatsForecast(
        df=train_df,
        models=models,
        freq="D",
        # Same value as the number of cross-validation windows.
        n_jobs=n_windows,
    )
    cv_df = sf.cross_validation(
        h=forecast_horizon,
        # Step by the horizon length between consecutive windows.
        step_size=forecast_horizon,
        n_windows=n_windows,
    )

    eval_df = evaluate_cross_validation(df=cv_df, metric=mean_squared_error)
    # NOTE(review): this is the whole "best_model" column (a pandas Series),
    # not a scalar -- confirm that is what downstream consumers expect.
    best_model = eval_df["best_model"]
    session.report({"best_model": best_model})
    


In [None]:
# Partition ids to train on, read from the "item_id" column of item_ids.csv.
data_partitions = list(pd.read_csv("item_ids.csv")["item_id"])

# Search space handed to Tune: a grid over partition ids, limited to the
# first 100 entries of `data_partitions`.
param_space = {
    "data_partition_id": tune.grid_search(data_partitions[:100]),
}

In [None]:
# Use `cross_validation` as the trainable, configured from `param_space`.
tuner = tune.Tuner(cross_validation, param_space=param_space)
# Run the tuner and keep what fit() returns for later inspection.
result_grid = tuner.fit()