### Start from the AutoGluon docs example (Forecasting with Chronos), then convert it into a training container and an inference container

In [61]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import numpy as np
import boto3
import uuid
from io import BytesIO
import os
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.session import TrainingInput

In [28]:
# Sample dataset (Australian electricity subset) hosted in the public AutoGluon bucket.
dataset_url = "https://autogluon.s3.amazonaws.com/datasets/timeseries/australian_electricity_subset/test.csv"
data = TimeSeriesDataFrame.from_path(dataset_url)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
T000000,2013-03-10 00:00:00,5207.959961
T000000,2013-03-10 00:30:00,5002.275879
T000000,2013-03-10 01:00:00,4747.569824
T000000,2013-03-10 01:30:00,4544.880859
T000000,2013-03-10 02:00:00,4425.952148


### Add features (not part of the docs example)

In [30]:
# Synthetic covariate: the target plus uniform noise drawn from [0, 500).
# A seeded generator keeps the feature (and everything trained on it)
# identical across kernel restarts.
rng = np.random.default_rng(42)
random_values = rng.uniform(0, 500, size=len(data))
data['random_feature'] = data['target'].values + random_values
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,random_feature
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
T000000,2013-03-10 00:00:00,5207.959961,5558.00664
T000000,2013-03-10 00:30:00,5002.275879,5390.543877
T000000,2013-03-10 01:00:00,4747.569824,4798.287882
T000000,2013-03-10 01:30:00,4544.880859,4545.22837
T000000,2013-03-10 02:00:00,4425.952148,4706.047467


In [None]:
# Later cells use `train_data`, `test_data` and `predictor`, so this cell has
# to run for the notebook to work from a fresh kernel.
prediction_length = 48
train_data, test_data = data.train_test_split(prediction_length)

# Move the (item_id, timestamp) index into ordinary columns so the frames can
# be written to parquet and read back column-wise.
train_data = train_data.reset_index()
test_data = test_data.reset_index()

# Local baseline fit; the prediction cell further down uses `predictor`.
predictor = TimeSeriesPredictor(prediction_length=prediction_length
                               ).fit(train_data)

### Write the data to parquet files (not part of the docs example) to test the training and inference jobs

In [46]:
# Destination bucket for the parquet chunks
bucket = 'ag-example-timeseries'

# S3 client used by the upload loop
s3 = boto3.client(service_name="s3")

In [118]:
# Inspect the flattened training frame (expects `train_data` from the train/test split cell).
train_data

Unnamed: 0,item_id,timestamp,target,random_feature
0,T000000,2013-03-10 00:00:00,5207.959961,5558.006640
1,T000000,2013-03-10 00:30:00,5002.275879,5390.543877
2,T000000,2013-03-10 01:00:00,4747.569824,4798.287882
3,T000000,2013-03-10 01:30:00,4544.880859,4545.228370
4,T000000,2013-03-10 02:00:00,4425.952148,4706.047467
...,...,...,...,...
172795,T000004,2015-02-27 21:30:00,368.948792,452.005475
172796,T000004,2015-02-27 22:00:00,346.332764,598.047604
172797,T000004,2015-02-27 22:30:00,327.962677,499.156166
172798,T000004,2015-02-27 23:00:00,307.481934,498.390110


In [47]:
# How many parquet files each split is divided into
num_files = 100

# Split name -> frame; the upload loop uses the name as the S3 key prefix.
dfs_to_write = dict(train=train_data, test=test_data)

for split_name in dfs_to_write:
    print(split_name)

train
test


In [194]:
# Delete every object under the train/ and test/ prefixes of the bucket.
# The upload cell generates random file names, so without this step the chunks
# from earlier runs would pile up next to the new ones. [can change this]
! aws s3 rm s3://{bucket}/train/ --recursive 
! aws s3 rm s3://{bucket}/test/ --recursive 

delete: s3://ag-example-timeseries/train/dummy_051a6370.parquet
delete: s3://ag-example-timeseries/train/dummy_074ddab6.parquet
delete: s3://ag-example-timeseries/train/dummy_05246c34.parquet
delete: s3://ag-example-timeseries/train/dummy_063c3e10.parquet
delete: s3://ag-example-timeseries/train/dummy_09ea8721.parquet
delete: s3://ag-example-timeseries/train/dummy_11711ae8.parquet
delete: s3://ag-example-timeseries/train/dummy_08b1dc91.parquet
delete: s3://ag-example-timeseries/train/dummy_0941929d.parquet
delete: s3://ag-example-timeseries/train/dummy_11a2f2fd.parquet
delete: s3://ag-example-timeseries/train/dummy_1228d43d.parquet
delete: s3://ag-example-timeseries/train/dummy_11fd6d93.parquet
delete: s3://ag-example-timeseries/train/dummy_199f7825.parquet
delete: s3://ag-example-timeseries/train/dummy_16b90365.parquet
delete: s3://ag-example-timeseries/train/dummy_150b9b3e.parquet
delete: s3://ag-example-timeseries/train/dummy_19db2211.parquet
delete: s3://ag-example-timeseries/train

In [195]:
# Inspect one chunk. `chunk` is the loop variable of the upload cell below,
# so that cell must have been executed before this one.
chunk

Unnamed: 0,item_id,timestamp,target,random_feature
171310,T000003,2014-09-15 19:00:00,1549.564575,1889.001289
171311,T000003,2013-12-27 20:00:00,1408.881348,1707.482654
171312,T000003,2014-11-08 18:30:00,1049.678345,1318.529002
171313,T000001,2014-07-03 06:00:00,4561.230957,4620.362423
171314,T000002,2013-05-31 05:00:00,3342.124268,3776.884811
...,...,...,...,...
173035,T000003,2014-02-08 11:30:00,2116.354248,2325.709252
173036,T000002,2015-03-30 07:00:00,4776.804199,5155.161506
173037,T000003,2014-10-17 14:00:00,978.261658,1009.637501
173038,T000004,2013-09-01 17:30:00,500.091858,858.222577


In [196]:
# Shuffle each split, cut it into `num_files` pieces and upload every piece as
# its own parquet object under s3://{bucket}/{split_name}/.
for split_name, df in dfs_to_write.items():
    # drop=True discards the old row index instead of turning it into a stray
    # `index` column that would end up inside the parquet files.
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes and emits a
    # FutureWarning on recent pandas versions.
    position_chunks = np.array_split(np.arange(len(shuffled)), num_files)

    for positions in position_chunks:
        if len(positions) == 0:
            # Fewer rows than files: do not upload empty parquet objects.
            continue

        chunk = shuffled.iloc[positions]
        filename = f"dummy_{uuid.uuid4().hex[:8]}.parquet"
        s3_key = f"{split_name}/{filename}"   # folder (split name) + file name

        # Serialize in memory and stream the buffer to S3 (no temp files).
        buffer = BytesIO()
        chunk.to_parquet(buffer, index=False)
        buffer.seek(0)

        s3.upload_fileobj(buffer, bucket, s3_key)
        print(f"Uploaded {len(chunk)} rows to s3://{bucket}/{s3_key}")


  return bound(*args, **kwds)


Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_ed5a6ade.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_12d36c43.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_45a88326.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_30bd8e8a.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_be566b9b.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_9f66ec89.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_e2423f6f.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_84126629.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_2775cb49.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_b528749f.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_b3b95a92.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_a7c3250e.parquet
Uploaded 1728 rows to s3://ag-example-timeseries/train/dummy_b86

  return bound(*args, **kwds)


Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_8724b0d9.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_7fe16a93.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_66ffae35.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_591feade.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_7a4a578a.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_82f35802.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_1c9f47bb.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_99322d03.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_9cf88685.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_0d2bd7a8.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_1ba8fad4.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_95ea3bb9.parquet
Uploaded 1731 rows to s3://ag-example-timeseries/test/dummy_45210d0f.parquet

In [124]:
# Column names of the flattened training frame.
train_data.columns

Index(['item_id', 'timestamp', 'target', 'random_feature'], dtype='object')

In [32]:
# Forecast with the locally fitted predictor. No model is named, so AutoGluon
# picks the one with the best validation score (see the message below).
predictions = predictor.predict(train_data)
# predictor.plot(
#     data=data,
#     predictions=predictions,
#     item_ids=data.item_ids[:2],
#     max_history_length=200,
# );

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [None]:
# predictions = predictor.predict(train_data)
# predictor.plot(
#     data=data,
#     predictions=predictions,
#     item_ids=data.item_ids[:2],
#     max_history_length=200,
# );

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [202]:
# Copy the uploaded training chunks into a local ./train directory so the
# parquet loader can be tried out locally.
! aws s3 cp s3://{bucket}/train/ train --recursive 

download: s3://ag-example-timeseries/train/dummy_0137fab4.parquet to train/dummy_0137fab4.parquet
download: s3://ag-example-timeseries/train/dummy_0cd4de9c.parquet to train/dummy_0cd4de9c.parquet
download: s3://ag-example-timeseries/train/dummy_0a66dbcf.parquet to train/dummy_0a66dbcf.parquet
download: s3://ag-example-timeseries/train/dummy_12d36c43.parquet to train/dummy_12d36c43.parquet
download: s3://ag-example-timeseries/train/dummy_155dc49c.parquet to train/dummy_155dc49c.parquet
download: s3://ag-example-timeseries/train/dummy_095f27c6.parquet to train/dummy_095f27c6.parquet
download: s3://ag-example-timeseries/train/dummy_001ceaa1.parquet to train/dummy_001ceaa1.parquet
download: s3://ag-example-timeseries/train/dummy_024b7783.parquet to train/dummy_024b7783.parquet
download: s3://ag-example-timeseries/train/dummy_16c6cf8f.parquet to train/dummy_16c6cf8f.parquet
download: s3://ag-example-timeseries/train/dummy_1a499879.parquet to train/dummy_1a499879.parquet
download: s3://ag-ex

In [203]:
import pyarrow.parquet as pq
import pandas as pd

def _find_parquet_files(root: str, keyword: str | None):
    if not root or not os.path.isdir(root):
        raise FileNotFoundError(f"Data directory not found: {root}")
    files = []
    for r, _, fns in os.walk(root):
        for fn in fns:
            _, ext = os.path.splitext(fn)
            if ext in ['.parquet']:
                if keyword:
                    if keyword in fn:
                        files.append(os.path.join(r, fn))
                else:
                    files.append(os.path.join(r, fn))
    print(f"[finder] root={root} keyword={keyword!r} found={len(files)}")
    if not files:
        raise FileNotFoundError(f"No parquet files in {root} (keyword={keyword!r})")
    return sorted(files)

def load_timeseries_parquet(
    data_root: str,
    keyword: str | None,
    id_col: str,
    time_col: str,
    target_col: str,
    covariate_cols: list[str] | None = None,  # e.g., ["random_feature"]
):
    """Load every parquet chunk under ``data_root`` into TimeSeriesDataFrames.

    Args:
        data_root: Directory searched recursively for parquet files.
        keyword: Optional file-name substring filter (``None`` keeps all files).
        id_col: Column holding the series identifier.
        time_col: Column holding the timestamps.
        target_col: Column holding the values to forecast. It is exposed as
            ``"target"`` in the returned frame, as before.
        covariate_cols: Optional extra columns to return as a second frame;
            names missing from the data are ignored.

    Returns:
        ``(target_tsf, cov_tsf)``. ``cov_tsf`` is ``None`` when no covariate
        column was requested or none of the requested ones is present.

    Raises:
        FileNotFoundError: Propagated from ``_find_parquet_files``.
        KeyError: If ``id_col``, ``time_col`` or ``target_col`` is missing.
    """
    files = _find_parquet_files(data_root, keyword)

    frames = []
    for fp in files:
        df = pq.read_table(fp).to_pandas()
        # Only promote a stored (non-default) index to columns. Resetting a
        # plain RangeIndex would just inject a meaningless `index` column.
        if not isinstance(df.index, pd.RangeIndex):
            df = df.reset_index()
        frames.append(df)

    # Chunks are shuffled on disk; restore per-series chronological order.
    all_df = (
        pd.concat(frames, ignore_index=True)
        .sort_values([id_col, time_col])
        .reset_index(drop=True)
    )

    # Honour the caller-supplied column names instead of hard-coded ones.
    target_df = all_df[[id_col, time_col, target_col]].rename(columns={target_col: "target"})
    target_tsf = TimeSeriesDataFrame.from_data_frame(
        target_df,
        id_column=id_col,
        timestamp_column=time_col,
    )

    cov_tsf = None
    if covariate_cols:
        present = [c for c in covariate_cols if c in all_df.columns]
        if present:
            cov_tsf = TimeSeriesDataFrame.from_data_frame(
                all_df[[id_col, time_col] + present],
                id_column=id_col,
                timestamp_column=time_col,
            )

    return target_tsf, cov_tsf


In [204]:
# Smoke-test the loader on the chunks found in the local ./train directory.
load_timeseries_parquet(
    data_root='train',
    keyword=None,
    id_col='item_id',
    time_col='timestamp',
    target_col='target',
)

[finder] root=train keyword=None found=100


(                                  target
 item_id timestamp                       
 T000000 2013-03-10 00:00:00  5207.959961
         2013-03-10 00:30:00  5002.275879
         2013-03-10 01:00:00  4747.569824
         2013-03-10 01:30:00  4544.880859
         2013-03-10 02:00:00  4425.952148
 ...                                  ...
 T000004 2015-02-27 21:30:00   368.948792
         2015-02-27 22:00:00   346.332764
         2015-02-27 22:30:00   327.962677
         2015-02-27 23:00:00   307.481934
         2015-02-27 23:30:00   291.532776
 
 [172800 rows x 1 columns],
 None)

### Create train container

In [271]:
%%writefile train.py
import os, sys, time, glob, argparse
from functools import wraps

import pandas as pd
import pyarrow.parquet as pq
import mlflow
import cloudpickle

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from helper_functions import (
    AGTimeSeriesWrapper,
    log_autogluon_timeseries_to_mlflow_artifact,
    log_autogluon_timeseries_metrics,
)

# ----------------------------
# Retry helper
# ----------------------------
def retry_decorator(max_attempts=3, delay_seconds=60, backoff_factor=2):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_attempts: Total number of calls made before giving up.
        delay_seconds: Sleep before the second attempt.
        backoff_factor: Multiplier applied to the sleep after every failure.

    Returns:
        A decorator. The wrapped function returns the first successful result
        and re-raises the last exception once ``max_attempts`` calls failed.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            pause = delay_seconds
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as err:
                    if attempt == max_attempts:
                        # Out of attempts: surface the original exception.
                        raise
                    print(f"[retry] {err} | attempt {attempt}/{max_attempts} | sleeping {pause}s")
                    time.sleep(pause)
                    pause *= backoff_factor
        return wrapper
    return decorator

# ----------------------------
# Args
# ----------------------------
def parse_args():
    """Parse the command-line options of the training script.

    The data directories default to the SageMaker channel environment
    variables (``SM_CHANNEL_TRAINING`` / ``SM_CHANNEL_TEST``) and the GPU
    count to ``SM_NUM_GPUS``. ``--mlflow_arn`` and ``--mlflow_experiment``
    are mandatory.

    Returns:
        argparse.Namespace with one attribute per option.
    """
    env = os.environ
    parser = argparse.ArgumentParser()

    # (flag, value type, extra add_argument keywords), in registration order.
    option_specs = [
        ('--output_dir', str, {'default': '/opt/ml/model'}),
        ('--mlflow_arn', str, {'required': True}),
        ('--mlflow_experiment', str, {'required': True}),
        ('--train-dir', str, {'default': env.get('SM_CHANNEL_TRAINING', '/opt/ml/input/data/training')}),
        ('--test-dir', str, {'default': env.get('SM_CHANNEL_TEST', '/opt/ml/input/data/test')}),
        ('--train-keyword', str, {'default': None}),
        ('--test-keyword', str, {'default': None}),
        ('--id-col', str, {'default': 'item_id'}),
        ('--time-col', str, {'default': 'timestamp'}),
        ('--target-col', str, {'default': 'target'}),
        ('--prediction-length', int, {'default': 24}),
        ('--eval-metric', str, {'default': 'MAPE'}),
        ('--presets', str, {'default': 'best_quality'}),
        ('--time-limit', int, {'default': 900}),
        ('--num-gpus', int, {'default': int(env.get('SM_NUM_GPUS', '0'))}),
        ('--model-name', str, {'default': "ag-ts-model"}),
    ]
    for flag, value_type, extra in option_specs:
        parser.add_argument(flag, type=value_type, **extra)

    return parser.parse_args()

# ----------------------------
# File discovery (recursive)
# ----------------------------
PARQUET_EXTS = {".parquet", ".PARQUET", ".pq", ".PQ"}
def _find_parquet_files(root: str, keyword: str | None):
    if not root or not os.path.isdir(root):
        raise FileNotFoundError(f"Data directory not found: {root}")
    files = []
    for r, _, fns in os.walk(root):
        for fn in fns:
            _, ext = os.path.splitext(fn)
            if ext in PARQUET_EXTS:
                if keyword:
                    if keyword in fn:
                        files.append(os.path.join(r, fn))
                else:
                    files.append(os.path.join(r, fn))
    print(f"[finder] root={root} keyword={keyword!r} found={len(files)}")
    if not files:
        raise FileNotFoundError(f"No parquet files in {root} (keyword={keyword!r})")
    return sorted(files)

# ----------------------------
# Loader -> TSF objects (target + optional covariates)
# ----------------------------
@retry_decorator(max_attempts=3, delay_seconds=30, backoff_factor=2)
def _read_parquet_frames(files):
    """Read each parquet file into a pandas DataFrame (retried on failure)."""
    return [pq.read_table(fp).to_pandas() for fp in files]


def load_timeseries_parquet(
    data_root: str,
    keyword: str | None,
    id_col: str,
    time_col: str,
    target_col: str,
    covariate_cols: list[str] | None = None,
):
    """Load every parquet chunk under ``data_root`` into TimeSeriesDataFrames.

    File discovery runs outside the retry wrapper: a missing directory or an
    empty match is deterministic, so ``FileNotFoundError`` reaches the caller
    immediately instead of after the retry sleeps. Only the file reads are
    retried.

    Args:
        data_root: Directory searched recursively for parquet files.
        keyword: Optional file-name substring filter (``None`` keeps all files).
        id_col: Column holding the series identifier.
        time_col: Column holding the timestamps.
        target_col: Column holding the values to forecast. It is exposed as
            ``"target"`` in the returned frame, as before.
        covariate_cols: Optional extra columns to return as a second frame;
            names missing from the data are ignored.

    Returns:
        ``(target_tsf, cov_tsf)``. ``cov_tsf`` is ``None`` when no covariate
        column was requested or none of the requested ones is present.

    Raises:
        FileNotFoundError: Propagated from ``_find_parquet_files``.
        KeyError: If ``id_col``, ``time_col`` or ``target_col`` is missing.
    """
    files = _find_parquet_files(data_root, keyword)
    frames = _read_parquet_frames(files)

    # Chunks may arrive in any order; restore per-series chronological order.
    all_df = (
        pd.concat(frames, ignore_index=True)
        .sort_values([id_col, time_col])
        .reset_index(drop=True)
    )

    # Honour the --id-col / --time-col / --target-col options instead of
    # hard-coded column names.
    target_df = all_df[[id_col, time_col, target_col]].rename(columns={target_col: "target"})
    target_tsf = TimeSeriesDataFrame.from_data_frame(
        target_df,
        id_column=id_col,
        timestamp_column=time_col,
    )

    cov_tsf = None
    if covariate_cols:
        present = [c for c in covariate_cols if c in all_df.columns]
        if present:
            cov_tsf = TimeSeriesDataFrame.from_data_frame(
                all_df[[id_col, time_col] + present],
                id_column=id_col,
                timestamp_column=time_col,
            )

    return target_tsf, cov_tsf

# ----------------------------
# Main
# ----------------------------
def main():
    """Train an AutoGluon time-series predictor and record it in MLflow.

    Steps, in order: parse the CLI options, point MLflow at the tracking
    server and experiment, load the training (and, if available, test)
    parquet data, fit and save a ``TimeSeriesPredictor`` under
    ``args.output_dir``, log the model and its metrics through the helper
    functions, evaluate on the test data when present, and register the
    logged model under ``args.model_name``.
    """
    args = parse_args()

    mlflow.set_tracking_uri(args.mlflow_arn)
    mlflow.set_experiment(args.mlflow_experiment)

    # Autologging is switched off; everything below is logged explicitly.
    mlflow.autolog(disable=True)

    print(f"[load] train_dir={args.train_dir} keyword={args.train_keyword!r}")
    # NOTE(review): the covariate frames (train_cov_tsf / test_cov_tsf) are
    # loaded but never used below; only the target frames reach fit() and
    # evaluate(). Confirm whether that is intended.
    train_tsf, train_cov_tsf = load_timeseries_parquet(
        args.train_dir, args.train_keyword, args.id_col, args.time_col, args.target_col,
        covariate_cols=["random_feature"],
    )

    # Test data is optional: a missing directory or no matching files simply
    # disables the evaluation step further down.
    try:
        test_tsf, test_cov_tsf = load_timeseries_parquet(
            args.test_dir, args.test_keyword, args.id_col, args.time_col, args.target_col,
            covariate_cols=["random_feature"],
        )
    except FileNotFoundError:
        test_tsf, test_cov_tsf = None, None

    with mlflow.start_run() as run:
        # Log training parameters
        mlflow.log_params({
            "prediction_length": args.prediction_length,
            "eval_metric": args.eval_metric,
            "presets": args.presets,
            "time_limit": args.time_limit,
            "train_dir": args.train_dir,
            "test_dir": args.test_dir,
            "train_keyword": args.train_keyword,
            "test_keyword": args.test_keyword,
        })

        # Train the AutoGluon model
        # NOTE(review): args.num_gpus is parsed but not used anywhere in this function.
        predictor = TimeSeriesPredictor(
            prediction_length=args.prediction_length,
            eval_metric=args.eval_metric,
            path=args.output_dir,
        )
        predictor.fit(
            train_data=train_tsf,
            presets=args.presets,
            time_limit=args.time_limit,
        )
        predictor.save()

        # Step 1: Log the model artifact and capture the returned object
        # Use a sample of the training data as an input example for the model signature
        print(train_tsf.head(1))
        # NOTE(review): confirm that `.to_pandas()` exists on the object returned
        # by `train_tsf.head(10)` for the installed AutoGluon version.
        input_example = train_tsf.head(10).to_pandas()
        print(input_example)
        logged_model = log_autogluon_timeseries_to_mlflow_artifact(predictor, input_example)
        
        # Log additional AutoGluon metrics and details
        log_autogluon_timeseries_metrics(predictor)

        # Every score returned by evaluate() is logged as a `test_<name>` metric.
        if test_tsf is not None:
            scores = predictor.evaluate(test_tsf)
            for k, v in scores.items():
                mlflow.log_metric(f"test_{k}", float(v))
        
        # Step 2: Explicitly register the model using the URI from the logged object
        mlflow.register_model(model_uri=logged_model.model_uri, name=args.model_name)

        print("[done] training complete and model logged and registered to MLflow.")

# Run the training only when the file is executed as a script.
if __name__ == "__main__":
    main()


Overwriting train.py


### Run the training container

In [275]:
# Config
session     = sagemaker.Session()             # create once and reuse everywhere
region      = session.boto_region_name
role        = sagemaker.get_execution_role()  # or set your role arn string

instance_type   = "ml.g4dn.4xlarge"           # GPU instance; pick a CPU type (e.g. "ml.m5.4xlarge") if no GPU is needed
instance_count  = 1
use_spot        = True                        # optional cost saver
max_wait        = 3600 + 600                  # seconds (must be > max_run if use_spot)
max_run         = 3600                        # seconds

# Hyperparameters for train.py (match argparse names)
hyperparameters = {
    "id-col": "item_id",
    "time-col": "timestamp",
    "target-col": "target",
    # "train-keyword": None,                 # your parquet file name filter
    # "test-keyword": None,                   # set None/"" if no test set
    "prediction-length": 24,
    "eval-metric": "MAPE",
    "presets": "best_quality",
    "time-limit": 900,                        # seconds
    # Account-specific tracking server; replace with your own ARN.
    "mlflow_arn": 'arn:aws:sagemaker:us-east-1:543531862107:mlflow-tracking-server/ag-ex-timeseries',
    "mlflow_experiment": "autogluon-timeseries",
}

In [276]:
# `train_s3_uri` is assigned in the next cell; run that cell first on a fresh kernel.
train_s3_uri

's3://ag-example-timeseries/train'

In [None]:
# Inputs: one File-mode parquet channel per split. The channel names
# ("training", "test") are the keys SageMaker uses for the SM_CHANNEL_*
# directories that train.py reads by default.
train_s3_uri = f"s3://{bucket}/train"
test_s3_uri = f"s3://{bucket}/test"

channel_uris = {"training": train_s3_uri, "test": test_s3_uri}
inputs = {
    channel: TrainingInput(
        s3_data=uri,
        s3_data_type="S3Prefix",
        content_type="application/x-parquet",
        input_mode="File",
    )
    for channel, uri in channel_uris.items()
}

# -----------------------------------
# Estimator: Managed PyTorch DLC
# -----------------------------------
estimator_kwargs = dict(
    entry_point="train.py",          # training script
    source_dir=".",                  # folder with train.py (plus utils / requirements.txt)
    role=role,
    framework_version="2.1.0",       # pick a supported version
    py_version="py310",
    instance_type=instance_type,
    instance_count=instance_count,
    dependencies=['requirements.txt', 'helper_functions.py'],
    hyperparameters=hyperparameters,
    sagemaker_session=session,
    disable_profiler=True,
    debugger_hook_config=False,
    # Spot / runtime limits are currently disabled:
    # max_run=max_run,
    # use_spot_instances=use_spot,
    # max_wait=max_wait if use_spot else None,
    # Optional: checkpointing (recommended if using spot)
    # checkpoint_s3_uri=f"s3://{bucket}/checkpoints/autogluon-ts/",
)
estimator = PyTorch(**estimator_kwargs)

# A requirements.txt in source_dir is installed by the PyTorch Estimator.
# Example lines that work for this script:
# autogluon.timeseries[all]==1.1.1
# pyarrow>=13.0.0
# mlflow>=2.9.0
# pandas>=2.0.0

# -----------------------------------
# Launch training
# -----------------------------------
job_name = sagemaker.utils.unique_name_from_base("ag-ts-train")
print("Starting training job:", job_name)
estimator.fit(inputs, job_name=job_name, wait=True, logs=True)

# After completion: S3 location of the packaged model
print("Model artifacts:", estimator.model_data)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


Starting training job: ag-ts-train-1757097501-6ae1


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: ag-ts-train-1757097501-6ae1


2025-09-05 18:38:58 Starting - Starting the training job
2025-09-05 18:38:58 Pending - Training job waiting for capacity....