In [1]:
import os
import sys

import numpy as np
import pandas as pd
import dotenv
import mlflow
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import plotly.graph_objects as go
from huggingface_hub import login

sys.path.append("../..")

from utils import calculate_sklearn_metrics, TrainingConfig

dotenv.load_dotenv("../../.env")

token = os.environ["HF_TOKEN"]
login(token=token)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("rosstat_forecasting");

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
data_dir = '../../../data/rosstat/processed'

train_df = pd.read_csv(os.path.join(data_dir, 'train/data.csv'))
val_df = pd.read_csv(os.path.join(data_dir, 'val/data.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test/data.csv'))

print(f"Обучающая выборка: {train_df.shape[0]} строк")
print(f"Валидационная выборка: {val_df.shape[0]} строк")
print(f"Тестовая выборка: {test_df.shape[0]} строк")

Обучающая выборка: 4140 строк
Валидационная выборка: 828 строк
Тестовая выборка: 828 строк


In [3]:
train_data = TimeSeriesDataFrame.from_data_frame(
    train_df.rename(columns={"nominal_wage": "target"}),
    id_column="code",
    timestamp_column="date",
)
val_data = TimeSeriesDataFrame.from_data_frame(
    val_df.rename(columns={"nominal_wage": "target"}),
    id_column="code",
    timestamp_column="date",
)
test_data = TimeSeriesDataFrame.from_data_frame(
    test_df.rename(columns={"nominal_wage": "target"}),
    id_column="code",
    timestamp_column="date",
)

In [4]:
config = TrainingConfig(
    prediction_length=2,  # полгода
    artifact_path="../models/auto_ml_single_target_bigger",
)

predictor = TimeSeriesPredictor(
    prediction_length=config.prediction_length, path=config.artifact_path, freq="MS"
).fit(
    train_data=train_data,
    tuning_data=val_data,
    verbosity=4,
    hyperparameters={
        "Chronos": [
            {"model_path": "bolt_tiny", "ag_args": {"name_suffix": "ZeroShot"}},
            {
                "model_path": "bolt_tiny",
                "fine_tune": True,
                "ag_args": {"name_suffix": "FineTuned"},
            },
            {"model_path": "bolt_mini", "ag_args": {"name_suffix": "ZeroShot"}},
            {
                "model_path": "bolt_mini",
                "fine_tune": True,
                "ag_args": {"name_suffix": "FineTuned"},
            },
            {"model_path": "bolt_small", "ag_args": {"name_suffix": "ZeroShot"}},
            {
                "model_path": "bolt_small",
                "fine_tune": True,
                "ag_args": {"name_suffix": "FineTuned"},
            },
            {"model_path": "bolt_base", "ag_args": {"name_suffix": "ZeroShot"}},
            {
                "model_path": "bolt_base",
                "fine_tune": True,
                "ag_args": {"name_suffix": "FineTuned"},
            }
        ],
    },
    enable_ensemble=False,
)

Beginning AutoGluon training...
AutoGluon will save models to '/home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger'
AutoGluon Version:  1.3.0
Python Version:     3.12.7
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #61~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue Apr 15 17:03:15 UTC 2
CPU Count:          12
GPU Count:          1
Memory Avail:       17.86 GB / 30.95 GB (57.7%)
Disk Space Avail:   113.93 GB / 233.67 GB (48.8%)

Fitting with arguments:
{'enable_ensemble': False,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': {'Chronos': [{'ag_args': {'name_suffix': 'ZeroShot'},
                                  'model_path': 'bolt_tiny'},
                                 {'ag_args': {'name_suffix': 'FineTuned'},
                                  'fine_tune': True,
                                  'model_path': 'bolt_tiny'},
                                 {'ag_args': {'name_suffix': 'ZeroShot'},
                  

In [5]:
config = TrainingConfig(
    prediction_length=2,  # полгода
    artifact_path="../models/auto_ml_single_target_bigger",
)

predictor = TimeSeriesPredictor.load(config.artifact_path)

Loading predictor from path /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger


In [6]:
leaderboard = predictor.leaderboard(
    test_data,
    extra_metrics=['MASE', 'MAPE', 'MSE', 'MAE', 'SQL'],
)
leaderboard.rename(columns={'score_test': 'WQL_test', 'score_val': 'WQL_val'}, inplace=True)
leaderboard

Generating leaderboard for all models trained
Additional data provided, testing on additional data. Resulting leaderboard will be sorted according to test score (`score_test`).
Prediction order: ['ChronosZeroShot[bolt_tiny]', 'ChronosZeroShot[bolt_mini]', 'ChronosZeroShot[bolt_small]', 'ChronosFineTuned[bolt_base]', 'ChronosFineTuned[bolt_small]', 'ChronosZeroShot[bolt_base]', 'ChronosFineTuned[bolt_mini]', 'ChronosFineTuned[bolt_tiny]']


loading configuration file config.json from cache at /home/nikita/.cache/huggingface/hub/models--autogluon--chronos-bolt-tiny/snapshots/590f7666166f6f503e215bf0dac08a68390e0302/config.json
Model config T5Config {
  "_name_or_path": "autogluon/chronos-bolt-tiny",
  "architectures": [
    "ChronosBoltModelForForecasting"
  ],
  "chronos_config": {
    "context_length": 2048,
    "input_patch_size": 16,
    "input_patch_stride": 16,
    "prediction_length": 64,
    "quantiles": [
      0.1,
      0.2,
      0.3,
      0.4,
      0.5,
      0.6,
      0.7,
      0.8,
      0.9
    ],
    "use_reg_token": true
  },
  "chronos_pipeline_class": "ChronosBoltPipeline",
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 256,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 0.05,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "mod

Unnamed: 0,model,WQL_test,WQL_val,pred_time_test,pred_time_val,fit_time_marginal,fit_order,MASE,MAPE,MSE,MAE,SQL
0,ChronosFineTuned[bolt_mini],-0.121768,-0.1208,0.248127,0.010143,19.947505,4,-13259.441395,-0.114235,-435562400.0,-13259.441395,-11419.164102
1,ChronosFineTuned[bolt_base],-0.123265,-0.121874,0.39,0.062578,85.766299,8,-13919.888281,-0.121897,-437365700.0,-13919.888281,-11559.567488
2,ChronosFineTuned[bolt_small],-0.12509,-0.120516,0.271593,0.010703,32.348965,6,-13888.484726,-0.120993,-446017400.0,-13888.484726,-11730.761201
3,ChronosFineTuned[bolt_tiny],-0.131363,-0.122022,0.244138,0.036198,19.447215,2,-13800.15034,-0.1181,-490570400.0,-13800.15034,-12319.010296
4,ChronosZeroShot[bolt_mini],-0.133486,-0.134947,0.853079,1.291696,0.002489,3,-14264.333939,-0.12117,-557893200.0,-14264.333939,-12518.047403
5,ChronosZeroShot[bolt_small],-0.135828,-0.134195,0.835327,1.21527,0.002882,5,-14345.202859,-0.121813,-563494900.0,-14345.202859,-12737.699989
6,ChronosZeroShot[bolt_base],-0.13596,-0.136489,0.977047,2.486759,0.002879,7,-14271.658163,-0.121214,-556591300.0,-14271.658163,-12750.126862
7,ChronosZeroShot[bolt_tiny],-0.13708,-0.137194,1.042228,1.934478,2.18356,1,-14454.922486,-0.123342,-565142500.0,-14454.922486,-12855.106233


In [22]:
def extract_specific_rows_from_indexed_data(data, start_row: int, end_row: int):
    rows_to_extract = np.arange(start_row, end_row)
    unique_ids = data.index.get_level_values('item_id').unique()
        
    selected_data = []

    for item_id in unique_ids:
        item_data = data.loc[[item_id]]
        
        selected_rows = item_data.iloc[rows_to_extract]
        selected_data.append(selected_rows)

    result = pd.concat(selected_data)

    return result

k = 2

top_k_models = leaderboard.sort_values(['SQL'], ascending=False).head(k)['model'].tolist()
window_size = config.prediction_length
test_length = test_df['code'].value_counts().iloc[0]
max_iterations = (test_length + window_size - 1) // window_size# - 1 ещё -1 из-за known_covariates

current_data = train_data.copy()
val_predictions = {}

for i in range(max_iterations):
    start_idx = i * window_size
    end_idx = start_idx + window_size
    
    for model_name in top_k_models:
        if model_name not in val_predictions:
            val_predictions[model_name] = []
        
        # future_covariates = test_data[start_idx:start_idx + config.prediction_length][known_covariates_names]
        # prediction_covariates = pd.concat([current_data[known_covariates_names], future_covariates])
        
        predictions = predictor.predict(current_data, 
                                       model=model_name,)
                                       # known_covariates=prediction_covariates)
                                       
        val_predictions[model_name].append(predictions)
        
    current_data = pd.concat([current_data, extract_specific_rows_from_indexed_data(val_data, start_idx, end_idx)])

test_df_shape = test_df.shape[0]
val_predictions = {k: pd.concat(v)[:test_df_shape] for k, v in val_predictions.items()}

current_data = val_data.copy()
test_predictions = {}

for i in range(max_iterations):
    start_idx = i * window_size
    end_idx = start_idx + window_size
    
    for model_name in top_k_models:
        if model_name not in test_predictions:
            test_predictions[model_name] = []
        
        # future_covariates = test_data[start_idx:start_idx + config.prediction_length][known_covariates_names]
        # prediction_covariates = pd.concat([current_data[known_covariates_names], future_covariates])
        
        predictions = predictor.predict(current_data, 
                                       model=model_name,)
                                       # known_covariates=prediction_covariates)
                                       
        test_predictions[model_name].append(predictions)
        
    current_data = pd.concat([current_data, extract_specific_rows_from_indexed_data(test_data, start_idx, end_idx)])

test_df_shape = test_df.shape[0]
test_predictions = {k: pd.concat(v)[:test_df_shape] for k, v in test_predictions.items()}

Prediction order: {'ChronosFineTuned[bolt_mini]'}
Cached predictions saved to /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger/models/cached_predictions.pkl
Prediction order: {'ChronosFineTuned[bolt_base]'}
Cached predictions saved to /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger/models/cached_predictions.pkl
Prediction order: {'ChronosFineTuned[bolt_mini]'}
Cached predictions saved to /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger/models/cached_predictions.pkl
Prediction order: {'ChronosFineTuned[bolt_base]'}
Cached predictions saved to /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger/models/cached_predictions.pkl
Prediction order: {'ChronosFineTuned[bolt_mini]'}
Cached predictions saved to /home/nikita/projects/time_series_analysis/code_dir/rosstat/models/auto_ml_single_target_bigger/models/cac

In [24]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import datetime
from utils.plotting import plot_forecasts_val_test


date_col = pd.to_datetime(test_df["date"])
min_date = date_col.min().date()
max_date = date_col.max().date()
size_multiplyer = 2
height = 400 * size_multiplyer
width = 800 * size_multiplyer * 2
item_id = 1
title = f'Предсказания номинальной заработной платы (для code = {item_id})'

start_date_picker = widgets.DatePicker(
    description="Start date:", disabled=False, value=min_date
)

end_date_picker = widgets.DatePicker(
    description="End date:", disabled=False, value=max_date
)

output_area = widgets.Output()


def on_button_clicked(b):
    with output_area:
        clear_output(wait=True)
        start_date = datetime.datetime.combine(
            start_date_picker.value, datetime.datetime.min.time()
        )
        end_date = datetime.datetime.combine(
            end_date_picker.value, datetime.datetime.min.time()
        )
        plot_forecasts_val_test(
            val_df=val_df_,
            test_df=test_df_,
            val_predictions=all_val_models_predictions_,
            test_predictions=test_predictions,
            title=title,
            start_date=start_date,
            end_date=end_date,
            height=height,
            width=width,
            item_id=item_id,
        )


plot_button = widgets.Button(description="Plot Forecasts")
plot_button.on_click(on_button_clicked)

controls = widgets.VBox(
    [widgets.HBox([start_date_picker, end_date_picker]), plot_button]
)

display(controls, output_area)

val_df_ = val_df.rename(columns={'date': 'timestamp', "nominal_wage": "target"})[['code', 'timestamp', "target"]]
val_df_ = val_df_[val_df_['code'].eq(item_id)].reset_index(drop=True)
val_df_['timestamp'] = pd.to_datetime(val_df_['timestamp'])

test_df_ = test_df.rename(columns={'date': 'timestamp', "nominal_wage": "target"})[['code', 'timestamp', "target"]]
test_df_ = test_df_[test_df_['code'].eq(item_id)].reset_index(drop=True)
test_df_['timestamp'] = pd.to_datetime(test_df_['timestamp'])

val_df_ = pd.concat([val_df_, test_df_.iloc[[0]]])

all_val_models_predictions_ = val_predictions.copy()
for model_ in all_val_models_predictions_.keys():
    all_val_models_predictions_[model_] = pd.concat([all_val_models_predictions_[model_], test_predictions[model_].loc[[item_id]].iloc[[0]]])

with output_area:
    plot_forecasts_val_test(
        val_df=val_df_,
        test_df=test_df_,
        val_predictions=all_val_models_predictions_,
        test_predictions=test_predictions,
        title=title,
        height=height,
        width=width,
        item_id=item_id,
    )

VBox(children=(HBox(children=(DatePicker(value=datetime.date(2023, 1, 1), description='Start date:'), DatePick…

Output()

In [15]:
all_codes = test_df['code'].unique()

In [16]:
all_models_metrics = {}

for model in test_predictions.keys():
    metrics_df = []
    for code in all_codes:
        pred_df = pd.concat([
            test_predictions[model]
            .loc[code][["0.1", "0.5", "0.9"]]
            .reset_index(drop=True),
            test_df[test_df["code"].eq(code)][["nominal_wage"]].reset_index(drop=True),
        ], axis=1)
        pred_df = pd.DataFrame(pred_df)

        metrics_df.append(calculate_sklearn_metrics(pred_df, target_column='nominal_wage'))

    metrics_dict = pd.DataFrame(metrics_df).mean().to_dict()

    all_models_metrics[model] = metrics_dict
all_models_metrics

{'ChronosFineTuned[bolt_mini]': {'MSE': 40170983.83083443,
  'MAE': 3874.4268979279886,
  'MAPE': 4.192300228913504,
  'MASE': 1.2665296264320147,
  'SQL': 1300.2013263008253},
 'ChronosFineTuned[bolt_base]': {'MSE': 29727738.823054302,
  'MAE': 3118.1045643682055,
  'MAPE': 3.5263244260032187,
  'MASE': 0.7360772940086453,
  'SQL': 1044.5361339919227},
 'ChronosFineTuned[bolt_small]': {'MSE': 34845768.12375621,
  'MAE': 3483.042628132549,
  'MAPE': 3.820555286905376,
  'MASE': 1.012560623143062,
  'SQL': 1161.219935477682},
 'ChronosFineTuned[bolt_tiny]': {'MSE': 42303706.056688614,
  'MAE': 3863.7402730600847,
  'MAPE': 4.217355132305277,
  'MASE': 1.151575029299694,
  'SQL': 1282.6508712793134},
 'ChronosZeroShot[bolt_mini]': {'MSE': 172869152.33298156,
  'MAE': 7677.028133963618,
  'MAPE': 8.436172229836057,
  'MASE': 3.25467089886358,
  'SQL': 2660.8140362381737},
 'ChronosZeroShot[bolt_small]': {'MSE': 161446185.57887167,
  'MAE': 7305.511725354771,
  'MAPE': 7.982010861934878,
 

In [17]:
prefix = 'AutoGluon_all_Chronos_versions'

for k, metrics_ in all_models_metrics.items():
    run_name = f"{k}_{prefix}"

    with mlflow.start_run(run_name=run_name):
        mlflow.log_metrics(metrics_)
        mlflow.log_param("model_name", model_name)

        mlflow.set_tag("prefix", prefix)

🏃 View run ChronosFineTuned[bolt_mini]_AutoGluon_all_Chronos_versions at: http://127.0.0.1:5000/#/experiments/169882278836627198/runs/150194fa061d43c2a03ae70dba7f0486
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169882278836627198
🏃 View run ChronosFineTuned[bolt_base]_AutoGluon_all_Chronos_versions at: http://127.0.0.1:5000/#/experiments/169882278836627198/runs/22f0838fdd7043a898145f504f720957
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169882278836627198
🏃 View run ChronosFineTuned[bolt_small]_AutoGluon_all_Chronos_versions at: http://127.0.0.1:5000/#/experiments/169882278836627198/runs/e1bda0ca08514c67a348a86fef742e54
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169882278836627198
🏃 View run ChronosFineTuned[bolt_tiny]_AutoGluon_all_Chronos_versions at: http://127.0.0.1:5000/#/experiments/169882278836627198/runs/b0d7c609430c4e4fada8a0cf8122010e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169882278836627198
🏃 View run ChronosZeroS