In [1]:
import sys
import os

import joblib
import torch
import pandas as pd
import mlflow
from sklearn.preprocessing import StandardScaler

sys.path.append('../..')

from utils import get_quantile_from_median, calculate_sklearn_metrics
from xPatch_repo.exp.exp_main import Exp_Main

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("rosstat_forecasting");

In [2]:
data_dir = '../../../data/rosstat/processed'

target_column = 'nominal_wage'
train_df = pd.read_csv(os.path.join(data_dir, 'train/data.csv'))
val_df = pd.read_csv(os.path.join(data_dir, 'val/data.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test/data.csv'))

def reorder_columns_with_target_last(df, target_col):
    columns = list(df.columns)
    if target_col in columns:
        columns.remove(target_col)
        columns.append(target_col)
    return df[columns]

train_df = reorder_columns_with_target_last(train_df, target_column)
val_df = reorder_columns_with_target_last(val_df, target_column)
test_df = reorder_columns_with_target_last(test_df, target_column)

In [3]:
# Нормализация по колонкам

scale_columns = [
    "nominal_wage",
    "capital_labor_ratio_change",
    "capital_productivity_change",
    "fixed_assets_renewal_comparable_prices",
    "labor_productivity",
    "high_productivity_jobs",
    "machinery_share_in_total_assets",
    "investment_share_for_modernization",
    "production_index_yoy",
    "production_index_mom",
]
os.makedirs("./artifacts", exist_ok=True)

train_scaled = train_df.copy()
val_scaled = val_df.copy()
test_scaled = test_df.copy()

scalers = {}
for column in scale_columns:
    scaler = StandardScaler()

    train_scaled[column] = scaler.fit_transform(train_df[[column]])
    val_scaled[column] = scaler.transform(val_df[[column]])
    test_scaled[column] = scaler.transform(test_df[[column]])

    scalers[column] = scaler
    joblib.dump(scaler, f"./artifacts/scaler_{column}.joblib")

joblib.dump(scalers, "./artifacts/all_column_scalers.joblib")

concated_data_path = os.path.join(data_dir, 'xPatch_concated_data.csv')

train_val_df = pd.concat([train_scaled, val_scaled])
train_val_df.to_csv(concated_data_path, index=False)

test_data_path = os.path.join(data_dir, 'xPatch_test_data.csv')
pd.concat([val_scaled, test_scaled]).to_csv(test_data_path, index=False)

val_data_path = os.path.join(data_dir, 'xPatch_val_data.csv')
pd.concat([train_scaled.groupby(by=['code']).tail(12), val_scaled]).to_csv(val_data_path, index=False)

In [51]:
class Args:
    def __init__(self):
        self.seq_len = 12
        self.label_len = 12
        self.pred_len = 2
        
        self.is_training = 1
        self.model_id = 'rosstat'
        self.model = 'xPatch'
        self.data = 'custom_multi'
        
        self.root_path = data_dir
        self.data_path = 'xPatch_concated_data.csv'
        
        self.features = 'MS'
        self.target = 'nominal_wage'
        self.freq = 'M'
        self.scale = False
        self.timeenc = 0
        self.train_only = False
        
        self.enc_in = 10
        self.embed = 'timeF'
        
        self.patch_len = 16
        self.stride = 8
        self.padding_patch = 'end'
        
        self.ma_type = 'ema'
        self.alpha = 0.3
        self.beta = 0.3
        
        self.batch_size = 8
        self.train_epochs = 100
        self.patience = 10
        self.learning_rate = 5e-3
        self.loss = 'mse'
        self.lradj = 'type1'
        self.use_amp = False
        self.revin = 1
        
        self.use_gpu = True if torch.cuda.is_available() else False
        self.gpu = 0
        self.use_multi_gpu = False
        self.devices = '0'
        self.test_flop = False
        
        self.checkpoints = './checkpoints/'
        self.des = 'test'
        self.itr = 1
        self.num_workers = 10

        self.train_perc = 0.95
        self.test_perc = 0.0

args = Args()
assert not args.scale, 'Нормализацию нужно проводить вручную'
experiment = Exp_Main(args)

Use GPU: cuda:0


In [52]:
ii = 0
setting = '{}_{}_{}_ft{}_sl{}_ll{}_pl{}_{}_{}'.format(
            args.model_id,
            args.model,
            args.data,
            args.features,
            args.seq_len,
            args.label_len,
            args.pred_len,
            args.des, ii)

model = experiment.train(setting)

train 3795
val 207
	iters: 100, epoch: 1 | loss: 0.1189000
	speed: 0.0047s/iter; left time: 223.7092s
	iters: 200, epoch: 1 | loss: 0.1182361
	speed: 0.0030s/iter; left time: 142.3323s
	iters: 300, epoch: 1 | loss: 0.1758851
	speed: 0.0031s/iter; left time: 147.5759s
	iters: 400, epoch: 1 | loss: 0.0525256
	speed: 0.0032s/iter; left time: 150.1644s
Epoch: 1 cost time: 1.6613800525665283
Epoch: 1, Steps: 474 | Train Loss: 0.1141962 Vali Loss: 0.1646948 Test Loss: 0.0000000
Validation loss decreased (inf --> 0.164695).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.0844330
	speed: 0.0094s/iter; left time: 440.2887s
	iters: 200, epoch: 2 | loss: 0.1325845
	speed: 0.0030s/iter; left time: 140.2346s
	iters: 300, epoch: 2 | loss: 0.0625288
	speed: 0.0031s/iter; left time: 142.9997s
	iters: 400, epoch: 2 | loss: 0.0834241
	speed: 0.0030s/iter; left time: 138.6782s
Epoch: 2 cost time: 1.612511157989502
Epoch: 2, Steps: 474 | Train Loss: 0.0988746 Vali Loss: 0

## Получение предсказаний на настоящем test

In [53]:
from copy import copy

val_args = copy(args)

val_args.train_perc = 1.
val_args.train_only = True
val_args.data_path = 'xPatch_val_data.csv'
experiment = Exp_Main(val_args)

val_dataset, _ = experiment._get_data(flag='train')
true_val_datasets = val_dataset.datasets
len(true_val_datasets), len(true_val_datasets[0])

Use GPU: cuda:0
train 759


(69, 11)

In [54]:
test_args = copy(args)

test_args.train_perc = 1.
test_args.train_only = True
test_args.data_path = 'xPatch_test_data.csv'
experiment = Exp_Main(val_args)

test_dataset, _ = experiment._get_data(flag='train')
true_test_datasets = test_dataset.datasets
len(true_test_datasets), len(true_test_datasets[0])

Use GPU: cuda:0
train 759


(69, 11)

In [55]:
from tqdm.auto import tqdm
import numpy as np

val_predictions = {}
val_predictions_df = pd.DataFrame()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
all_codes = train_df['code'].unique()

for code, true_val_dataset in zip(all_codes, true_val_datasets):
    all_preds = []
    all_true = []

    for i in tqdm(range(0, len(true_val_dataset), args.pred_len), disable=True):
        batch_x, batch_y, _, _ = true_val_dataset[i]
        batch_x = torch.tensor(batch_x).unsqueeze(0).float().to(device)
        batch_y = torch.tensor(batch_y).unsqueeze(0).float().to(device)

        model.eval()
        with torch.no_grad():
            preds = model(batch_x)

        y_pred = scalers['nominal_wage'].inverse_transform(preds.detach().squeeze(0).cpu().numpy())[..., -1:].flatten()
        y_true = scalers['nominal_wage'].inverse_transform(batch_y.detach().squeeze(0).cpu().numpy())[:preds.shape[1], -1:].flatten()

        all_preds.append(y_pred)
        all_true.append(y_true)

    predictions = np.concatenate(all_preds)
    true_values = np.concatenate(all_true)

    df = pd.DataFrame([
                    predictions,
                    true_values,
                ]).transpose()
    
    df.columns = ['mean', 'y_true']
    df['code'] = code
    df['0.1'] = get_quantile_from_median(df['mean'].values, 0.1)
    df['0.9'] = get_quantile_from_median(df['mean'].values, 0.9)

    val_predictions_df = pd.concat([val_predictions_df, df])

val_predictions['xPatch'] = val_predictions_df.set_index('code')


test_predictions = {}
test_predictions_df = pd.DataFrame()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
all_codes = train_df['code'].unique()

for code, true_test_dataset in zip(all_codes, true_test_datasets):
    all_preds = []
    all_true = []

    for i in tqdm(range(0, len(true_test_dataset), args.pred_len), disable=True):
        batch_x, batch_y, _, _ = true_test_dataset[i]
        batch_x = torch.tensor(batch_x).unsqueeze(0).float().to(device)
        batch_y = torch.tensor(batch_y).unsqueeze(0).float().to(device)

        model.eval()
        with torch.no_grad():
            preds = model(batch_x)

        y_pred = scalers['nominal_wage'].inverse_transform(preds.detach().squeeze(0).cpu().numpy())[..., -1:].flatten()
        y_true = scalers['nominal_wage'].inverse_transform(batch_y.detach().squeeze(0).cpu().numpy())[:preds.shape[1], -1:].flatten()

        all_preds.append(y_pred)
        all_true.append(y_true)

    predictions = np.concatenate(all_preds)
    true_values = np.concatenate(all_true)

    df = pd.DataFrame([
                    predictions,
                    true_values,
                ]).transpose()
    
    df.columns = ['mean', 'y_true']
    df['code'] = code
    df['0.1'] = get_quantile_from_median(df['mean'].values, 0.1)
    df['0.9'] = get_quantile_from_median(df['mean'].values, 0.9)

    test_predictions_df = pd.concat([test_predictions_df, df])

test_predictions['xPatch'] = test_predictions_df.set_index('code')

In [58]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import datetime
from utils.plotting import plot_forecasts_val_test


date_col = pd.to_datetime(test_df["date"])
min_date = date_col.min().date()
max_date = date_col.max().date()
size_multiplyer = 2
height = 400 * size_multiplyer
width = 800 * size_multiplyer
item_id = 1
title = f'Предсказания номинальной заработной платы (для code = {item_id})'

start_date_picker = widgets.DatePicker(
    description="Start date:", disabled=False, value=min_date
)

end_date_picker = widgets.DatePicker(
    description="End date:", disabled=False, value=max_date
)

output_area = widgets.Output()


def on_button_clicked(b):
    with output_area:
        clear_output(wait=True)
        start_date = datetime.datetime.combine(
            start_date_picker.value, datetime.datetime.min.time()
        )
        end_date = datetime.datetime.combine(
            end_date_picker.value, datetime.datetime.min.time()
        )
        plot_forecasts_val_test(
            val_df=val_df_,
            test_df=test_df_,
            val_predictions=all_val_models_predictions_,
            test_predictions=test_predictions,
            title=title,
            start_date=start_date,
            end_date=end_date,
            height=height,
            width=width,
            item_id=item_id,
        )


plot_button = widgets.Button(description="Plot Forecasts")
plot_button.on_click(on_button_clicked)

controls = widgets.VBox(
    [widgets.HBox([start_date_picker, end_date_picker]), plot_button]
)

display(controls, output_area)

val_df_ = val_df.rename(columns={'date': 'timestamp', "nominal_wage": "target"})[['code', 'timestamp', "target"]]
val_df_ = val_df_[val_df_['code'].eq(item_id)].reset_index(drop=True)
val_df_['timestamp'] = pd.to_datetime(val_df_['timestamp'])

test_df_ = test_df.rename(columns={'date': 'timestamp', "nominal_wage": "target"})[['code', 'timestamp', "target"]]
test_df_ = test_df_[test_df_['code'].eq(item_id)].reset_index(drop=True)
test_df_['timestamp'] = pd.to_datetime(test_df_['timestamp'])

val_df_ = pd.concat([val_df_, test_df_.iloc[[0]]])

all_val_models_predictions_ = val_predictions.copy()
for model_ in all_val_models_predictions_.keys():
    all_val_models_predictions_[model_] = pd.concat([all_val_models_predictions_[model_], test_predictions[model_].loc[[item_id]].iloc[[0]]])

with output_area:
    plot_forecasts_val_test(
        val_df=val_df_,
        test_df=test_df_,
        val_predictions=all_val_models_predictions_,
        test_predictions=test_predictions,
        title=title,
        height=height,
        width=width,
        item_id=item_id,
    )

VBox(children=(HBox(children=(DatePicker(value=datetime.date(2023, 1, 1), description='Start date:'), DatePick…

Output()

In [43]:
all_models_metrics = {}

test_predictions
for model in test_predictions.keys():
    metrics_df = []
    for code in all_codes:
        pred_df = pd.concat([
            test_predictions[model].rename(columns={'mean': '0.5'})
            .loc[code][["0.1", "0.5", "0.9"]]
            .reset_index(drop=True),
            test_df[test_df["code"].eq(code)][["nominal_wage"]].reset_index(drop=True),
        ], axis=1)
        pred_df = pd.DataFrame(pred_df)

        metrics_df.append(calculate_sklearn_metrics(pred_df, target_column='nominal_wage'))

    metrics_dict = pd.DataFrame(metrics_df).mean().to_dict()

    all_models_metrics[model] = metrics_dict

all_models_metrics

{'xPatch': {'MSE': 172628571.59412074,
  'MAE': 11447.651473335598,
  'MAPE': 14.688398540614227,
  'MASE': 3.4372183596557337,
  'SQL': 3415.250993703453}}

In [None]:
1/0

In [None]:
prefix = 'xPatch'

for k, metrics_ in all_models_metrics.items():
    run_name = f"{k}_{prefix}"

    with mlflow.start_run(run_name=run_name):
        mlflow.log_metrics(metrics_)
        mlflow.log_param("model_name", k)

        mlflow.set_tag("prefix", prefix)

🏃 View run xPatch_xPatch at: http://127.0.0.1:5000/#/experiments/169882278836627198/runs/8336b198560a4747bf9a7f81c569cda8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169882278836627198
