In [1]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle" # https://drive.google.com/drive/folders/18KshTFZ6gGQMeqUf5N6FZq2H1P9fNYlB?usp=sharing

In [4]:
!kaggle competitions download optiver-trading-at-the-close
!unzip optiver-trading-at-the-close.zip

Downloading optiver-trading-at-the-close.zip to /content
 99% 199M/201M [00:05<00:00, 41.2MB/s]
100% 201M/201M [00:05<00:00, 37.8MB/s]
Archive:  optiver-trading-at-the-close.zip
  inflating: example_test_files/revealed_targets.csv  
  inflating: example_test_files/sample_submission.csv  
  inflating: example_test_files/test.csv  
  inflating: optiver2023/__init__.py  
  inflating: optiver2023/competition.cpython-310-x86_64-linux-gnu.so  
  inflating: public_timeseries_testing_util.py  
  inflating: train.csv               


In [5]:
df = pd.read_csv("/content/train.csv")

In [6]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df

from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features

# generate imbalance features
def imbalance_features(df):
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size',
                'wap', 'near_price', 'far_price']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)

# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60

    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

# generate all features
def generate_all_features(df):
    #cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    #df = df[cols]
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    #feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]
    return df
    #return df[feature_name]

weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

weights = {int(k):v for k,v in enumerate(weights)}

In [7]:
global_stock_id_feats = {
        "median_size": df.groupby("stock_id")["bid_size"].median() + df.groupby("stock_id")["ask_size"].median(),
        "std_size": df.groupby("stock_id")["bid_size"].std() + df.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df.groupby("stock_id")["bid_size"].max() - df.groupby("stock_id")["bid_size"].min(),
        "median_price": df.groupby("stock_id")["bid_price"].median() + df.groupby("stock_id")["ask_price"].median(),
        "std_price": df.groupby("stock_id")["bid_price"].std() + df.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df.groupby("stock_id")["bid_price"].max() - df.groupby("stock_id")["ask_price"].min(),
  }

In [8]:
# split_day = 435
# df_train = df[df["date_id"] <= split_day]
# df_valid = df[df["date_id"] > split_day]

# global_stock_id_feats = {
#         "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
#         "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
#         "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
#         "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
#         "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
#         "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
#   }


In [9]:
# df_train = df
# df_train_feats = generate_all_features(df_train)
# df_train_feats = reduce_mem_usage(df_train_feats)

# offline_split = df_train['date_id']>(split_day - 45)
# df_offline_train = df_train_feats[~offline_split]
# df_offline_valid = df_train_feats[offline_split]
# df_offline_train_target = df_train['target'][~offline_split]
# df_offline_valid_target = df_train['target'][offline_split]

# # Check and remove NaN values
# df_offline_train_target = df_offline_train_target.dropna()
# df_offline_valid_target = df_offline_valid_target.dropna()

# # Check and remove infinite values, if any
# df_offline_train_target.replace([np.inf, -np.inf], np.nan, inplace=True)
# df_offline_valid_target.replace([np.inf, -np.inf], np.nan, inplace=True)

# df_offline_train_target.dropna(inplace=True)
# df_offline_valid_target.dropna(inplace=True)

# df_offline_train = df_offline_train.loc[df_offline_train_target.index]
# df_offline_valid = df_offline_valid.loc[df_offline_valid_target.index]

# # Function to remove NaN and Inf values from DataFrame
# def remove_nan_inf(df):
#     df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace Inf with NaN
#     df.dropna(inplace=True)  # Drop all NaNs
#     return df

# # Clean the data for both features and targets
# df_offline_train = remove_nan_inf(df_offline_train)
# df_offline_valid = remove_nan_inf(df_offline_valid)
# df_offline_train_target = remove_nan_inf(df_offline_train_target)
# df_offline_valid_target = remove_nan_inf(df_offline_valid_target)

# # Ensure the indices of features and targets match
# df_offline_train_target = df_offline_train_target.loc[df_offline_train.index]
# df_offline_valid_target = df_offline_valid_target.loc[df_offline_valid.index]
# df_offline_train = df_offline_train.reindex(df_offline_train_target.index)
# df_offline_valid = df_offline_valid.reindex(df_offline_valid_target.index)

# X_train = torch.tensor(df_offline_train.values, dtype=torch.float32)
# y_train = torch.tensor(df_offline_train_target.values, dtype=torch.float32)
# X_val = torch.tensor(df_offline_valid.values, dtype=torch.float32)
# y_val = torch.tensor(df_offline_valid_target.values, dtype=torch.float32)


In [10]:
def imputer(df):
    far_price_mean = df['far_price'].mean()
    near_price_mean = df['near_price'].mean()
    df['far_price'] = df['far_price'].fillna(far_price_mean)
    df['near_price'] = df['near_price'].fillna(near_price_mean)

    return df, far_price_mean, near_price_mean

def add_missing_data(df):
    all_stock_ids = set(range(200))
    all_missed_data_list = []

    grouped = df.groupby('time_id')

    for t, group in grouped:
        current_stock_ids = set(group['stock_id'].to_list())
        missed_stock_id = list(all_stock_ids - current_stock_ids)

        date_id = group['date_id'].iloc[-1]
        seconds_in_bucket = group['seconds_in_bucket'].iloc[-1]

        missed_stock_id_num = len(missed_stock_id)
        missed_date_id = [date_id] * missed_stock_id_num
        missed_seconds_in_bucket = [seconds_in_bucket] * missed_stock_id_num
        missed_time_id = [t] * missed_stock_id_num

        missed_data = pd.DataFrame({
            'stock_id': missed_stock_id,
            'date_id': missed_date_id,
            'seconds_in_bucket': missed_seconds_in_bucket,
            'time_id': missed_time_id
        })

        all_missed_data_list.append(missed_data)

    all_missed_data = pd.concat(all_missed_data_list, axis=0).reset_index(drop=True).astype(int)

    df = pd.concat([df, all_missed_data], axis=0)
    df = df.sort_values(by=['time_id', 'stock_id']).reset_index(drop=True)
    df = df.groupby('stock_id').apply(lambda x: x.fillna(method='bfill')).reset_index(drop=True)

    return df

train, far_price_mean, near_price_mean = imputer(df)
train = add_missing_data(df)

def sizesum_and_pricestd(df):
    price_ftrs = ['reference_price', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap'] # std
    size_ftrs = ['imbalance_size', 'matched_size', 'bid_size', 'ask_size'] # sum

    rolled = df[['stock_id'] + size_ftrs].groupby('stock_id').rolling(window=6, min_periods=1).sum()
    rolled = rolled.reset_index(level=0, drop=True)
    for col in size_ftrs:
        df[f'{col}_rolled_sum'] = rolled[col]

    rolled = df[['stock_id'] + price_ftrs].groupby('stock_id').rolling(window=6, min_periods=1).std().fillna(0)
    rolled = rolled.reset_index(level=0, drop=True)
    for col in price_ftrs:
        df[f'{col}_rolled_std'] = rolled[col]

    return df

train = sizesum_and_pricestd(train)

def remove_element(input_list, drop_list):
    return [e for e in input_list if e not in drop_list]

no_feature_cols = ['date_id', 'row_id', 'time_id', 'target', 'currently_scored']

feature_cols = remove_element(train.columns, no_feature_cols)
target_col = 'target'

In [11]:
avg = train[feature_cols].mean()
std = train[feature_cols].std()

train[feature_cols] = (train[feature_cols] - avg)/std

In [12]:
train = train.astype('float32')
train = reduce_mem_usage(train)

seq_len = 16

# Grouping by time_id
grouped_by_time = train.groupby('stock_id')

def generate_data(grouped_by_time, seq_len):
    for _, group in grouped_by_time:
        # Sorting by stock_id to maintain consistency across images
        group_sorted = group.sort_values(by='time_id')

        features = group_sorted[feature_cols].values

        windows = []

        for t in range(0, seq_len - 1):
            copy_0 = np.stack([features[0]] * (seq_len - 1 - t))
            cut_0 = features[: t + 1]
            windows.append(np.vstack((copy_0, cut_0)))

        for t in range(0, features.shape[0] - seq_len + 1):
            windows.append(features[t: t+seq_len, :])

        # Convert list of windows to numpy array
        features_array = np.stack(windows)

        target = group_sorted['target'].values

        # Yield the result for this group to avoid storing all results in memory
        yield features_array, target

# Use generator to iterate over data
data_generator = generate_data(grouped_by_time, seq_len=seq_len)

# If you need to store results in arrays:
datas, labels = zip(*data_generator)
data = np.array(datas).reshape(-1, seq_len, len(feature_cols))
label = np.array(labels).reshape(-1,)

In [13]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Subset, random_split

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)

data = torch.tensor(data, dtype=torch.float32).to(device)
label = torch.tensor(label, dtype=torch.float32).to(device)

dataset = TensorDataset(data, label)

train_ratio = 0.8
train_size = int(train_ratio * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

batch_size = 1024

train_loader = DataLoader(train_dataset, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [18]:
import torch
import torch.nn as nn

class ProbSparseAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super(ProbSparseAttention, self).__init__()
        self.attention = nn.MultiheadAttention(d_model, nhead)

    def forward(self, x):
        output, _ = self.attention(x, x, x)
        return output

class InformerEncoder(nn.Module):
    def __init__(self, d_model, nhead, num_layers, attention_mode='prob'):
        super(InformerEncoder, self).__init__()
        self.layers = nn.ModuleList([ProbSparseAttention(d_model, nhead) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class InformerDecoder(nn.Module):
    def __init__(self, d_model, nhead, num_layers, attention_mode='prob'):
        super(InformerDecoder, self).__init__()
        self.layers = nn.ModuleList([ProbSparseAttention(d_model, nhead) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class MyInformerModel(nn.Module):
    def __init__(self, feature_num, d_model, nhead, num_layers, attn_mode='prob'):
        super(MyInformerModel, self).__init__()
        self.embedding = nn.Linear(feature_num, d_model)
        self.encoder = InformerEncoder(d_model, nhead, num_layers, attn_mode)
        self.decoder = InformerDecoder(d_model, nhead, num_layers, attn_mode)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.embedding(x)
        x_enc = self.encoder(x)
        x_dec = self.decoder(x_enc)
        x_out = self.fc(x_dec[:, -1, :])
        return x_out

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
import math

tot_train_losses = []
tot_train_mae = []
tot_train_rmse = []
tot_train_rmsle = []

tot_valid_losses = []
tot_valid_mae = []
tot_valid_rmse = []
tot_valid_rmsle = []

is_train = True
if is_train:
    input_size = data.shape[-1]

    n_epochs = 50
    lr = 1e-03

    pre_epoch_valid_mae = np.inf
    patience_counter = 0

    model = MyInformerModel(feature_num=input_size, d_model=64, nhead=2, num_layers=1).to(device)
    #model = MyModel(feature_num=input_size, d_model=64, nhead=2, num_layers=1)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    loss = nn.L1Loss().to(device)
    #loss = nn.L1Loss()

    out_path = "model/"
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    best_mae = np.inf

    print(f'Train start...')
    for epoch in range(n_epochs):
        model.train()
        train_losses = []
        train_mae = []
        train_rmse = []
        batch_num = len(train_loader)

        # Training
        for X, y in train_loader:
            optimizer.zero_grad()
            outputs = model(X).squeeze()
            l = loss(outputs, y)
            l.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
            train_losses.append(l.item())

            # Calculate MAE
            mae = mean_absolute_error(y.cpu().numpy(), outputs.detach().cpu().numpy())
            train_mae.append(mae)

            # Calculate RMSE
            rmse = math.sqrt(mean_squared_error(y.cpu().numpy(), outputs.detach().cpu().numpy()))
            train_rmse.append(rmse)

        epoch_train_loss = np.mean(train_losses)
        epoch_train_mae = np.mean(train_mae)
        epoch_train_rmse = np.mean(train_rmse)

        tot_train_losses.append(epoch_train_loss)
        tot_train_mae.append(epoch_train_mae)
        tot_train_rmse.append(epoch_train_rmse)

        print(f'Epoch [{epoch+1}/{n_epochs}] Training Loss: {epoch_train_loss:.4f}')
        print(f'Epoch [{epoch+1}/{n_epochs}] Training MAE: {epoch_train_mae:.4f}')
        print(f'Epoch [{epoch+1}/{n_epochs}] Training RMSE: {epoch_train_rmse:.4f}')

        train_maes = []

        model.eval()
        with torch.no_grad():
            valid_losses = []
            valid_maes = []
            valid_rmse = []
            valid_rmsle = []

            for X_v, y_v in valid_loader:
                preds = model(X_v).squeeze()
                valid_loss = loss(preds, y_v)
                valid_losses.append(valid_loss.item())

                # Calculate MAE
                valid_mae = mean_absolute_error(y_v.cpu().numpy(), preds.cpu().numpy())
                valid_maes.append(valid_mae)

                # Calculate RMSE
                valid_rmse_val = math.sqrt(mean_squared_error(y_v.cpu().numpy(), preds.cpu().numpy()))
                valid_rmse.append(valid_rmse_val)

            epoch_valid_loss = np.mean(valid_losses)
            epoch_valid_mae = np.mean(valid_maes)
            epoch_valid_rmse = np.mean(valid_rmse)

            tot_valid_losses.append(epoch_train_loss)
            tot_valid_mae.append(epoch_train_mae)
            tot_valid_rmse.append(epoch_train_rmse)

            print(f'Epoch [{epoch+1}/{n_epochs}] Validation Loss: {epoch_valid_loss:.4f}')
            print(f'Epoch [{epoch+1}/{n_epochs}] Validation MAE: {epoch_valid_mae:.4f}')
            print(f'Epoch [{epoch+1}/{n_epochs}] Validation RMSE: {epoch_valid_rmse:.4f}')


            if epoch_valid_mae < best_mae:
                best_mae = epoch_valid_mae
                torch.save(model, os.path.join(out_path, f"model_epoch_{epoch+1}.pt"))

        if epoch_valid_mae - pre_epoch_valid_mae > 0:
            patience_counter += 1

            if patience_counter == 2:
                lr = lr * 0.5
                patience_counter = 0
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                    print(f'renew lr to {lr}')

        pre_epoch_valid_mae = epoch_valid_mae

        if (epoch_valid_mae - epoch_train_mae > 0.03) or (lr <1e-7):
            print('Early stop now.')
            break

    print(f'Train over.')

Train start...
Epoch [1/50] Training Loss: 6.4830
Epoch [1/50] Training MAE: 6.4830
Epoch [1/50] Training RMSE: 9.6304
Epoch [1/50] Validation Loss: 6.4800
Epoch [1/50] Validation MAE: 6.4800
Epoch [1/50] Validation RMSE: 9.6153
Epoch [2/50] Training Loss: 6.4784
Epoch [2/50] Training MAE: 6.4784
Epoch [2/50] Training RMSE: 9.6242
Epoch [2/50] Validation Loss: 6.4824
Epoch [2/50] Validation MAE: 6.4824
Epoch [2/50] Validation RMSE: 9.6211
Epoch [3/50] Training Loss: 6.4803
Epoch [3/50] Training MAE: 6.4803
Epoch [3/50] Training RMSE: 9.6274
Epoch [3/50] Validation Loss: 6.4824
Epoch [3/50] Validation MAE: 6.4824
Epoch [3/50] Validation RMSE: 9.6210
renew lr to 0.0005
Epoch [4/50] Training Loss: 6.4794
Epoch [4/50] Training MAE: 6.4794
Epoch [4/50] Training RMSE: 9.6260
Epoch [4/50] Validation Loss: 6.4798
Epoch [4/50] Validation MAE: 6.4798
Epoch [4/50] Validation RMSE: 9.6171
Epoch [5/50] Training Loss: 6.4763
Epoch [5/50] Training MAE: 6.4763
Epoch [5/50] Training RMSE: 9.6218
Epoch 

In [20]:
torch.save(model.state_dict(), os.path.join(out_path, "informer.pt"))

In [21]:
import pickle

metrics = {
    "Train Loss": tot_train_losses,
    "Train MAE": tot_train_mae,
    "Train RMSE": tot_train_rmse,
    "Validation Loss": tot_valid_losses,
    "Validation MAE": tot_valid_mae,
    "Validation RMSE": tot_valid_rmse
}

with open('informer_training_metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)

In [22]:
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('informer_training_metrics.csv', index=False)

In [23]:

metrics_df = pd.read_csv('informer_training_metrics.csv')
metrics_df

Unnamed: 0,Train Loss,Train MAE,Train RMSE,Validation Loss,Validation MAE,Validation RMSE
0,6.482982,6.482982,9.630422,6.482982,6.482982,9.630422
1,6.478449,6.478448,9.62416,6.478449,6.478448,9.62416
2,6.480269,6.480269,9.627383,6.480269,6.480269,9.627383
3,6.479428,6.479428,9.625975,6.479428,6.479428,9.625975
4,6.476336,6.476336,9.621783,6.476336,6.476336,9.621783
5,6.47671,6.47671,9.622218,6.47671,6.47671,9.622218
6,6.47863,6.47863,9.623935,6.47863,6.47863,9.623935
7,6.476684,6.476685,9.62247,6.476684,6.476685,9.62247
8,6.475605,6.475606,9.621426,6.475605,6.475606,9.621426
9,6.475343,6.475343,9.620632,6.475343,6.475343,9.620632


In [1]:
data = {
    "Train Loss": [6.482982, 6.478449, 6.480269, 6.479428, 6.476336, 6.476710, 6.478630, 6.476684, 6.475605, 6.475343, 6.474865, 6.476096, 6.472729, 6.472053, 6.473843, 6.473490, 6.470880, 6.469275, 6.469202, 6.468104, 6.466616, 6.465229, 6.464994, 6.464488, 6.465793, 6.464047, 6.464299, 6.465654, 6.463569, 6.462935, 6.462535, 6.461447, 6.460866, 6.460669, 6.460634, 6.460246, 6.459751, 6.459905, 6.460261, 6.460083, 6.459951, 6.459936, 6.459977, 6.459865, 6.459746, 6.459672, 6.459611, 6.459496, 6.459499, 6.459494],
    "Train MAE": [6.482982, 6.478448, 6.480269, 6.479428, 6.476336, 6.476710, 6.478630, 6.476685, 6.475606, 6.475343, 6.474864, 6.476096, 6.472729, 6.472053, 6.473843, 6.473490, 6.470880, 6.469276, 6.469202, 6.468105, 6.466617, 6.465229, 6.464994, 6.464488, 6.465793, 6.464047, 6.464300, 6.465653, 6.463569, 6.462935, 6.462535, 6.461447, 6.460865, 6.460669, 6.460634, 6.460246, 6.459751, 6.459905, 6.460261, 6.460083, 6.459952, 6.459936, 6.459978, 6.459865, 6.459746, 6.459672, 6.459611, 6.459496, 6.459498, 6.459494],
    "Train RMSE": [9.630422, 9.624160, 9.627383, 9.625975, 9.621783, 9.622218, 9.623935, 9.622470, 9.621426, 9.620632, 9.620620, 9.623488, 9.618969, 9.617699, 9.619530, 9.619655, 9.616009, 9.613617, 9.612922, 9.611592, 9.610374, 9.608401, 9.608574, 9.608452, 9.608580, 9.606394, 9.608216, 9.610260, 9.607933, 9.607358, 9.606894, 9.605586, 9.605419, 9.605061, 9.604075, 9.603869, 9.603538, 9.603889, 9.604461, 9.604540, 9.604321, 9.604386, 9.604464, 9.604301, 9.604125, 9.604029, 9.603913, 9.603779, 9.603746, 9.603714],
    "Validation Loss": [6.482982, 6.478449, 6.480269, 6.479428, 6.476336, 6.476710, 6.478630, 6.476684, 6.475605, 6.475343, 6.474865, 6.476096, 6.472729, 6.472053, 6.473843, 6.473490, 6.470880, 6.469275, 6.469202, 6.468104, 6.466616, 6.465229, 6.464994, 6.464488, 6.465793, 6.464047, 6.464299, 6.465654, 6.463569, 6.462935, 6.462535, 6.461447, 6.460866, 6.460669, 6.460634, 6.460246, 6.459751, 6.459905, 6.460261, 6.460083, 6.459951, 6.459936, 6.459977, 6.459865, 6.459746, 6.459672, 6.459611, 6.459496, 6.459499, 6.459494],
    "Validation MAE": [6.482982, 6.478448, 6.480269, 6.479428, 6.476336, 6.476710, 6.478630, 6.476685, 6.475606, 6.475343, 6.474864, 6.476096, 6.472729, 6.472053, 6.473843, 6.473490, 6.470880, 6.469276, 6.469202, 6.468105, 6.466617, 6.465229, 6.464994, 6.464488, 6.465793, 6.464047, 6.464300, 6.465653, 6.463569, 6.462935, 6.462535, 6.461447, 6.460865, 6.460669, 6.460634, 6.460246, 6.459751, 6.459905, 6.460261, 6.460083, 6.459952, 6.459936, 6.459978, 6.459865, 6.459746, 6.459672, 6.459611, 6.459496, 6.459498, 6.459494],
    "Validation RMSE": [9.630422, 9.624160, 9.627383, 9.625975, 9.621783, 9.622218, 9.623935, 9.622470, 9.621426, 9.620632, 9.620620, 9.623488, 9.618969, 9.617699, 9.619530, 9.619655, 9.616009, 9.613617, 9.612922, 9.611592, 9.610374, 9.608401, 9.608574, 9.608452, 9.608580, 9.606394, 9.608216, 9.610260, 9.607933, 9.607358, 9.606894, 9.605586, 9.605419, 9.605061, 9.604075, 9.603869, 9.603538, 9.603889, 9.604461, 9.604540, 9.604321, 9.604386, 9.604464, 9.604301, 9.604125, 9.604029, 9.603913, 9.603779, 9.603746, 9.603714]
}

In [3]:
import pandas as pd

df = pd.DataFrame(data)
df.min()

Train Loss         6.459494
Train MAE          6.459494
Train RMSE         9.603538
Validation Loss    6.459494
Validation MAE     6.459494
Validation RMSE    9.603538
dtype: float64