# Informer Implemented

The architecture has three distinctive features:
* A ProbSparse self-attention mechanism with an O time and memory complexity Llog(L).
* A self-attention distilling process that prioritizes attention and efficiently handles long input sequences.
* An MLP multi-step decoder that predicts long time-series sequences in a single forward operation rather than step-by-step.

The Informer model utilizes a three-component approach to define its embedding:
* It employs encoded autoregressive features obtained from a convolution network.
* 'It uses window-relative positional embeddings derived from harmonic functions.
* Absolute positional embeddings obtained from calendar features are utilized.

Inspiration from: 
* https://github.com/zhouhaoyi/Informer2020/

## Imports

In [29]:
import math
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

import pytorch_lightning as pl

## Set Seed for Reproducibility

In [30]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
pl.seed_everything(seed)

Seed set to 42


42

## Code from Informer2020 github (Informer implemented) (Paper implementation)

In [31]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (B, L, d_model)
        x = x + self.pe[:, :x.size(1)]
        return x


class ProbAttention(nn.Module):
    """
    A complete version of the probabilistic attention module.
    For simplicity, this implementation computes full attention,
    but it is structured similarly to the Informer2020 version.
    """

    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.mask_flag = mask_flag
        self.factor = factor
        self.scale = scale
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask=None):
        # queries, keys, values: (B, L, H, D)
        # Permute to (B, H, L, D)
        queries = queries.permute(0, 2, 1, 3)
        keys = keys.permute(0, 2, 1, 3)
        values = values.permute(0, 2, 1, 3)

        # Compute full attention scores
        scores = torch.matmul(
            queries, keys.transpose(-2, -1))  # (B, H, Lq, Lk)
        if self.scale:
            scores = scores / self.scale
        # Apply mask only for self-attention (query and key lengths match)
        if self.mask_flag and attn_mask is None and (queries.size(-2) == keys.size(-2)):
            L = queries.size(-2)
            attn_mask = torch.tril(torch.ones(
                L, L, device=queries.device)).unsqueeze(0).unsqueeze(0)
            scores = scores.masked_fill(attn_mask == 0, -1e9)
        elif attn_mask is not None:
            scores = scores.masked_fill(attn_mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        output = torch.matmul(attn, values)  # (B, H, Lq, D)
        output = output.permute(0, 2, 1, 3)    # (B, Lq, H, D)
        if self.output_attention:
            return output, attn
        else:
            return output, None

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_head = d_model // n_heads
        self.n_heads = n_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        # Use the complete ProbAttention module
        self.attention = ProbAttention(
            scale=self.d_head ** 0.5, attention_dropout=dropout)

    def forward(self, q, k, v, attn_mask=None):
        B, Lq, _ = q.shape
        B, Lk, _ = k.shape  # Use k's own sequence length
        q = self.q_linear(q).view(B, Lq, self.n_heads, self.d_head)
        k = self.k_linear(k).view(B, Lk, self.n_heads, self.d_head)
        v = self.v_linear(v).view(B, Lk, self.n_heads, self.d_head)
        # Apply attention
        out, attn = self.attention(q, k, v, attn_mask)
        # Concatenate heads and pass through the final linear layer
        out = out.contiguous().view(B, Lq, self.n_heads * self.d_head)
        out = self.out_linear(out)
        return out
# --- Feed Forward Network ---


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))

# --- Encoder Layer ---


class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        new_x = self.attention(x, x, x, attn_mask)
        x = x + self.dropout(new_x)
        x = self.norm1(x)
        new_x = self.ff(x)
        x = x + self.dropout(new_x)
        x = self.norm2(x)
        return x

# --- Decoder Layer ---
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.cross_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, self_mask=None, cross_mask=None):
        new_x = self.self_attention(x, x, x, self_mask)
        x = x + self.dropout(new_x)
        x = self.norm1(x)
        new_x = self.cross_attention(x, enc_out, enc_out, cross_mask)
        x = x + self.dropout(new_x)
        x = self.norm2(x)
        new_x = self.ff(x)
        x = x + self.dropout(new_x)
        x = self.norm3(x)
        return x


# --- Encoder ---
class Encoder(nn.Module):
    def __init__(self, layer, num_layers, norm_layer=None):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([layer for _ in range(num_layers)])
        self.norm = norm_layer if norm_layer is not None else nn.Identity()

    def forward(self, x, attn_mask=None):
        for layer in self.layers:
            x = layer(x, attn_mask)
        return self.norm(x)


# --- Decoder ---
class Decoder(nn.Module):
    def __init__(self, layer, num_layers, norm_layer=None):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([layer for _ in range(num_layers)])
        self.norm = norm_layer if norm_layer is not None else nn.Identity()

    def forward(self, x, enc_out, self_mask=None, cross_mask=None):
        for layer in self.layers:
            x = layer(x, enc_out, self_mask, cross_mask)
        return self.norm(x)


# --- Informer Model ---
class Informer(nn.Module):
    def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len,
                 d_model=512, n_heads=8, e_layers=2, d_layers=1, d_ff=512, dropout=0.05):
        super(Informer, self).__init__()
        self.seq_len = seq_len
        self.label_len = label_len
        self.out_len = out_len

        # Input embeddings
        self.enc_embedding = nn.Linear(enc_in, d_model)
        self.dec_embedding = nn.Linear(dec_in, d_model)
        self.positional_encoding = PositionalEncoding(d_model)

        # Encoder
        encoder_layer = EncoderLayer(d_model, n_heads, d_ff, dropout)
        self.encoder = Encoder(encoder_layer, e_layers,
                               norm_layer=nn.LayerNorm(d_model))

        # Decoder
        decoder_layer = DecoderLayer(d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(decoder_layer, d_layers,
                               norm_layer=nn.LayerNorm(d_model))

        # Final projection layer
        self.projection = nn.Linear(d_model, c_out)

    def forward(self, x_enc, x_dec=None):
        # x_enc: (B, seq_len, enc_in)
        # x_dec: (B, label_len + out_len, dec_in)
        if x_dec is None:
            zeros = torch.zeros(x_enc.size(0), self.out_len,
                                x_enc.size(-1), device=x_enc.device)
            x_dec = torch.cat([x_enc[:, -self.label_len:], zeros], dim=1)

        enc = self.positional_encoding(self.enc_embedding(x_enc))
        dec = self.positional_encoding(self.dec_embedding(x_dec))

        enc_out = self.encoder(enc)
        dec_out = self.decoder(dec, enc_out)

        out = self.projection(dec_out)
        return out[:, -self.out_len:, :]

## Data Preprocessing

In [32]:
# File containing your data (adjust path if needed)
DATA_FILE = "ConsumptionIndustry.csv"

# Read the CSV file.
# Adjust delimiter and decimal separator as needed.
df = pd.read_csv(DATA_FILE, sep=";", decimal=",",
                 parse_dates=["HourUTC", "HourDK"])
df = df.sort_values(by="HourUTC")

# Normalize the consumption values.
scaler = MinMaxScaler()
df["ConsumptionkWh"] = scaler.fit_transform(df[["ConsumptionkWh"]])


# Create sliding window sequences.
def create_sequences(data, input_len=336, output_len=24):
    X, y = [], []
    for i in range(len(data) - input_len - output_len):
        X.append(data[i: i + input_len])
        y.append(data[i + input_len: i + input_len + output_len])
    return np.array(X), np.array(y)


input_len = 336   # 14 days of hourly data
output_len = 24   # next 24 hours
X, y = create_sequences(df["ConsumptionkWh"].values, input_len, output_len)

# Convert arrays to tensors.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


# Create a custom Dataset.
class EnergyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


dataset = EnergyDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = torch.utils.data.random_split(
    dataset, [train_size, test_size])

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=0)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=0)

## PyTorch Lightning Module with Informer

In [33]:
class InformerModel(pl.LightningModule):
    def __init__(self, input_len=336, output_len=24, lr=1e-3):
        super().__init__()
        self.model = Informer(
            enc_in=1, # Number of features in the encoder input
            dec_in=1, # Number of features in the decoder input
            c_out=1, # Number of output features
            seq_len=input_len,
            label_len=input_len // 2,
            out_len=output_len,
            d_model=512,
            n_heads=8,
            e_layers=2,
            d_layers=1,
            d_ff=512,
            dropout=0.05
        )
        self.criterion = nn.MSELoss()
        self.lr = lr

    def forward(self, x):
        # x: (B, seq_len) -> add feature dimension.
        x = x.unsqueeze(-1)
        return self.model(x)

    def training_step(self, batch):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred.squeeze(), y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred.squeeze(), y)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

## Evaluation & Plotting Helper Functions

In [34]:
def evaluate_model(model, dataloader, device):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            preds.append(output.squeeze().cpu().numpy())
            trues.append(y.cpu().numpy())
    preds = np.concatenate(preds, axis=0)
    trues = np.concatenate(trues, axis=0)

    rmse = np.sqrt(np.mean((preds - trues) ** 2))
    mae = np.mean(np.abs(preds - trues))
    mape = np.mean(np.abs((preds - trues) / (trues + 1e-5))) * 100
    return rmse, mae, mape, preds, trues


def plot_predictions(preds, trues, sample=0, title="Prediction vs Actual", filename=None):
    plt.figure(figsize=(12, 6))
    plt.plot(trues[sample], label="Actual", marker="o")
    plt.plot(preds[sample], label="Predicted", marker="x")
    plt.title(title)
    plt.xlabel("Time Step")
    plt.ylabel("Normalized Consumption")
    plt.legend()
    plt.grid(True)
    if filename:
        plt.savefig(filename)
    plt.show()
    plt.close()


def plot_comparison(preds_fp32, preds_fp16, trues, sample=0, title="FP32 vs FP16 vs Actual", filename=None):
    plt.figure(figsize=(12, 6))
    plt.plot(trues[sample], label="Actual", marker="o")
    plt.plot(preds_fp32[sample], label="FP32 Predicted", marker="x")
    plt.plot(preds_fp16[sample], label="FP16 Predicted", marker="s")
    plt.title(title)
    plt.xlabel("Time Step")
    plt.ylabel("Normalized Consumption")
    plt.legend()
    plt.grid(True)
    if filename:
        plt.savefig(filename)
    plt.show()
    plt.close()


def plot_memory_usage(fp32_mem, fp16_mem, filename=None):
    plt.figure(figsize=(8, 6))
    plt.bar(['FP32', 'FP16'], [fp32_mem, fp16_mem], color=['blue', 'green'])
    plt.ylabel('Peak GPU Memory Usage (MB)')
    plt.title('GPU Memory Usage Comparison')
    if filename:
        plt.savefig(filename)
    plt.show()
    plt.close()

## Runner functions

In [35]:
def train_informer(precision_mode="32", epochs=10):
    trainer = pl.Trainer(
        max_epochs=epochs,
        precision=precision_mode,  # "32" for full precision or "16-mixed" for mixed precision
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        #devices=1,
        log_every_n_steps=10,
        max_steps=100
    )
    model = InformerModel()
    trainer.fit(model, train_dataloaders=train_loader,
                val_dataloaders=test_loader)
    return model


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def run_experiment(precision_mode, epochs=10):
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(device)
    start_time = time.time()
    model = train_informer(precision_mode=precision_mode, epochs=epochs)
    training_time = time.time() - start_time
    peak_memory = (torch.cuda.max_memory_allocated(device) /
                   (1024 ** 2)) if torch.cuda.is_available() else 0
    model.to(device)
    metrics = evaluate_model(model, test_loader, device)
    return model, training_time, peak_memory, metrics

## Full Precession vs Mixed Precession Example

In [None]:
# Run experiment for FP32
model_fp32, fp32_time, fp32_mem, metrics_fp32 = run_experiment("32", epochs=1)
rmse_fp32, mae_fp32, mape_fp32, preds_fp32, trues_fp32 = metrics_fp32

# Run experiment for FP16
model_fp16, fp16_time, fp16_mem, metrics_fp16 = run_experiment(
    "16-mixed", epochs=1)
rmse_fp16, mae_fp16, mape_fp16, preds_fp16, trues_fp16 = metrics_fp16

# Print summary metrics
print("=== FP32 ===")
print(f"Training Time: {fp32_time:.2f} sec")
print(f"Peak GPU Memory: {fp32_mem:.2f} MB")
print(f"RMSE: {rmse_fp32:.4f} | MAE: {mae_fp32:.4f} | MAPE: {mape_fp32:.2f}%\n")

print("=== FP16 ===")
print(f"Training Time: {fp16_time:.2f} sec")
print(f"Peak GPU Memory: {fp16_mem:.2f} MB")
print(f"RMSE: {rmse_fp16:.4f} | MAE: {mae_fp16:.4f} | MAPE: {mape_fp16:.2f}%\n")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/homebrew/Caskroom/miniconda/base/envs/testing_env/lib/python3.9/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name      | Type     | Params | Mode 
-----------------------------------------------
0 | model     | Informer | 4.2 M  | train
1 | criterion | MSELoss  | 0      | train
-----------------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.849    Total estimated model params size (MB)
50        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/homebrew/Caskroom/miniconda/base/envs/testing_env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/opt/homebrew/Caskroom/miniconda/base/envs/testing_env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
/opt/homebrew/Caskroom/miniconda/base/envs/testing_env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:512: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type     | Params | Mode 
-----------------------------------------------
0 | model     | Informer | 4.2 M  | train
1 | criterion | MSELoss  | 0      | train
-----------------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.849    Total estimated model params size (MB)
50        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

## Plot and Save Results

In [None]:
plot_predictions(preds_fp32, trues_fp32, sample=0,
                 title="FP32: Prediction vs Actual", filename="fp32_prediction.png")
plot_predictions(preds_fp16, trues_fp16, sample=0,
                 title="FP16: Prediction vs Actual", filename="fp16_prediction.png")
plot_comparison(preds_fp32, preds_fp16, trues_fp32, sample=0,
                title="Comparison: FP32 vs FP16 vs Actual", filename="combined_comparison.png")
plot_memory_usage(fp32_mem, fp16_mem, filename="gpu_memory_usage.png")

summary_text = (
    "=== Summary of Results ===\n"
    f"FP32  --> Training Time: {fp32_time:.2f} sec | Peak GPU Memory: {fp32_mem:.2f} MB | RMSE: {rmse_fp32:.4f}, MAE: {mae_fp32:.4f}, MAPE: {mape_fp32:.2f}%\n"
    f"FP16  --> Training Time: {fp16_time:.2f} sec | Peak GPU Memory: {fp16_mem:.2f} MB | RMSE: {rmse_fp16:.4f}, MAE: {mae_fp16:.4f}, MAPE: {mape_fp16:.2f}%\n"
)
print(summary_text)
with open("results_summary.txt", "w") as f:
    f.write(summary_text)