In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import numpy as np
from Models import MoELSTM
import os
from collections import OrderedDict
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader

from typing import List, Tuple, Optional, Dict
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
import random
from Models import MoELSTM, LSTMModel, train_model
from Preprocess import (
    compute_metrics,
    convert_timeseries_to_numpy,
    create_dataloader,
    load_building_series,
    split_series_list,
)
import pandas as pd
from collections import defaultdict
import os
import torch
import torch.optim as optim
from tqdm import tqdm


from Models import model_fn
from tqdm import tqdm
from my_utils import train_model, load_energy_data_feather, get_weights, set_weights


In [2]:
from AggregationStrategy import sync_aggregate,average_weights,sync_aggregate_norm,sync_aggregate_softmax, fedavgm_update

In [3]:
# Load the full training table from the feather file so it can be inspected below.
df = pd.read_feather("train_final.feather")

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,primary_use,air_temperature
7593144,0,0,2016-05-21 01:00:00,72.221012,Education,25.6
7593145,1,0,2016-05-21 01:00:00,39.611586,Education,25.6
7593146,2,0,2016-05-21 01:00:00,1.920567,Education,25.6
7593147,3,0,2016-05-21 01:00:00,111.532464,Education,25.6
7593148,4,0,2016-05-21 01:00:00,456.734799,Education,25.6


In [5]:
# Summarise the frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11712248 entries, 7593144 to 20216099
Data columns (total 6 columns):
 #   Column           Dtype         
---  ------           -----         
 0   building_id      int64         
 1   meter            int64         
 2   timestamp        datetime64[ns]
 3   meter_reading    float64       
 4   primary_use      object        
 5   air_temperature  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 625.5+ MB


In [6]:


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Model names to run the experiment with; each one is passed to model_fn.
MODEL_NAMES = ["transformer"]

# Federated setup
NUM_CLIENTS = 1410    # size of the client pool that is sampled from
CLIENT_FRAC = 0.15    # fraction of the pool trained in every round
NUM_ROUNDS = 50       # number of communication rounds
LOCAL_EPOCHS = 5      # epochs each sampled client trains locally per round
LR = 1e-3             # learning rate handed to the local optimizer

# Run on the GPU when one is visible, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Feather file the client data is read from
# (alternative used elsewhere: "meter_0_data_cleaned.feather").
DATA_FILE = "train_final.feather"


In [7]:


def load_energy_data_feather_transformer(cid, filepath="meter_0_data_cleaned.feather"):
    """Build the training dataloader for a single client (building).

    Reads the feather file at ``filepath``, keeps the rows whose
    ``building_id`` equals ``cid``, turns ``meter_reading`` into an hourly
    darts ``TimeSeries`` indexed by ``timestamp``, fits a min-max scaler with
    range (0.1, 1) on the part of the series before the 75% split point and
    windows that scaled part via ``convert_timeseries_to_numpy`` with
    ``input_len=168`` and ``output_len=24``.

    Args:
        cid: Building id used to select this client's rows.
        filepath: Path of the feather file to read.

    Returns:
        ``(train_loader, 0)``. The second element is a placeholder: the test
        split is only checked for emptiness, no test loader is built.

    Raises:
        ValueError: If the building has no rows, the ``TimeSeries`` cannot be
            constructed, either split is empty, or no training window is left
            after preprocessing.
    """
    df = pd.read_feather(filepath)
    # .copy() detaches the selection from the full frame, so the fillna
    # assignment below is not a chained assignment on a slice.
    df = df[df['building_id'] == cid].copy()

    # Fail before doing any work on a frame that has nothing in it.
    if df.empty:
        raise ValueError(f"No data found for building_id {cid}")

    df['meter_reading'] = df['meter_reading'].fillna(0)

    try:
        ts = TimeSeries.from_dataframe(
            df,
            time_col='timestamp',
            value_cols='meter_reading',
            fill_missing_dates=True,
            freq='h'
        )
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Failed to construct TimeSeries: {e}") from e

    train_series, test_series = ts.split_before(0.75)

    if len(train_series) == 0 or len(test_series) == 0:
        raise ValueError(f"Empty time series for building_id {cid}. Train: {len(train_series)}, Test: {len(test_series)}")

    # The scaler is fitted on the training part only.
    scaler = MinMaxScaler(feature_range=(0.1, 1))
    transformer = Scaler(scaler)
    transformed_train_series = transformer.fit_transform(train_series)

    X_train, y_train = convert_timeseries_to_numpy(transformed_train_series, input_len=168, output_len=24)

    # fill_missing_dates=True can leave NaNs for the inserted timestamps;
    # replace them so they do not reach the loss.
    X_train = np.nan_to_num(X_train, nan=0.0)
    y_train = np.nan_to_num(y_train, nan=0.0)

    if len(X_train) == 0:
        raise ValueError(f"Client {cid} has no data after preprocessing.")

    train_loader = create_dataloader(X_train, y_train, batch_size=256)

    return train_loader, 0  # placeholder where a test loader would go


In [8]:
import math
class PositionalEncoding2(nn.Module):
    """Fixed sinusoidal positional encoding for batch-first sequences.

    Precomputes a ``(1, max_len, d_model)`` table in which even feature
    columns hold ``sin(position * freq)`` and odd feature columns hold
    ``cos(position * freq)``. The table is registered as the buffer ``pe``,
    so it follows the module across devices but is not a trainable parameter.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported.
        max_len: Number of positions the table covers.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # (ceil(d_model/2),)
        angles = position * div_term  # (max_len, ceil(d_model/2))

        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so the
        # cosine half only takes the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the encoding of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Returns:
            ``x`` plus the positional table sliced to the sequence length.
        """
        return x + self.pe[:, :x.size(1)]


In [9]:
class TimeSeriesTransformer2(nn.Module):
    """Encoder-only transformer that forecasts a univariate series.

    The embedded context window is followed by ``forecast_horizon`` learnable
    "forecast tokens". The combined sequence goes through a causally masked
    ``nn.TransformerEncoder``, and the encoder outputs at the forecast-token
    positions are projected to one value per step.

    Args:
        context_length: Number of input time steps; also used to size the
            positional-encoding table.
        forecast_horizon: Number of steps to predict.
        d_model: Embedding width used throughout the encoder.
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        d_ff: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout rate used inside the encoder layers and before the
            output projection.
    """

    def __init__(
        self,
        context_length: int = 168,
        forecast_horizon: int = 24,
        d_model: int = 128,
        n_heads: int = 8,
        n_layers: int = 4,
        d_ff: int = 512,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.context_length = context_length
        self.forecast_horizon = forecast_horizon

        # 1. Input projection: from 1 → d_model
        self.input_embedding = nn.Linear(1, d_model)

        # 2. Forecast tokens (learnable embeddings), one per predicted step
        self.forecast_tokens = nn.Parameter(torch.randn(1, forecast_horizon, d_model))

        # 3. Positional encoding for (context + forecast) steps
        self.pos_encoding = PositionalEncoding2(d_model, max_len=context_length + forecast_horizon)

        # 4. Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True  # Makes all inputs/outputs (batch, seq, dim)
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # 5. Output projection: d_model → 1
        self.output_projection = nn.Linear(d_model, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict the next ``forecast_horizon`` values.

        Args:
            x: Input tensor of shape (batch_size, context_length, 1). A 2-D
                tensor of shape (batch_size, context_length) is also accepted
                and treated as having a single feature per step.

        Returns:
            Tensor of shape (batch_size, forecast_horizon, 1)
        """
        # Accept (batch, context) by adding the missing feature axis.
        if x.dim() == 2:
            x = x.unsqueeze(-1)

        batch_size = x.size(0)

        # 1. Project input from (1) to (d_model)
        x = self.input_embedding(x)  # (batch_size, context_length, d_model)

        # 2. Expand forecast tokens across the batch (a view, no copy)
        forecast_tokens = self.forecast_tokens.expand(batch_size, -1, -1)  # (batch_size, forecast_horizon, d_model)

        # 3. Concatenate context + forecast tokens
        full_input = torch.cat([x, forecast_tokens], dim=1)  # (batch_size, context + horizon, d_model)

        # 4. Add positional encoding
        full_input = self.pos_encoding(full_input)

        # 5. Causal mask: True strictly above the diagonal, so no position can
        #    attend to a later one. Built directly on the input's device to
        #    avoid a CPU allocation plus host-to-device copy on every call.
        seq_len = full_input.size(1)
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=full_input.device), diagonal=1
        ).bool()

        # 6. Transformer encoder
        transformer_output = self.encoder(full_input, mask=causal_mask)  # (batch_size, seq_len, d_model)

        # 7. Slice only forecast part
        forecast_output = transformer_output[:, -self.forecast_horizon:, :]  # (batch_size, forecast_horizon, d_model)

        # 8. Project back to 1 value per timestep
        output = self.output_projection(self.dropout(forecast_output))  # (batch_size, forecast_horizon, 1)
        return output


In [10]:
class TimeSeriesDifficultyWeight:
    """Track a per-client "difficulty" score from the trend of training losses.

    For every client the tracker keeps the last reported loss, exponential
    moving averages of a "learn" term (loss went down) and an "unlearn" term
    (loss did not go down), and an EMA of their ratio, ``ema_difficulty``.
    The scores can be turned into aggregation weights or into client-sampling
    probabilities.

    Args:
        num_clients: Number of clients; all state tensors have this length.
        accumulate_iters: EMA window; the momentum is
            ``(accumulate_iters - 1) / accumulate_iters``.
        device: Device for the state tensors. Defaults to the module-level
            ``DEVICE`` when omitted.
    """

    def __init__(self, num_clients, accumulate_iters=20, device=None):
        self.num_clients = num_clients
        # Resolve the global only when no device was given, so the tracker can
        # also be used where DEVICE is not defined.
        self.device = DEVICE if device is None else device
        self.last_loss = torch.ones(num_clients, dtype=torch.float32, device=self.device)
        self.learn_score = torch.zeros(num_clients, dtype=torch.float32, device=self.device)
        self.unlearn_score = torch.zeros(num_clients, dtype=torch.float32, device=self.device)
        self.ema_difficulty = torch.ones(num_clients, dtype=torch.float32, device=self.device)
        self.accumulate_iters = accumulate_iters

    def update(self, cid: int, loss_history: List[float]) -> float:
        """Update the difficulty of client ``cid`` from its latest loss.

        Only the last entry of ``loss_history`` is used; it is compared with
        the loss stored from the client's previous update.

        Args:
            cid: Index of the client in the state tensors.
            loss_history: Per-epoch losses of the client's latest training run.

        Returns:
            The client's smoothed difficulty after the update. When
            ``loss_history`` is empty nothing is updated and the current value
            is returned.
        """
        if len(loss_history) == 0:
            return self.ema_difficulty[cid].item()

        current_loss = torch.tensor(float(loss_history[-1]), dtype=torch.float32, device=self.device)
        previous_loss = self.last_loss[cid]
        delta = current_loss - previous_loss
        ratio = torch.log((current_loss + 1e-8) / (previous_loss + 1e-8))

        # delta and log-ratio always share their sign, so the product is a
        # non-negative magnitude of the change. It is credited to "learn" when
        # the loss dropped and to "unlearn" otherwise. (Negating delta here,
        # as was done before, made the learn term negative and with it the
        # difficulty ratio below.)
        change = delta * ratio
        zero = torch.zeros_like(change)
        learn = torch.where(delta < 0, change, zero)
        unlearn = torch.where(delta >= 0, change, zero)

        # EMA update
        momentum = (self.accumulate_iters - 1) / self.accumulate_iters
        self.learn_score[cid] = momentum * self.learn_score[cid] + (1 - momentum) * learn
        self.unlearn_score[cid] = momentum * self.unlearn_score[cid] + (1 - momentum) * unlearn

        # Difficulty score: unlearning relative to learning
        difficulty = (self.unlearn_score[cid] + 1e-8) / (self.learn_score[cid] + 1e-8)

        # Smooth difficulty over rounds
        self.ema_difficulty[cid] = momentum * self.ema_difficulty[cid] + (1 - momentum) * difficulty

        self.last_loss[cid] = current_loss
        return self.ema_difficulty[cid].item()

    def get_normalized_weights(self, client_ids: List[int]) -> List[float]:
        """Return the clients' difficulties normalised to sum to one.

        Args:
            client_ids: Clients to compute weights for.

        Returns:
            One weight per entry of ``client_ids``; uniform weights when the
            difficulties sum to zero, and an empty list for no clients.
        """
        if len(client_ids) == 0:
            return []
        weights = [self.ema_difficulty[cid].item() for cid in client_ids]
        total = sum(weights)
        if total == 0:
            return [1.0 / len(client_ids)] * len(client_ids)
        return [w / total for w in weights]

    def get_sampling_probabilities(self, min_prob=0.05):
        """Return sampling probabilities inversely related to difficulty.

        Args:
            min_prob: Requested floor for each client's probability before the
                final renormalisation. It is capped at ``1 / num_clients``: a
                larger floor cannot be met by a distribution that sums to one
                and would clamp every entry, making the result uniform.

        Returns:
            NumPy array of length ``num_clients`` that sums to one.
        """
        inv_difficulty = 1.0 / (self.ema_difficulty + 1e-6)
        inv_difficulty = inv_difficulty / inv_difficulty.sum()
        floor = min(min_prob, 1.0 / max(self.num_clients, 1))
        probs = torch.clamp(inv_difficulty, min=floor)
        return (probs / probs.sum()).cpu().numpy()



In [11]:


def train_model_transformer(model, train_loader, device=None, learning_rate=0.001, loss_fn=None, optimizer_class=optim.Adam, epochs=50):
    """Train ``model`` on ``train_loader`` and return its weights and losses.

    Args:
        model: Module to train; it is moved to ``device`` and set to train mode.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        device: Device to train on. Defaults to CUDA when available, else CPU.
        learning_rate: Learning rate passed to the optimizer as ``lr``.
        loss_fn: Loss callable ``loss_fn(prediction, target)``. Defaults to
            ``nn.MSELoss()``.
        optimizer_class: Optimizer constructor called as
            ``optimizer_class(model.parameters(), lr=learning_rate)``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(weights, loss_history)``: the result of ``get_weights(model)`` after
        training, and a list with the mean batch loss of every epoch.

    Raises:
        ValueError: If an epoch yields no batches, since no mean loss can be
            computed for it.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    loss_fn = loss_fn or nn.MSELoss()
    optimizer = optimizer_class(model.parameters(), lr=learning_rate)
    loss_history = []

    model.train()
    for _ in range(epochs):
        epoch_loss = 0.0
        # Counted in the loop so loaders without __len__ work as well.
        num_batches = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            output = model(X_batch).squeeze(-1)
            # A target with a trailing singleton axis would broadcast against
            # the squeezed prediction inside the loss and silently produce a
            # wrong value, so bring it to the same rank first.
            if y_batch.dim() == output.dim() + 1 and y_batch.shape[-1] == 1:
                y_batch = y_batch.squeeze(-1)
            loss = loss_fn(output, y_batch)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an epoch loss.")
        loss_history.append(epoch_loss / num_batches)

    return get_weights(model), loss_history


### FedAvg-diff

In [12]:
# FedAvg with difficulty-aware client sampling: every round a subset of clients
# is drawn with probabilities from the difficulty tracker, each one trains a
# copy of the global model locally, and the local weights are averaged.
# NOTE(review): clients are loaded with load_energy_data_feather (imported from
# my_utils), not with the load_energy_data_feather_transformer defined in this
# notebook — confirm that this is the intended loader.

for model_name in MODEL_NAMES:
    print(f"Starting experiment with model: {model_name}")
    # A fresh tracker per model, so difficulty scores do not leak between experiments.
    difficulty_tracker = TimeSeriesDifficultyWeight(num_clients=NUM_CLIENTS)

    model_dir = os.path.join("results", model_name)
    os.makedirs(model_dir, exist_ok=True)

    global_model = model_fn(model_name).to(DEVICE)
    global_weights = get_weights(global_model)

    # Loop-invariant. At least one client per round, so a small CLIENT_FRAC
    # cannot produce an empty round with nothing to aggregate.
    clients_per_round = min(NUM_CLIENTS, max(1, int(CLIENT_FRAC * NUM_CLIENTS)))

    for rnd in range(NUM_ROUNDS):
        print(f"Round {rnd+1}/{NUM_ROUNDS}")

        # === Difficulty-aware sampling ===
        sampling_probs = difficulty_tracker.get_sampling_probabilities(min_prob=0.05)
        sampled_clients = np.random.choice(
            np.arange(NUM_CLIENTS),
            size=clients_per_round,
            replace=False,
            p=sampling_probs
        )
        print(f"Sampled {len(sampled_clients)} clients")

        local_weights = []

        # .tolist() yields plain Python ints instead of numpy scalars for the ids.
        for cid in tqdm(sampled_clients.tolist(), desc="Training clients"):
            # Every client starts from the current global weights.
            local_model = model_fn(model_name).to(DEVICE)
            set_weights(local_model, global_weights)
            train_loader, test_loader = load_energy_data_feather(cid, filepath=DATA_FILE)

            updated_weights, loss_history = train_model_transformer(
                local_model, train_loader,
                device=DEVICE,
                learning_rate=LR,
                loss_fn=None,
                optimizer_class=optim.Adam,
                epochs=LOCAL_EPOCHS
            )

            # === Update difficulty score ===
            difficulty_tracker.update(cid, loss_history)

            local_weights.append(updated_weights)

        # === FedAvg-style aggregation ===
        global_weights = average_weights(local_weights)
        set_weights(global_model, global_weights)

        # One checkpoint per round.
        checkpoint_path = os.path.join(model_dir, f"{model_name}_round_{rnd+1}_fedAvg_diff.pt")
        torch.save(global_model.state_dict(), checkpoint_path)
        print(f"Saved global model to {checkpoint_path}")


Starting experiment with model: transformer
Round 1/50
Sampled 211 clients


Training clients:  28%|██▊       | 60/211 [05:46<13:50,  5.50s/it]

: 