In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import numpy as np
from Models import MoELSTM
import os
from collections import OrderedDict
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import math
from typing import List, Tuple, Optional, Dict
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
import random
from Models import MoELSTM, LSTMModel, train_model
from Preprocess import (
    compute_metrics,
    convert_timeseries_to_numpy,
    create_dataloader,
    load_building_series,
    split_series_list,
)
import pandas as pd
from collections import defaultdict
import os
import torch
import torch.optim as optim
from tqdm import tqdm


from Models import model_fn
from tqdm import tqdm
from my_utils import train_model, load_energy_data_feather, get_weights, set_weights


In [7]:
from AggregationStrategy import sync_aggregate,average_weights,sync_aggregate_norm,sync_aggregate_softmax, fedavgm_update

In [8]:
df = pd.read_feather("train_final.feather")

In [9]:
df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,primary_use,air_temperature
7593144,0,0,2016-05-21 01:00:00,72.221012,Education,25.6
7593145,1,0,2016-05-21 01:00:00,39.611586,Education,25.6
7593146,2,0,2016-05-21 01:00:00,1.920567,Education,25.6
7593147,3,0,2016-05-21 01:00:00,111.532464,Education,25.6
7593148,4,0,2016-05-21 01:00:00,456.734799,Education,25.6


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11712248 entries, 7593144 to 20216099
Data columns (total 6 columns):
 #   Column           Dtype         
---  ------           -----         
 0   building_id      int64         
 1   meter            int64         
 2   timestamp        datetime64[ns]
 3   meter_reading    float64       
 4   primary_use      object        
 5   air_temperature  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 625.5+ MB


In [11]:


# Config
# List of models to experiment with
MODEL_NAMES = ["lstm", "gru", "moe_lstm", "moe_gru"]

# Config
NUM_CLIENTS = 1410
CLIENT_FRAC = 0.15
NUM_ROUNDS = 50
LOCAL_EPOCHS = 5
LR = 0.001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DATA_FILE ="train_final.feather" # "meter_0_data_cleaned.feather"


In [12]:


def load_energy_data_feather(cid, filepath="meter_0_data_cleaned.feather"):
    """Load, preprocess, and return train/test dataloaders for a client."""
    df = pd.read_feather(filepath)
    df = df[df['building_id'] == cid]
    df['meter_reading'] = df['meter_reading'].fillna(0)

    if df.empty:
        raise ValueError(f"No data found for building_id {cid}")

    try:
        ts = TimeSeries.from_dataframe(
            df,
            time_col='timestamp',
            value_cols='meter_reading',
            fill_missing_dates=True,
            freq='h'
        )
    except Exception as e:
        raise ValueError(f"Failed to construct TimeSeries: {e}")

    train_series, test_series = ts.split_before(0.75)

    if len(train_series) == 0 or len(test_series) == 0:
        raise ValueError(f"Empty time series for building_id {cid}. Train: {len(train_series)}, Test: {len(test_series)}")

    scaler = MinMaxScaler(feature_range=(0.1, 1))
    transformer = Scaler(scaler)
    transformed_train_series = transformer.fit_transform(train_series)
    transformed_test_series = transformer.transform(test_series)

    X_train, y_train = convert_timeseries_to_numpy(transformed_train_series, input_len=168, output_len=24)
    # X_test, y_test = convert_timeseries_to_numpy(transformed_test_series, input_len=168, output_len=24)

    X_train = np.nan_to_num(X_train, nan=0.0)
    y_train = np.nan_to_num(y_train, nan=0.0)
    # X_test = np.nan_to_num(X_test, nan=0.0)
    # y_test = np.nan_to_num(y_test, nan=0.0)


    if len(X_train) == 0 :#or len(X_test) == 0:
        raise ValueError(f"Client {cid} has no data after preprocessing.")

    train_loader = create_dataloader(X_train, y_train, batch_size=512)
    # test_loader = create_dataloader(X_test, y_test, batch_size=256)

    return train_loader, 0 # test_loader


In [None]:


def train_model_transformer(model, train_loader, device=None, learning_rate=0.001, loss_fn=None, optimizer_class=optim.Adam, epochs=50):
    """Train the model and return the average loss."""
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    loss_fn = loss_fn or nn.MSELoss()
    optimizer = optimizer_class(model.parameters(), lr=learning_rate)
    loss_history = []

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            if y_batch.dim() == 3 and y_batch.shape[-1] == 1:
                y_batch = y_batch.squeeze(-1)

            optimizer.zero_grad()
            output = model(X_batch)
            #loss = loss_fn(output, y_batch)
            loss = loss_fn(output.squeeze(-1), y_batch)  # (batch_size, 24)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        loss_history.append(epoch_loss/len(train_loader))

    # fin_loss = epoch_loss / len(train_loader)

    return get_weights(model), loss_history


### Diff-Sync FedAvg

In [13]:


class PositionalEncoding(nn.Module):
    """Positional encoding for transformer model"""
    
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        
        self.register_buffer('pe', pe)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.pe[:x.size(0), :]




In [14]:

class TimeSeriesTransformer(nn.Module):
    """Transformer model for univariate time series forecasting"""
    
    def __init__(
        self,
        context_length: int = 168,
        forecast_horizon: int = 24,
        d_model: int = 128,
        n_heads: int = 8,
        n_layers: int = 6,
        d_ff: int = 512,
        dropout: float = 0.1
    ):
        super().__init__()
        
        self.context_length = context_length
        self.forecast_horizon = forecast_horizon
        self.d_model = d_model
        
        # Input embedding and positional encoding
        self.input_embedding = nn.Linear(1, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len=context_length + forecast_horizon)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=False
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        # Output projection
        self.output_projection = nn.Linear(d_model, 1)
        self.dropout = nn.Dropout(dropout)
        
        # Learnable forecast tokens
        self.forecast_tokens = nn.Parameter(torch.randn(forecast_horizon, 1, d_model))
        
    def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Forward pass
        
        Args:
            src: Input tensor of shape (batch_size, context_length, 1)
            src_mask: Optional attention mask
            
        Returns:
            Forecasted values of shape (batch_size, forecast_horizon, 1)
        """
        batch_size = src.size(0)
        
        # Embed input sequence
        src_embedded = self.input_embedding(src)  # (batch_size, context_length, d_model)
        src_embedded = src_embedded.transpose(0, 1)  # (context_length, batch_size, d_model)
        
        # Add positional encoding
        src_embedded = self.pos_encoding(src_embedded)
        
        # Prepare forecast tokens
        forecast_tokens = self.forecast_tokens.expand(-1, batch_size, -1)  # (forecast_horizon, batch_size, d_model)
        
        # Concatenate input and forecast tokens
        full_sequence = torch.cat([src_embedded, forecast_tokens], dim=0)  # (context_length + forecast_horizon, batch_size, d_model)
        
        # Create causal mask to prevent looking at future values
        seq_len = full_sequence.size(0)
        causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(src.device)
        
        # Apply transformer
        transformer_output = self.transformer_encoder(full_sequence, mask=causal_mask)
        
        # Extract forecast part
        forecast_output = transformer_output[self.context_length:]  # (forecast_horizon, batch_size, d_model)
        
        # Project to output dimension
        forecast_output = self.dropout(forecast_output)
        predictions = self.output_projection(forecast_output)  # (forecast_horizon, batch_size, 1)
        
        # Transpose back to (batch_size, forecast_horizon, 1)
        predictions = predictions.transpose(0, 1)
        
        return predictions





In [None]:



try:
  
    train_loader, _ = load_energy_data_feather(cid=1, filepath=DATA_FILE)
    
    print(f"Data loaded successfully for building_id=1")
    print(f"Training batches: {len(train_loader)}")
    
    # Create model
    model = TimeSeriesTransformer(
        context_length=168,
        forecast_horizon=24,
        d_model=128,
        n_heads=8,
        n_layers=6,
        dropout=0.1
    )
    
    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    
    print(f"Model created successfully!")
    print(f"Training on device: {device}")
    print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Training loop with improvements
    num_epochs = 50
    best_loss = float('inf')
    
    for epoch in range(num_epochs):
        train_losses = []
        model.train()
        
        for batch_idx, (context, target) in enumerate(train_loader):
            context, target = context.to(device), target.to(device)
            
            optimizer.zero_grad()
            predictions = model(context)
            loss = criterion(predictions, target)
            loss.backward()
            
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            train_losses.append(loss.item())
            
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.6f}")
        
        avg_loss = np.mean(train_losses)
        scheduler.step(avg_loss)
        
        # Save best model
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), f'best_model_building_{1}.pth')
            print(f"New best model saved with loss: {best_loss:.6f}")
        
        print(f"Epoch {epoch} completed. Average Loss: {avg_loss:.6f}, LR: {optimizer.param_groups[0]['lr']:.2e}")
    
    print("Training completed!")
    
    # Load best model for inference
    model.load_state_dict(torch.load(f'best_model_building_{1}.pth'))
    model.eval()
    
    # Example inference and evaluation
    with torch.no_grad():
        all_predictions = []
        all_targets = []
        
        for context, target in train_loader:
            context = context.to(device)
            forecast = model(context)
            
            all_predictions.append(forecast.cpu())
            all_targets.append(target)
            
            if len(all_predictions) == 1:  # Print shapes for first batch
                print(f"Input shape: {context.shape}")  # Should be (batch_size, 168, 1)
                print(f"Forecast shape: {forecast.shape}")  # Should be (batch_size, 24)
                print(f"Target shape: {target.shape}")  # Should be (batch_size, 24)
        
        # Calculate overall metrics
        # all_predictions = torch.cat(all_predictions, dim=0)
        # all_targets = torch.cat(all_targets, dim=0)
        
        # mse = nn.MSELoss()(all_predictions, all_targets)
        # mae = nn.L1Loss()(all_predictions, all_targets)
        
        # print(f"\nFinal Model Performance:")
        # print(f"MSE: {mse.item():.6f}")
        # print(f"MAE: {mae.item():.6f}")
        # print(f"RMSE: {torch.sqrt(mse).item():.6f}")
            
except Exception as e:
    print(f"Error during execution: {e}")
    import traceback
    traceback.print_exc()


Data loaded successfully for building_id=1
Training batches: 8




Model created successfully!
Training on device: cuda
Number of parameters: 1,193,089
Epoch 0, Batch 0, Loss: 0.584771
New best model saved with loss: 0.137521
Epoch 0 completed. Average Loss: 0.137521, LR: 1.00e-04
Epoch 1, Batch 0, Loss: 0.063717
New best model saved with loss: 0.058602
Epoch 1 completed. Average Loss: 0.058602, LR: 1.00e-04
Epoch 2, Batch 0, Loss: 0.059831
New best model saved with loss: 0.048380
Epoch 2 completed. Average Loss: 0.048380, LR: 1.00e-04
Epoch 3, Batch 0, Loss: 0.043081
New best model saved with loss: 0.038962
Epoch 3 completed. Average Loss: 0.038962, LR: 1.00e-04
Epoch 4, Batch 0, Loss: 0.035663
New best model saved with loss: 0.032616
Epoch 4 completed. Average Loss: 0.032616, LR: 1.00e-04
Epoch 5, Batch 0, Loss: 0.030419
New best model saved with loss: 0.027737
Epoch 5 completed. Average Loss: 0.027737, LR: 1.00e-04
Epoch 6, Batch 0, Loss: 0.026169
New best model saved with loss: 0.023609
Epoch 6 completed. Average Loss: 0.023609, LR: 1.00e-04
Epoch