# Hero Run for Training Simple Feed Forward Ensemble

It is not yet clear how MPI will call the models, but potentially 72 models will be needed. I want an ensemble of 3 models to compute the average output, so that makes 216 models; at 200 MB each that is 43.2 GB. Further investigation is needed.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from torch.amp import autocast
from torch.amp import GradScaler

import h5py as h5

from os.path import join

We are going to do hybrid heterogeneous computing, so assert that a GPU is available.

In [2]:
# Fail fast: the training and evaluation code below moves the model and every batch to CUDA.
assert torch.cuda.is_available(), 'CUDA is not available.'

## Simple FF Model

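Simple FF net. Flatten the input so that $(3*2*2 + 70*3*2*2)=852$, and flatten the output so that $(70*2*2*2)=560$.
Also use float32 to minimize size (can move to float64 later); CESM will output float64, so that will need to be handled inside the ensemble. Note that the cells below currently cast the model with `.double()`, i.e. float64.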

In [3]:
class simpleNN(nn.Module):
    '''
    Fully connected feed-forward network: two hidden ReLU layers and a linear output layer.
    Inputs:
        - in_features (int): Length of the flattened input vector (default 852).
        - hidden_features (int): Width of both hidden layers (default 4096).
        - out_features (int): Length of the flattened output vector (default 560).
    '''
    def __init__(self, in_features:int=852, hidden_features:int=4096, out_features:int=560):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, out_features)

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        '''
        Run a batch through the network.
        Inputs:
            - x (torch.Tensor): Batch of flattened inputs whose last dimension is in_features.
        Outputs:
            - (torch.Tensor): Batch of predictions whose last dimension is out_features.
        '''
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the last layer: the output is an unbounded regression value.
        return self.fc3(x)

In [4]:
# Build the network, cast its parameters to float64 and move it to the GPU.
model = simpleNN().double().cuda()

## Data Loading

The data was pre-partitioned into training and test datasets.

In [5]:
# Both HDF5 files live in the same directory; only the file name differs.
train_name, test_name = (
    join('/home/squirt/Documents/data/weather_data/', file_name)
    for file_name in ('train_data.h5', 'test_data.h5')
)

Method to load hdf5 file of the processed weather data

In [6]:
def load_hdf5(filename:str) -> list:
    '''
    Read every top-level group of an HDF5 file into a list of dictionaries.
    Inputs:
        - filename (str): Path to the HDF5 file.
    Outputs:
        - data (list): One dictionary per group. The keys 'day', 'region' and 'time' come from the
          group attributes; 'landmass', 'x' and 'y' are the group's datasets read fully into memory.
    '''
    def read_group(group) -> dict:
        # Rebuild one entry: metadata from the attributes, arrays from the datasets.
        meta = group.attrs
        return {
            'day': meta['day'],
            'region': meta['region'],
            'time': meta['time'],
            # [...] reads the whole dataset while the file is still open
            'landmass': group['landmass'][...],
            'x': group['x'][...],
            'y': group['y'][...],
        }

    with h5.File(filename, 'r') as f:
        # Iterating the file yields the names of its top-level groups.
        return [read_group(f[name]) for name in f]

Generate stacks to train

In [7]:
def stack_data(data:list[dict], key:str) -> torch.Tensor:
    '''
    Stack one field of every entry into a single tensor.
    Inputs:
        - data (list): List of dictionaries that all contain `key`.
        - key (str): Name of the field to collect.
    Outputs:
        - (torch.Tensor): Tensor whose first dimension indexes the entries of `data`.
    '''
    tensors = []
    for record in data:
        tensors.append(torch.tensor(record[key]))
    return torch.stack(tensors)


def generate_stacks(data:list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    '''
    Stack the 'landmass', 'x' and 'y' fields of all entries into three tensors.
    Inputs:
        - data (list): List of dictionaries, each holding 'landmass', 'x' and 'y' arrays.
    Outputs:
        - landmass (torch.Tensor): Stacked landmass data, one row per entry.
        - x (torch.Tensor): Stacked 'x' data with dimensions 1 and 2 swapped.
        - y (torch.Tensor): Stacked 'y' data with dimensions 1 and 2 swapped.
    '''
    landmass = stack_data(data, 'landmass')

    # Dimension 0 is the entry index added by stacking; swap the next two dimensions.
    # NOTE(review): presumably this moves the 70-long axis in front before flattening - confirm.
    x = torch.transpose(stack_data(data, 'x'), 1, 2)
    y = torch.transpose(stack_data(data, 'y'), 1, 2)

    return (landmass, x, y)


class weather_dataset(Dataset):
    '''
    Map-style PyTorch Dataset over the stacked weather tensors.
    Each item is the tuple (landmass, x, y) for one entry.
    '''
    def __init__(self, data:list[dict]):
        # Stack every field once up front so item access is plain tensor indexing.
        landmass, x, y = generate_stacks(data)
        self.landmass = landmass
        self.x = x
        self.y = y
        self.length = len(landmass)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        sample = (self.landmass[idx], self.x[idx], self.y[idx])
        return sample

Partition training data into train and validation sets.

In [8]:
def get_dataloaders(fname:str, batch_size:int, split:float, num_workers:int=8) -> tuple[DataLoader, DataLoader]:
    '''
    Create PyTorch DataLoader objects for training and validation data.
    Inputs:
        - fname (str): Path to the HDF5 file.
        - batch_size (int): Batch size for the DataLoader objects.
        - split (float): Fraction of the data to use for training, strictly between 0 and 1.
        - num_workers (int): Number of worker processes per DataLoader (default 8).
    Outputs:
        - train_loader (torch.utils.data.DataLoader): DataLoader for the training subset.
        - val_loader (torch.utils.data.DataLoader): DataLoader for the validation subset.
    Raises:
        - ValueError: If split is not strictly between 0 and 1.
    '''
    # Validate before touching the file: an out-of-range fraction would give a negative
    # subset length, which random_split does not reject.
    if not 0.0 < split < 1.0:
        raise ValueError(f'split must be a fraction strictly between 0 and 1, got {split!r}')

    # Load data and create tensor
    data = load_hdf5(fname)
    dataset = weather_dataset(data)

    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size

    # Split data into training and validation sets (a new random partition on every call)
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # Create DataLoader objects
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    return train_loader, val_loader

## Training Loop 

In [9]:
# Adam over all parameters of the current model, learning rate 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

Define the training loop; use mixed precision training.

In [10]:
def train(model:nn.Module, dl:torch.utils.data.DataLoader, optim:torch.optim, loss:nn.Module) -> float:
    model.train()
    total_loss = .0
    scaler = GradScaler()

    for _, (l, x, y) in enumerate(dl):
        l = l.cuda()
        x = x.cuda()
        y = y.cuda()

        # Flatten and combine
        l = l.view(-1, 3*2*2)
        x = x.view(-1, 70*3*2*2)
        x = torch.cat((l, x), 1)

        y = y.view(-1, 70*2*2*2)

        # Forward pass
        optim.zero_grad()

        with autocast(device_type='cuda', dtype=torch.float16):
            y_pred = model(x)
            l = loss(y_pred, y)
            total_loss += l.item()

        # Preform backpass
        scaler.scale(l).backward()
        scaler.step(optim)
        scaler.update()
    
    return total_loss / len(dl)

Eval Loop

In [11]:
def eval(model:nn.Module, dl:torch.utils.data.DataLoader, loss:nn.Module) -> float:
    model.eval()
    total_loss = .0

    for _, (l, x, y) in enumerate(dl):
        l = l.cuda()
        x = x.cuda()
        y = y.cuda()

        # Flatten and combine
        l = l.view(-1, 3*2*2)
        x = x.view(-1, 70*3*2*2)
        x = torch.cat((l, x), 1)

        y = y.view(-1, 70*2*2*2)

        # Forward pass
        with autocast(device_type='cuda', dtype=torch.float16):
            y_pred = model(x)
            l = loss(y_pred, y)
            total_loss += l.item()

    return total_loss / len(dl)

In [12]:
# Mean-squared-error criterion. The training cell below rebinds this name to a float loss value.
train_loss = nn.MSELoss(reduction='mean')

## Training

TODO - Normalize data

In [14]:
# Train `folds` independent models, each on a fresh random train/validation split of the
# training file, and score every model on the held-out test file.
folds = 3
# Per-fold results keyed by the 1-based fold number:
# {'train_loss': ..., 'eval_loss': ..., 'test_loss': ...}
results_dict = {}

test_ds = weather_dataset(load_hdf5(test_name))
test_loader = DataLoader(test_ds, batch_size=4096, shuffle=True, num_workers=8, pin_memory=True)

for i in range(folds):
    # Fresh model, loss and optimizer for every fold so no state carries over
    loss_fn = nn.MSELoss()
    model = simpleNN()
    model = model.double().cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Get Dataloaders
    train_loader, val_loader = get_dataloaders(train_name, 4096, 0.6)

    # Early Stopping
    # Start with train_loss = +inf and eval_loss = -inf so the loop body runs at least once.
    eval_loss = -1*float('inf')
    train_loss = float('inf')
    j = 0
    print(f'Fold {i+1}')
    # Keep training while the training loss is still above the validation loss
    while train_loss > eval_loss:
        prev_eval_loss = eval_loss
        train_loss = train(model, train_loader, optimizer, loss_fn)
        eval_loss = eval(model, val_loader, loss_fn)
        print(f'\tEpoch {j} - Eval Loss: {eval_loss}')

        # Also stop once the validation loss has effectively stopped changing
        if abs(eval_loss - prev_eval_loss) < 0.01:
            print(f'\tEarly stopping at epoch {j} due to minimal change in eval loss')
            break
        j += 1

    # Test Loss
    test_loss = eval(model, test_loader, loss_fn)
    print(f'Fold {i+1} - Test Loss: {test_loss}')

    # Keep the final losses of this fold so they remain available after the loop
    results_dict[i + 1] = {
        'train_loss': train_loss,
        'eval_loss': eval_loss,
        'test_loss': test_loss,
    }

Fold 1
	Epoch 0 - Eval Loss: 6115844.662589727
	Epoch 1 - Eval Loss: 2723509.928000697
	Epoch 2 - Eval Loss: 98147.90964271483
	Epoch 3 - Eval Loss: 20941.188989376627
	Epoch 4 - Eval Loss: 3668.0223015239662
	Epoch 5 - Eval Loss: 1092.7534572182558
	Epoch 6 - Eval Loss: 1305.1324568142782
	Epoch 7 - Eval Loss: 259.41750121681406
	Epoch 8 - Eval Loss: 264.82237080123025
	Epoch 9 - Eval Loss: 124.40882908244859
	Epoch 10 - Eval Loss: 46.73697941471736
	Epoch 11 - Eval Loss: 92.33288484527591
Fold 1 - Test Loss: 92.26096223416238
Fold 2
	Epoch 0 - Eval Loss: 8062120.4634836605
	Epoch 1 - Eval Loss: 3502085.809838276
	Epoch 2 - Eval Loss: 132901.56551182678
	Epoch 3 - Eval Loss: 51459.149403144875
	Epoch 4 - Eval Loss: 6949.590930923096
	Epoch 5 - Eval Loss: 2199.3836329882242
	Epoch 6 - Eval Loss: 958.4522179292824
	Epoch 7 - Eval Loss: 715.9168467586009
	Epoch 8 - Eval Loss: 280.82644643218964
	Epoch 9 - Eval Loss: 108.75267219971597
	Epoch 10 - Eval Loss: 86.86728625704839
	Epoch 11 - 