In [7]:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR, CosineAnnealingLR
import torch.nn.functional as F

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

# On a CUDA run, also report the name of GPU 0 and its current memory figures
# (bytes divided by 1024**3, rounded to one decimal).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for label, query_bytes in (('Allocated:', torch.cuda.memory_allocated),
                               ('Cached:   ', torch.cuda.memory_reserved)):
        print(label, round(query_bytes(0)/1024**3,1), 'GB')

Using device: cuda
NVIDIA H200
Memory Usage:
Allocated: 0.1 GB
Cached:    0.1 GB


In [8]:
# Label for this model / run.
model_name = 'Transformer'

# Data layout: window length (time steps per sample) and per-step feature counts.
SEQUENCE_SIZE = 5
MAX_LEN = SEQUENCE_SIZE  # kept equal to the window length
IN_FEATURES = 124
OUT_FEATURES = 128

# Transformer size.
D_MODEL = 256
N_HEAD = 4
N_LAYER = 6
# DROPOUT = 0.2  (disabled: no dropout constant is currently defined)

# Optimisation.
LEARNING_RATE = 1e-4
BATCH_SIZE = 512
# Only 10 epochs are run here for demonstration; the actual run uses 200 epochs.
EPOCHS = 10

In [9]:
# Directory holding the .npy splits. To use all 8 years of data, change this
# one path instead of editing every load call.
DATA_DIR = '/projects/sds-lab/Shuochen/climsim'


def load_split(name, data_dir=DATA_DIR):
    """Load ``<data_dir>/<name>.npy`` and return it as a float32 CPU tensor.

    Args:
        name: file stem of the split, e.g. ``'train_input'``.
        data_dir: directory containing the ``.npy`` files.

    Returns:
        A ``torch.float32`` tensor with the same shape as the stored array.
    """
    array = np.load(f'{data_dir}/{name}.npy')
    # as_tensor makes the dtype explicit and avoids an extra copy when the
    # array on disk is already float32.
    return torch.as_tensor(array, dtype=torch.float32)


# Training split.
X_train = load_split('train_input')
y_train = load_split('train_target')
# NOTE: the "test" tensors are read from the val_* files.
X_test = load_split('val_input')
y_test = load_split('val_target')
# Scoring split.
X_scoring = load_split('scoring_input')
y_scoring = load_split('scoring_target')

# create windows for data

In [10]:
def slice_data(X, y, n_space=384, seq_size=None):
    """Cut flat (time * space, features) tensors into fixed-length time windows.

    The first reshape assumes rows are ordered time-major, i.e. ``n_space``
    consecutive rows belong to one time step. Each tensor is rearranged to
    (space, time, features), its time axis is cut into consecutive,
    non-overlapping windows of ``seq_size`` steps, and the space and window
    axes are merged into one sample axis.

    Args:
        X: 2-D input tensor of shape (time * n_space, in_features).
        y: 2-D target tensor of shape (time * n_space, out_features).
        n_space: number of rows per time step (default 384).
        seq_size: window length in time steps; ``None`` (default) uses the
            module-level ``SEQUENCE_SIZE``.

    Returns:
        Tuple ``(X_windows, y_windows)``, each of shape
        (n_space * n_windows, seq_size, features).

    Raises:
        ValueError: if ``n_space`` or ``seq_size`` is not positive, or if a
            tensor is not 2-D with a row count divisible by ``n_space``.

    Note:
        Trailing time steps that do not fill a complete window are dropped,
        matching what a strided ``unfold`` over the time axis would keep.
    """
    if seq_size is None:
        seq_size = SEQUENCE_SIZE
    if n_space <= 0 or seq_size <= 0:
        raise ValueError(
            f"n_space and seq_size must be positive, got {n_space} and {seq_size}")

    def to_space_major(t):
        # (time * space, features) -> (space, time, features)
        if t.dim() != 2 or t.shape[0] % n_space != 0:
            raise ValueError(
                f"expected a 2-D tensor whose row count is a multiple of "
                f"{n_space}, got shape {tuple(t.shape)}")
        n_time = t.shape[0] // n_space
        return t.reshape(n_time, n_space, t.shape[-1]).permute(1, 0, 2)

    def to_windows(t):
        # (space, time, features) -> (space * n_windows, seq_size, features)
        n_windows = t.shape[1] // seq_size
        t = t[:, :n_windows * seq_size]  # drop an incomplete trailing window
        return t.reshape(n_space, n_windows, seq_size, t.shape[-1]).flatten(0, 1)

    print('original shape: ',X.shape,y.shape)
    X = to_space_major(X)
    y = to_space_major(y)
    print(X.shape,y.shape)
    return to_windows(X), to_windows(y)

# Window every split; each name is rebound from the flat tensor to its
# windowed counterpart.
X_train, y_train = slice_data(X_train, y_train)
X_test, y_test = slice_data(X_test, y_test)
X_scoring, y_scoring = slice_data(X_scoring, y_scoring)

# Report the windowed shapes: one line for the inputs, one for the targets.
for split_tensors in ((X_train, X_test, X_scoring), (y_train, y_test, y_scoring)):
    print(*(t.shape for t in split_tensors))

original shape:  torch.Size([10091520, 124]) torch.Size([10091520, 128])
torch.Size([384, 26280, 124]) torch.Size([384, 26280, 128])
original shape:  torch.Size([1441920, 124]) torch.Size([1441920, 128])
torch.Size([384, 3755, 124]) torch.Size([384, 3755, 128])
original shape:  torch.Size([1681920, 124]) torch.Size([1681920, 128])
torch.Size([384, 4380, 124]) torch.Size([384, 4380, 128])
torch.Size([2018304, 5, 124]) torch.Size([288384, 5, 124]) torch.Size([336384, 5, 124])
torch.Size([2018304, 5, 128]) torch.Size([288384, 5, 128]) torch.Size([336384, 5, 128])


In [12]:
# Pair inputs with targets, one dataset per split.
training_set = TensorDataset(X_train, y_train)
testing_set = TensorDataset(X_test, y_test)
scoring_set = TensorDataset(X_scoring, y_scoring)

# Wrap each dataset in a loader with the shared batch size. Only the training
# loader reshuffles every epoch; the evaluation loaders keep the stored order.
train_dataloader = DataLoader(dataset=training_set, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=testing_set, batch_size=BATCH_SIZE, shuffle=False)
scoring_dataloader = DataLoader(dataset=scoring_set, batch_size=BATCH_SIZE, shuffle=False)

# define model

In [13]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    A (1, max_len, d_model) table is precomputed and stored as the buffer
    ``pe``: even feature indices hold sines, odd indices hold cosines.

    Args:
        d_model: feature width of the sequence being encoded.
        max_len: longest sequence length the table covers.
        dropout: dropout probability applied after adding the encoding.
            Defaults to 0.0, so the output is exactly ``x + pe``; the earlier
            version built an ``nn.Dropout(p=0.5)`` but never applied it, so
            0.0 keeps the numerical behaviour unchanged.
    """

    def __init__(self, d_model=D_MODEL, max_len=SEQUENCE_SIZE, dropout=0.0):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: tensor of shape (batch, seq_len, d_model), seq_len <= max_len.

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed table length.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}")
        # Slice to the actual length so shorter sequences are handled and a
        # length-1 axis can never be silently broadcast against the table.
        return self.dropout(x + self.pe[:, :seq_len])

class TransformerModel(nn.Module):
    """Encoder-only transformer mapping each time step's inputs to outputs.

    Pipeline: linear embedding to ``d_model`` -> sinusoidal positional
    encoding -> stack of ``num_layers`` batch-first transformer encoder
    layers -> linear projection to ``output_dim`` at every time step.

    Args:
        input_dim: number of input features per time step.
        output_dim: number of output features per time step.
        d_model: width of the transformer embedding.
        nhead: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        max_len: longest sequence length the positional table covers.

    All other ``nn.TransformerEncoderLayer`` options are left at their
    library defaults.
    """

    def __init__(self, input_dim=IN_FEATURES, output_dim = OUT_FEATURES, d_model=D_MODEL,
                 nhead=N_HEAD, num_layers=N_LAYER, max_len=MAX_LEN):
        super().__init__()
        self.encoder = nn.Linear(input_dim, d_model)
        # Pass d_model through explicitly: relying on PositionalEncoding's own
        # default breaks any model built with a non-default d_model.
        self.pos_encoder = PositionalEncoding(d_model=d_model, max_len=max_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.Linear(d_model, output_dim)

    def forward(self, x):
        """Map (batch, seq_len, input_dim) to (batch, seq_len, output_dim)."""
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        # Decode every time step, not only the last one.
        return self.decoder(x)

# Build the network with its default hyper-parameters and move it to the
# selected device.
model = TransformerModel()
model = model.to(device)

# Mean-squared-error objective, optimised with AdamW.
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# The plateau scheduler is constructed here but is not used in this example.
scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=40)

In [14]:
# Per-epoch loss history, stored as plain Python floats.
train_loss_list = []
test_loss_list = []

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()  # set once per epoch; the mode does not change between batches
    train_loss_sum = torch.zeros((), device=device)
    train_samples = 0
    for X, y in train_dataloader:
        y_pred = model(X.to(device))
        loss = loss_fn(y_pred, y.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a detached value so the running total does not keep
        # autograd history alive, and weight by batch size so a short final
        # batch is not over-represented in the epoch mean.
        train_loss_sum += loss.detach() * X.size(0)
        train_samples += X.size(0)

    # Sample-weighted mean loss over the epoch.
    train_loss = (train_loss_sum / train_samples).item()
    train_loss_list.append(train_loss)

    # ---- evaluation pass ----
    model.eval()
    test_loss_sum = torch.zeros((), device=device)
    test_samples = 0
    with torch.no_grad():
        for X, y in test_dataloader:
            test_pred = model(X.to(device))
            test_loss_sum += loss_fn(test_pred, y.to(device)) * X.size(0)
            test_samples += X.size(0)

    # Sample-weighted mean loss over the whole evaluation set.
    test_loss = (test_loss_sum / test_samples).item()
    test_loss_list.append(test_loss)

    print(f"Epoch: {epoch} | Train loss: {train_loss:.5f} | Test loss: {test_loss:.5f}")

Epoch: 0 | Train loss: 0.00666 | Test loss: 0.00447
Epoch: 1 | Train loss: 0.00442 | Test loss: 0.00409
Epoch: 2 | Train loss: 0.00404 | Test loss: 0.00381
Epoch: 3 | Train loss: 0.00382 | Test loss: 0.00363
Epoch: 4 | Train loss: 0.00368 | Test loss: 0.00353
Epoch: 5 | Train loss: 0.00359 | Test loss: 0.00343
Epoch: 6 | Train loss: 0.00352 | Test loss: 0.00339
Epoch: 7 | Train loss: 0.00346 | Test loss: 0.00333
Epoch: 8 | Train loss: 0.00342 | Test loss: 0.00330
Epoch: 9 | Train loss: 0.00337 | Test loss: 0.00325
