# Imports

Ensure Colab can use necessary libraries

In [1]:
!pip install optuna torch torchvision torchaudio scikit-learn

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-ma

Colab imports

In [2]:
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Internal Project Sp25/Summer Docs/Files')

Mounted at /content/drive


Standard imports

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, Dataset, Subset

Load data

In [4]:
df = pd.read_csv('berkeley_decade_weather_data.csv', parse_dates=["date"], index_col="date")

# Preprocessing

Sinusoidal encoding of day of year

In [5]:
def sinusoidal_encode(df):
    """
    Create day of year feature. Represent via sine/cosine transformation to preserve cyclical nature of days of year.
    """
    df = df.copy()
    df.index = pd.to_datetime(df.index, dayfirst=True)

    day_of_year = df.index.dayofyear
    is_leap_year = df.index.is_leap_year
    days_in_year = np.where(is_leap_year, 366, 365)
    normalized_day = day_of_year / days_in_year

    df['sin_day'] = np.sin(2 * np.pi * normalized_day)
    df['cos_day'] = np.cos(2 * np.pi * normalized_day)

    return df


encoded_df = sinusoidal_encode(df)

Define target and exogenous columns. Scale exogeneous columns (all are continuous)

In [6]:
target_cols = ['temperature_max', 'temperature_min']
exog_cols = ['precipitation_total', 'temperature_morning', 'temperature_afternoon', 'temperature_night', 'temperature_evening', 'cloud_cover_afternoon', 'humidity_afternoon', 'sin_day', 'cos_day']
input_cols = target_cols + exog_cols

y_scaler = StandardScaler()
y_scaled = pd.DataFrame(y_scaler.fit_transform(encoded_df[target_cols]), columns=target_cols, index=df.index)

X_scaler = StandardScaler()
X_scaled = pd.DataFrame(X_scaler.fit_transform(encoded_df[exog_cols]), columns=exog_cols, index=df.index)

df_scaled = pd.concat([y_scaled, X_scaled], axis=1)

In [7]:
def create_sequences(data, target_cols, exog_cols, seq_len, forecast_len):
    X, y = [], []
    for i in range(len(data) - seq_len - forecast_len):
        past_target = data[i:i+seq_len][target_cols].values
        past_exog = data[i:i+seq_len][exog_cols].values
        future_target = data[i+seq_len:i+seq_len+forecast_len][target_cols].values
        X.append(np.hstack([past_target, past_exog]))  # shape: seq_len x (target+exog)
        y.append(future_target)  # shape: forecast_len x target_dim
    return np.array(X), np.array(y)

In [8]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [9]:
class TimeSeriesTransformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, forecast_len, output_dim):
        super().__init__()
        self.forecast_len = forecast_len
        self.input_proj = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_proj = nn.Linear(d_model, forecast_len * output_dim)

    def forward(self, x):
        x = self.input_proj(x)
        x = self.transformer(x)
        x = x.mean(dim=1)  # Global average pooling
        x = self.output_proj(x)
        return x.view(x.shape[0], self.forecast_len, -1)

In [10]:
from sklearn.model_selection import train_test_split

SEQ_LEN = 7
FORECAST_LEN = 1

X, y = create_sequences(df_scaled, target_cols, exog_cols, SEQ_LEN, FORECAST_LEN)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from torch.utils.data import DataLoader, Subset

def objective(trial):
    choices = [
        (64, 2), (64, 4), (64, 8),
        (128, 2), (128, 4), (128, 8),
        (256, 4), (256, 8)
    ]
    d_model, nhead = trial.suggest_categorical("d_model_nhead", choices)
    num_layers = trial.suggest_int("num_layers", 1, 4)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    n_epochs = trial.suggest_int("n_epochs", 10, 50)

    dataset = TimeSeriesDataset(X_train, y_train)
    val_losses = []

    tscv = TimeSeriesSplit(n_splits=5)
    for train_idx, val_idx in tscv.split(X_train):
        train_loader = DataLoader(Subset(dataset, train_idx), batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
        val_loader = DataLoader(Subset(dataset, val_idx), batch_size=batch_size, num_workers=2, pin_memory=True)

        model_copy = TimeSeriesTransformer(
            input_dim=X_train.shape[2],
            d_model=d_model,
            nhead=nhead,
            num_layers=num_layers,
            forecast_len=FORECAST_LEN,
            output_dim=len(target_cols)
        )
        optimizer = torch.optim.Adam(model_copy.parameters(), lr=lr)
        criterion = nn.MSELoss()

        model_copy.train()
        for epoch in range(n_epochs):
            for xb, yb in train_loader:
                pred = model_copy(xb)
                loss = criterion(pred, yb)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model_copy.eval()
        all_preds, all_targets = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                pred = model_copy(xb)
                all_preds.append(pred.cpu().numpy())
                all_targets.append(yb.cpu().numpy())

        all_preds = np.concatenate(all_preds)
        all_targets = np.concatenate(all_targets)
        val_loss = np.mean((all_preds - all_targets)**2)
        val_losses.append(val_loss)

    return np.mean(val_losses)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

print("Best parameters:", study.best_params)
print("Best validation MSE:", study.best_value)

[I 2025-08-02 17:49:06,657] A new study created in memory with name: no-name-8067c3f0-e74f-4cd5-b1fd-c26a90558b51
[I 2025-08-02 17:56:36,022] Trial 0 finished with value: 0.3995414078235626 and parameters: {'d_model_nhead': (128, 8), 'num_layers': 3, 'lr': 3.448040266285779e-05, 'batch_size': 32, 'n_epochs': 19}. Best is trial 0 with value: 0.3995414078235626.
[I 2025-08-02 18:09:06,691] Trial 1 finished with value: 0.4135200083255768 and parameters: {'d_model_nhead': (256, 4), 'num_layers': 4, 'lr': 4.257391847971217e-05, 'batch_size': 32, 'n_epochs': 14}. Best is trial 0 with value: 0.3995414078235626.
[I 2025-08-02 18:16:21,696] Trial 2 finished with value: 0.4026462435722351 and parameters: {'d_model_nhead': (64, 2), 'num_layers': 4, 'lr': 0.0001627254068249582, 'batch_size': 64, 'n_epochs': 25}. Best is trial 0 with value: 0.3995414078235626.


In [None]:
best_params = study.best_params
print("Best params:", best_params)

d_model, nhead = best_params["d_model_nhead"]

model = TimeSeriesTransformer(
    input_dim=X_train.shape[2],
    d_model=d_model,
    nhead=nhead,
    num_layers=best_params["num_layers"],
    forecast_len=FORECAST_LEN,
    output_dim=len(target_cols)
)

optimizer = torch.optim.Adam(model.parameters(), lr=best_params["lr"])
criterion = nn.MSELoss()
batch_size = best_params["batch_size"]
n_epochs = best_params.get("n_epochs", 30)

train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(n_epochs):
    for xb, yb in train_loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(torch.tensor(X_test, dtype=torch.float32)).numpy()
    y_true = y_test.reshape(-1, len(target_cols))
    y_pred = preds.reshape(-1, len(target_cols))

# Undo StandardScaler transformation
y_true_unscaled = y_scaler.inverse_transform(y_true)
y_pred_unscaled = y_scaler.inverse_transform(y_pred)

mae = mean_absolute_error(y_true_unscaled, y_pred_unscaled)
rmse = np.sqrt(mean_squared_error(y_true_unscaled, y_pred_unscaled))
r2 = r2_score(y_true_unscaled, y_pred_unscaled)

print(f"Test MAE: {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R2: {r2:.4f}")

In [None]:
save_path = "/content/drive/MyDrive/Internal Project Sp25/Summer Docs/Files/transformer_2"

torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")