In [None]:
# Google Colab setup: install Ray
!pip install ray

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import xgboost as xgb
import ray
from ray import train
from ray import tune
from ray.air import session
from ray.tune.schedulers import ASHAScheduler

Mount Google Drive (only needed when running on Google Colab resources).

In [None]:
# Mount Google Drive under /content/drive; the data and model paths used in
# this notebook live below /content/drive/MyDrive. Only works on Google Colab.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs ``data[i]`` with ``target[i]``.

    Args:
        data: Indexable container of samples.
        target: Indexable container of targets, addressed with the same index.
    """

    def __init__(self, data, target):
        # Both containers are stored as given; nothing is copied or validated.
        self.data, self.target = data, target

    def __len__(self):
        # The dataset length is taken from the samples container only.
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        label = self.target[index]
        return sample, label

In [None]:
class RMSLELoss(torch.nn.Module):
    """Root mean squared logarithmic error.

    Computes ``sqrt(MSE(log(1 + pred), log(1 + actual)))``.

    ``torch.log1p`` is used instead of ``torch.log(x + 1)`` because the latter
    loses all precision for values close to zero (``1 + x`` rounds to ``1``).
    Inputs must be greater than -1, otherwise the logarithm yields NaN/-inf.
    """

    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss()

    def forward(self, pred, actual):
        """Return the scalar RMSLE between ``pred`` and ``actual``."""
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(actual)))

In [None]:
class EarlyStopping:
    """Flags a stop once the validation/train loss gap stays too large.

    Each call compares ``validation_loss - train_loss`` with ``min_delta``.
    Consecutive calls above the threshold increase ``counter``; any call at or
    below it resets ``counter`` to zero. When ``counter`` reaches ``tolerance``
    the ``early_stop`` flag is set (and is never cleared afterwards).

    Args:
        tolerance: Number of consecutive violating calls that triggers the stop.
        min_delta: Largest accepted gap between validation and train loss.
    """

    def __init__(self, tolerance=5, min_delta=0):
        self.counter = 0
        self.early_stop = False
        self.tolerance = tolerance
        self.min_delta = min_delta

    def __call__(self, train_loss, validation_loss):
        gap = validation_loss - train_loss

        # Written as "not >" so that a NaN gap resets the counter as well.
        if not gap > self.min_delta:
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.tolerance:
            self.early_stop = True

In [None]:
class LSTMModel(torch.nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the sequence batch ``x`` through the LSTM and project the last step."""
        # Explicit zero initial hidden/cell states, created on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=x.device)
        initial_cell = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))

        # Only the output of the final time step feeds the linear head.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

Uncomment the line in the cell below if you want to use the Ray framework.


In [None]:
def train_fn(config):
    """Ray Tune trainable: train an LSTMModel and report the test loss per epoch.

    Args:
        config: Dict with the keys ``ds`` and ``test_ds`` (datasets),
            ``batch_size``, ``tolerance`` and ``min_delta`` (early stopping),
            ``input_size``, ``hidden_size`` and ``num_layer`` (model),
            ``learning_rate`` and ``epoch`` (optimisation).

    After every epoch the sum of the per-batch Huber losses on the test loader
    is reported to Ray under the key ``"loss"``. Training ends after
    ``config["epoch"]`` epochs or as soon as early stopping triggers.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = DataLoader(config["ds"], batch_size=config["batch_size"], shuffle=False)
    test_dataloader = DataLoader(config["test_ds"], batch_size=config["batch_size"], shuffle=False)

    early_stopping = EarlyStopping(tolerance=config["tolerance"], min_delta=config["min_delta"])

    model = LSTMModel(config["input_size"], config["hidden_size"], config["num_layer"], 1)
    model.to(device)
    criterion = torch.nn.HuberLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["learning_rate"])

    for _ in range(config["epoch"]):
        # --- training pass ---
        model.train()
        train_batch_losses = []
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            train_batch_losses.append(loss.item())
            optimizer.step()
        epoch_loss = np.sum(train_batch_losses)

        # --- evaluation pass (no gradients) ---
        model.eval()
        test_batch_losses = []
        with torch.no_grad():
            for batch_test_x, batch_test_y in test_dataloader:
                batch_test_x, batch_test_y = batch_test_x.to(device), batch_test_y.to(device)
                test_loss = criterion(model(batch_test_x), batch_test_y)
                test_batch_losses.append(test_loss.item())
        epoch_test_loss = np.sum(test_batch_losses)

        # The metric is reported every epoch, including the one that stops training.
        early_stopping(epoch_loss, epoch_test_loss)
        train.report({"loss": epoch_test_loss})
        if early_stopping.early_stop:
            break

In [None]:
def find_best_config(ds, test_ds, input_size):
    """Search LSTM hyperparameters with Ray Tune and return the best trial.

    Args:
        ds: Training dataset handed to ``train_fn`` through the config.
        test_ds: Test dataset handed to ``train_fn`` through the config.
        input_size: Number of features per time step of the LSTM input.

    Returns:
        The Ray Tune result with the lowest reported ``"loss"``.
    """
    # NOTE(review): the datasets travel inside the search space; Ray's docs
    # suggest tune.with_parameters for large objects. That would require a
    # matching change to train_fn's signature, so it is left as is here.
    config = {
        "batch_size": tune.choice([16, 32, 64]),
        "hidden_size": tune.choice([2 ** i for i in range(9)]),
        "num_layer": tune.choice([1, 2, 3, 4, 5, 6, 7, 8]),
        "epoch": tune.choice([60, 80, 100, 150, 200, 500, 700, 1000]),
        "learning_rate": tune.loguniform(0.005, 0.1),
        "ds": ds,
        "test_ds": test_ds,
        "input_size": input_size,
        "tolerance": tune.choice([1, 2, 3, 4, 5]),
        "min_delta": tune.choice([1, 2, 3, 5, 10, 15, 20, 25, 30]),
    }
    # max_t matches the largest value in the "epoch" search space.
    scheduler = ASHAScheduler(max_t=1000, grace_period=1, reduction_factor=2)

    # ray.init() raises when Ray is already running, which happens as soon as
    # this function is called a second time in the same kernel.
    if not ray.is_initialized():
        ray.init(object_store_memory=10**10)

    # train_fn falls back to the CPU, so only reserve a GPU when one exists;
    # requesting an unavailable GPU leaves every trial waiting for resources.
    trial_resources = {"cpu": 1, "gpu": 1 if torch.cuda.is_available() else 0}

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_fn),
            resources=trial_resources,
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            scheduler=scheduler,
            num_samples=100,
        ),
        param_space=config,
    )
    results = tuner.fit()

    return results.get_best_result(metric="loss", mode="min")

In [None]:
def prepare_data_for_station(
    station,
    data_path="/content/drive/MyDrive/master/data/network_input",  # Google Colab default
    seq_len=4,
    train_fraction=0.7,
):
    """Load one station's pickle and build sliding-window train/test sets.

    Every feature row is flattened to a 1-D vector. Window ``i`` covers rows
    ``i .. i + seq_len - 1`` and is paired with the target of its last row.
    The windows are split chronologically (no shuffling): the first
    ``int(n_windows * train_fraction)`` windows are the training set.

    Args:
        station: File name of the pickled DataFrame inside ``data_path``.
        data_path: Directory containing the station pickles.
        seq_len: Number of consecutive rows per window.
        train_fraction: Fraction of the windows used for training.

    Returns:
        Tuple ``(data_tensor, target_tensor, test_data_tensor,
        test_target_tensor, train_flat_sequences, train_flat_targets,
        test_flat_sequences, test_flat_targets)``. The first four are float32
        torch tensors of shape ``(n, seq_len, features)`` / ``(n, 1)``; the
        last four are float32 numpy arrays of shape
        ``(n, seq_len * features)`` / ``(n, 1)``.

    Raises:
        KeyError: If the DataFrame has no ``target_value`` column.
        ValueError: If ``seq_len`` is smaller than 1 or the file has fewer
            than ``seq_len`` rows.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be at least 1, got {seq_len}")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(os.path.join(data_path, station))

    if "target_value" not in data.columns:
        raise KeyError(f"'target_value' column not found in {station}")

    feat_arr = data.loc[:, data.columns != "target_value"].to_numpy()
    target_arr = data.loc[:, data.columns == "target_value"].to_numpy()
    target_flat = np.array(target_arr, dtype=np.float32).reshape(-1)

    n_rows = len(feat_arr)
    if n_rows < seq_len:
        raise ValueError(
            f"{station} has {n_rows} rows, at least {seq_len} are needed to build one window"
        )

    # Cells may hold scalars or arrays: stack each row's cells, then flatten
    # the row to a single feature vector.
    features = np.array(
        [np.vstack(row) for row in feat_arr], dtype=np.float32
    ).reshape(n_rows, -1)

    n_windows = n_rows - seq_len + 1
    sequences = np.stack([features[i:i + seq_len] for i in range(n_windows)])
    # Target of window i is the target of its last row (index i + seq_len - 1).
    targets = target_flat[seq_len - 1:seq_len - 1 + n_windows].reshape(-1, 1)

    # Flat (2-D) view of the same windows for non-sequence models.
    flat_seq = sequences.reshape(n_windows, -1)
    flat_targets = targets.copy()

    split = int(n_windows * train_fraction)

    data_tensor = torch.tensor(sequences[:split], dtype=torch.float32)
    target_tensor = torch.tensor(targets[:split], dtype=torch.float32)
    test_data_tensor = torch.tensor(sequences[split:], dtype=torch.float32)
    test_target_tensor = torch.tensor(targets[split:], dtype=torch.float32)

    return (
        data_tensor,
        target_tensor,
        test_data_tensor,
        test_target_tensor,
        flat_seq[:split],
        flat_targets[:split],
        flat_seq[split:],
        flat_targets[split:],
    )

In [None]:
# Pickle file names of the individual stations. A name is passed to
# prepare_data_for_station, which joins it with the data directory.
# Files marked "too few data" are not used in the station lists of this notebook.

# "full_" variant of each station file.
II_112_1_full = "full_II_112_1.pickle"
II_113_1_full = "full_II_113_1.pickle"
II_114_1_full = "full_II_114_1.pickle"
II_115_1_full = "full_II_115_1.pickle" # too few data
II_116_1_full = "full_II_116_1.pickle" # too few data
II_131_1_full = "full_II_131_1.pickle"
II_132_1_full = "full_II_132_1.pickle"
II_292_1_full = "full_II_292_1.pickle"
II_297_1_full = "full_II_297_1.pickle"
II_298_1_full = "full_II_298_1.pickle"
II_472_1_full = "full_II_472_1.pickle"
II_922_1_full = "full_II_922_1.pickle" # too few data
II_924_1_full = "full_II_924_1.pickle"
II_931_1_full = "full_II_931_1.pickle"
II_932_1_full = "full_II_932_1.pickle" # too few data
II_936_1_full = "full_II_936_1.pickle" # too few data
II_940_1_full = "full_II_940_1.pickle"
II_949_1_full = "full_II_949_1.pickle" # too few data
II_951_1_full = "full_II_951_1.pickle" # too few data
II_952_1_full = "full_II_952_1.pickle"
II_957_1_full = "full_II_957_1.pickle" # too few data
II_1345_1_full = "full_II_1345_1.pickle"
II_1346_1_full = "full_II_1346_1.pickle"
II_1351_1_full = "full_II_1351_1.pickle"
II_1352_1_full = "full_II_1352_1.pickle"

# "grace_" variant of the same stations.
II_112_1_grace = "grace_II_112_1.pickle"
II_113_1_grace = "grace_II_113_1.pickle"
II_114_1_grace = "grace_II_114_1.pickle"
II_115_1_grace = "grace_II_115_1.pickle" # too few data
II_116_1_grace = "grace_II_116_1.pickle" # too few data
II_131_1_grace = "grace_II_131_1.pickle"
II_132_1_grace = "grace_II_132_1.pickle"
II_292_1_grace = "grace_II_292_1.pickle"
II_297_1_grace = "grace_II_297_1.pickle"
II_298_1_grace = "grace_II_298_1.pickle"
II_472_1_grace = "grace_II_472_1.pickle"
II_922_1_grace = "grace_II_922_1.pickle" # too few data
II_924_1_grace = "grace_II_924_1.pickle"
II_931_1_grace = "grace_II_931_1.pickle"
II_932_1_grace = "grace_II_932_1.pickle" # too few data
II_936_1_grace = "grace_II_936_1.pickle" # too few data
II_940_1_grace = "grace_II_940_1.pickle"
II_949_1_grace = "grace_II_949_1.pickle" # too few data
II_951_1_grace = "grace_II_951_1.pickle" # too few data
II_952_1_grace = "grace_II_952_1.pickle"
II_957_1_grace = "grace_II_957_1.pickle" # too few data
II_1345_1_grace = "grace_II_1345_1.pickle"
II_1346_1_grace = "grace_II_1346_1.pickle"
II_1351_1_grace = "grace_II_1351_1.pickle"
II_1352_1_grace = "grace_II_1352_1.pickle"

In [None]:
# Directory the trained model is written to by the torch.save cell.
out_dir = "/content/drive/MyDrive/master/data/models" # Google Colab path
# Local alternative:
#out_dir = "data/models"

In [None]:
# Station file used by the tuning, training and model-saving cells below.
station = II_297_1_full

In [None]:
# Station files grouped by dataset variant; both lists hold the same stations
# in the same order.
stations_full = [
    II_472_1_full,
    II_952_1_full,
    II_1345_1_full,
    II_1346_1_full,
    II_1351_1_full,
    II_1352_1_full,
]
stations_grace = [
    II_472_1_grace,
    II_952_1_grace,
    II_1345_1_grace,
    II_1346_1_grace,
    II_1351_1_grace,
    II_1352_1_grace,
]

In [None]:
# Build the windowed tensors (for the LSTM) and flat arrays (for XGBoost)
# of the selected station.
(
    data_tensor,
    target_tensor,
    test_data_tensor,
    test_target_tensor,
    train_flat_sequences,
    train_flat_targets,
    test_flat_sequences,
    test_flat_targets,
) = prepare_data_for_station(station)

ds = CustomDataset(data_tensor, target_tensor)
test_ds = CustomDataset(test_data_tensor, test_target_tensor)

# The LSTM input size is the size of the tensor's axis 2.
best_result = find_best_config(ds, test_ds, data_tensor.size(2))
best_config = best_result.metrics["config"]
print(best_config)


In the cell below you can replace any `best_config["variable"]` with your own value if you want to test it manually.

In [None]:
# Train the final model with the selected hyperparameters. The loop mirrors
# train_fn, but additionally keeps the per-epoch loss history.
ds = CustomDataset(data_tensor, target_tensor)
test_ds = CustomDataset(test_data_tensor, test_target_tensor)

train_loader = DataLoader(ds, batch_size=best_config["batch_size"], shuffle=False)
test_dataloader = DataLoader(test_ds, batch_size=best_config["batch_size"], shuffle=False)

early_stopping = EarlyStopping(tolerance=best_config["tolerance"], min_delta=best_config["min_delta"])

model = LSTMModel(best_config["input_size"], best_config["hidden_size"], best_config["num_layer"], 1)
model.to(device)
criterion = torch.nn.HuberLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=best_config["learning_rate"])

# Per-epoch history: sum of the per-batch losses on the train / test loader.
epoch_losses = []
epoch_test_losses = []

for epoch in range(best_config["epoch"]):
    # --- training pass ---
    model.train()
    train_batch_losses = []
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        train_batch_losses.append(loss.item())
        optimizer.step()
    epoch_loss = np.sum(train_batch_losses)
    epoch_losses.append(epoch_loss)

    # --- evaluation pass (no gradients) ---
    model.eval()
    test_batch_losses = []
    with torch.no_grad():
        for batch_test_x, batch_test_y in test_dataloader:
            batch_test_x = batch_test_x.to(device)
            batch_test_y = batch_test_y.to(device)
            test_loss = criterion(model(batch_test_x), batch_test_y)
            test_batch_losses.append(test_loss.item())
    epoch_test_loss = np.sum(test_batch_losses)
    epoch_test_losses.append(epoch_test_loss)

    # Stop once the test/train loss gap exceeded min_delta for `tolerance`
    # consecutive epochs.
    early_stopping(epoch_loss, epoch_test_loss)
    if early_stopping.early_stop:
        break

In [None]:
# Evaluate the trained LSTM on the whole test set. The data loader yields CPU
# tensors, so the model is moved to the CPU first.
y_true = []
y_pred = []
model.cpu()
model.eval()
with torch.no_grad():
    # Iterate over every batch: stopping after the first one would report
    # metrics for a single batch only, which is not comparable with the
    # XGBoost metrics computed on the full test set.
    for batch_x, batch_y in test_dataloader:
        y_true += batch_y.tolist()
        y_pred += model(batch_x).tolist()
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
# Sanity check: the first predictions next to the first target. Slicing keeps
# this safe when the test set has fewer than three samples.
print(*y_pred[:3], *y_true[:1])

In [None]:
# Create the output directory first so that a missing folder cannot make the
# save fail after training has finished.
os.makedirs(out_dir, exist_ok=True)
# NOTE(review): station[:-6] removes "pickle" but keeps the dot, so the file
# name ends in "..pt". It is left unchanged in case other code loads the model
# under that name. torch.save(model, ...) pickles the whole module; saving
# model.state_dict() is the more portable alternative.
torch.save(model, os.path.join(out_dir, "model_" + station[:-6] + ".pt"))

In [None]:
# XGBoost baseline with default hyperparameters, fitted on the flattened
# training windows and evaluated on the flattened test windows.
xgb_model = xgb.XGBRegressor()
trained_model = xgb_model.fit(
    train_flat_sequences,
    train_flat_targets,
)

test_forecasts = trained_model.predict(test_flat_sequences)
print(test_forecasts.shape)

In [None]:
# Error metrics of the XGBoost baseline on the test windows.
xgb_mse = mean_squared_error(test_flat_targets, test_forecasts)
xgb_mae = mean_absolute_error(test_flat_targets, test_forecasts)

for label, value in (("MSE", xgb_mse), ("MAE", xgb_mae)):
    print(f"{label}: {value:.4f}")