In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.dates as mdates
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger

In [45]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)

    def __getitem__(self, index):
        return (self.X[index:index+self.seq_len], self.y[index+self.seq_len-1])

In [58]:
class ColoradoDataModule(pl.LightningDataModule):
  def __init__(self, seq_len = 1, batch_size = 128, num_workers=0):
    super().__init__()
    self.seq_len = seq_len
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.X_train = None
    self.y_train = None
    self.X_val = None
    self.y_val = None
    self.X_test = None
    self.y_test = None
    self.columns = None
    self.preprocessing = None

  def prepare_data(self):
    pass

  def setup(self, stage=None):
    if stage == 'fit' and self.X_train is not None:
      return 
    if stage == 'test' and self.X_test is not None:
      return
    if stage is None and self.X_train is not None and self.X_test is not None:  
      return

    # add colorado data preprocessing instead
    df = pd.read_csv('ColoradoData_Boulder.csv')
    df.index = df['Start_DateTime']
    df = df[['Start_DateTime', 'Energy_Consumption']].sort_index()
    df.dropna(inplace=True)
    df['Start_DateTime'] = pd.to_datetime(df['Start_DateTime'], format='%Y-%m-%d %H:%M:%S')
    df.set_index('Start_DateTime', inplace=True)

    X = df.copy()
    y = X['Energy_Consumption'].shift(-1).ffill()
    self.columns = X.columns

    X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_cv, y_cv, test_size=0.25, shuffle=False)

    # scaling the data splits
    preprocessing = StandardScaler()
    preprocessing.fit(X_train)

    if stage == 'fit' or stage is None:
      self.X_train = preprocessing.transform(X_train)
      self.y_train = y_train.values.reshape((-1, 1))
      self.X_val = preprocessing.transform(X_val)
      self.y_val = y_val.values.reshape((-1, 1))

    if stage == 'test' or stage is None:
      self.X_test = preprocessing.transform(X_test)
      self.y_test = y_test.values.reshape((-1, 1))

  def train_dataloader(self):
    train_dataset = TimeSeriesDataset(self.X_train, self.y_train, seq_len=self.seq_len)
    train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
    return train_loader

  def val_dataloader(self):
    val_dataset = TimeSeriesDataset(self.X_val, self.y_val, seq_len=self.seq_len)
    val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
    return val_loader

  def test_dataloader(self):
    test_dataset = TimeSeriesDataset(self.X_test, self.y_test, seq_len=self.seq_len)
    test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
    return test_loader   

In [70]:
class LSTMRegressor(pl.LightningModule):
  def __init__(self, n_features, hidden_size, seq_len, batch_size,num_layers, dropout, learning_rate, criterion):
    super(LSTMRegressor, self).__init__()
    self.n_features = n_features
    self.hidden_size = hidden_size
    self.seq_len = seq_len
    self.batch_size = batch_size
    self.num_layers = num_layers
    self.dropout = dropout
    self.criterion = criterion
    self.learning_rate = learning_rate

    self.lstm = nn.LSTM(input_size=n_features, hidden_size=hidden_size, num_layers=num_layers, 
                        dropout=dropout, batch_first=True)
    self.linear = nn.Linear(hidden_size, 1)
        
  def forward(self, x):
    lstm_out, _ = self.lstm(x)
    y_pred = self.linear(lstm_out[:,-1])
    return y_pred
  
  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

  def training_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    train_loss = self.criterion(y_hat, y)
    self.log('train_loss', train_loss)
    return train_loss

  def validation_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    val_loss = self.criterion(y_hat, y)
    self.log('val_loss', val_loss)
    return val_loss
  
  def test_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    test_loss = self.criterion(y_hat, y)
    self.log('test_loss', test_loss)
    return test_loss

In [61]:
p = dict(
  seq_len = 12,
  batch_size = 8,
  criterion = nn.MSELoss(),
  max_epochs = 5,
  n_features = 7,
  hidden_size = 100,
  num_layers = 1,
  dropout = 1, # can be 0.2 if more output layers are present
  learning_rate = 0.001,
  # num_workers = 5,
)

In [71]:
seed_everything(1)

csv_logger = CSVLogger('./', name='lstm', version='0'),

trainer = Trainer(
  max_epochs=p['max_epochs'],
  logger=csv_logger
)

model = LSTMRegressor(
  n_features = p['n_features'],
  hidden_size = p['hidden_size'],
  seq_len = p['seq_len'],
  batch_size = p['batch_size'],
  criterion = p['criterion'],
  num_layers = p['num_layers'],
  dropout = p['dropout'],
  learning_rate = p['learning_rate']
)

dm = ColoradoDataModule(
  seq_len = p['seq_len'],
  batch_size = p['batch_size'],
  # num_workers = p['num_workers']
)

trainer.fit(model, dm)
trainer.test(model, datamodule=dm)

Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/lightning_fabric/loggers/csv_logs.py:268: Experiment logs directory ./lstm/0 exists and is not empty. Previous log files in this directory will be deleted when the new ones are saved!

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | criterion | MSELoss | 0      | train
1 | lstm      | LSTM    | 43.6 K | train
2 | linear    | Linear  | 101    | train
----------------------------------------------
43.7 K    Trainable params
0         Non-trainable params
43.7 K    Total params
0.175     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


                                                                            

/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.


Epoch 4: 100%|██████████| 72/72 [00:00<00:00, 121.81it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 72/72 [00:00<00:00, 120.58it/s, v_num=0]


/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 23/23 [00:00<00:00, 351.26it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            34.27167892456055
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 34.27167892456055}]

In [1]:
metrics = pd.read_csv('./lstm/0/metrics.csv')
train_loss = metrics[['train_loss', 'step', 'epoch']][~np.isnan(metrics['train_loss'])]
val_loss = metrics[['val_loss', 'epoch']][~np.isnan(metrics['val_loss'])]
test_loss = metrics['test_loss'].iloc[-1]

fig, axes = plt.subplots(1, 2, figsize=(16, 5), dpi=100)
axes[0].set_title('Train loss per batch')
axes[0].plot(train_loss['step'], train_loss['train_loss'])
axes[1].set_title('Validation loss per epoch')
axes[1].plot(val_loss['epoch'], val_loss['val_loss'], color='orange')
plt.show(block = True)

print('MSE:')
print(f"Train loss: {train_loss['train_loss'].iloc[-1]:.3f}")
print(f"Val loss:   {val_loss['val_loss'].iloc[-1]:.3f}")
print(f'Test loss:  {test_loss:.3f}')

NameError: name 'pd' is not defined