## Simple LSTM with Colorado Dataset

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import matplotlib.dates as mdates
import pytorch_lightning as pl
import holidays

# Check if GPU or MPS is available
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

print (f"Using device: {device}")



Using device: mps


## Preprocessing Colorado Data (Convert + Features)

In [15]:
def convert_to_hourly(data, start_date, end_date):
    # Convert date/time columns to datetime
    data['Start_DateTime'] = pd.to_datetime(data['Start_DateTime'])
    data['End_Time'] = pd.to_datetime(data['End_DateTime'])
    data['Charging_Time'] = pd.to_timedelta(data['Charging_Time'])

    ####################### FILTER DATASET  #######################

    # Take data from 30/11/2021 to 30/11/2023
    data = data[(data['Start_DateTime'] >= start_date) & (data['Start_DateTime'] <= end_date)].copy()

    # Calculate the end of the charging interval as start time + charging time
    data['Charging_EndTime'] = data['Start_DateTime'] + data['Charging_Time']

    # Sort the data by the Start_DateTime column
    data = data.sort_values(by=['Start_DateTime'], ascending=True)

    # Remove duplicates
    data = data.drop_duplicates(subset=['Start_DateTime', 'Charging_Time', 'Energy_Consumption'])


    ####################### CONVERT DATASET TO HOURLY  #######################

    # Split the session into hourly intervals
    hourly_rows = []

    # Iterate over each row in the dataframe to break charging sessions into hourly intervals
    for _, row in data.iterrows():
        start, end = row['Start_DateTime'], row['Charging_EndTime']
        energy = row['Energy_Consumption']

        # Generate hourly intervals
        hourly_intervals = pd.date_range(
            start=start.floor('h'), end=end.ceil('h'), freq='h')
        total_duration = (end - start).total_seconds()

        for i in range(len(hourly_intervals) - 1):
            interval_start = max(start, hourly_intervals[i])
            interval_end = min(end, hourly_intervals[i+1])
            interval_duration = (interval_end - interval_start).total_seconds()

            energy_fraction = (interval_duration / total_duration) * energy

            hourly_rows.append({
                'Time': hourly_intervals[i],
                'Energy_Consumption': energy_fraction,
                "Session_Count": 1  # Count of sessions in the interval
            })

    # Create a new dataframe from the hourly intervals
    hourly_df = pd.DataFrame(hourly_rows)

    # Aggregate the hourly intervals
    hourly_df = hourly_df.groupby('Time').agg({
        'Energy_Consumption': 'sum',
        'Session_Count': 'sum'
    }).reset_index()

    # Convert the Time column to datetime
    hourly_df['Time'] = pd.to_datetime(hourly_df['Time'], format="%d-%m-%Y %H:%M:%S")
    hourly_df = hourly_df.set_index('Time')

    # Define time range for all 24 hours
    start_time = hourly_df.index.min().normalize()  # 00:00:00
    end_time = hourly_df.index.max().normalize() + pd.Timedelta(days=1) - pd.Timedelta(hours=1)  # 23:00:00

    # Change range to time_range_full, so from 00:00:00 to 23:00:00
    time_range_full = pd.date_range(start=start_time, end=end_time, freq='h')

    # Reindex the hourly data to include all hours in the range
    hourly_df = hourly_df.reindex(time_range_full, fill_value=0)

    # Return the hourly data
    return hourly_df

def add_features(hourly_df, start_date, end_date):
  ####################### TIMED BASED FEATURES  #######################
  hourly_df['Day_of_Week'] = hourly_df.index.dayofweek

  # Add hour of the day
  hourly_df['Hour_of_Day'] = hourly_df.index.hour

  # Add month of the year
  hourly_df['Month_of_Year'] = hourly_df.index.month

  # Add year
  hourly_df['Year'] = hourly_df.index.year

  # Add day/night
  hourly_df['Day/Night'] = (hourly_df['Hour_of_Day']
                            >= 6) & (hourly_df['Hour_of_Day'] <= 18)

  # Add holiday
  us_holidays = holidays.US(years=range(start_date.year, end_date.year + 1))
  hourly_df['IsHoliday'] = hourly_df.index.map(lambda x: 1 if x.date() in us_holidays else 0)

  # Add weekend
  hourly_df['Weekend'] = (hourly_df['Day_of_Week'] >= 5).astype(int)

  ####################### CYCLIC FEATURES  #######################
  # Cos and sin transformations for cyclic features (hour of the day, day of the week, month of the year)

  hourly_df['HourSin'] = np.sin(2 * np.pi * hourly_df['Hour_of_Day'] / 24)
  hourly_df['HourCos'] = np.cos(2 * np.pi * hourly_df['Hour_of_Day'] / 24)
  hourly_df['DayOfWeekSin'] = np.sin(2 * np.pi * hourly_df['Day_of_Week'] / 7)
  hourly_df['DayOfWeekCos'] = np.cos(2 * np.pi * hourly_df['Day_of_Week'] / 7)
  hourly_df['MonthOfYearSin'] = np.sin(2 * np.pi * hourly_df['Month_of_Year'] / 12)
  hourly_df['MonthOfYearCos'] = np.cos(2 * np.pi * hourly_df['Month_of_Year'] / 12)

  ####################### SEASONAL FEATURES  #######################
  month_to_season = {1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer',
                     7: 'Summer', 8: 'Summer', 9: 'Autumn', 10: 'Autumn', 11: 'Autumn', 12: 'Winter'}
  hourly_df['Season'] = hourly_df['Month_of_Year'].map(month_to_season)

  ####################### HISTORICAL CONSUMPTION FEATURES  #######################
  # Lag features
  # 1h
  hourly_df['Energy_Consumption_1h'] = hourly_df['Energy_Consumption'].shift(1)

  # 6h
  hourly_df['Energy_Consumption_6h'] = hourly_df['Energy_Consumption'].shift(6)

  # 12h
  hourly_df['Energy_Consumption_12h'] = hourly_df['Energy_Consumption'].shift(12)

  # 24h
  hourly_df['Energy_Consumption_24h'] = hourly_df['Energy_Consumption'].shift(24)

  # 1 week
  hourly_df['Energy_Consumption_1w'] = hourly_df['Energy_Consumption'].shift(24*7)

  # Rolling average
  # 24h
  hourly_df['Energy_Consumption_rolling'] = hourly_df['Energy_Consumption'].rolling(window=24).mean()

  return hourly_df

# Define the start and end dates
start_date = pd.to_datetime('2021-11-30')
end_date = pd.to_datetime('2023-11-30')

# Load the data
data = pd.read_csv('Colorado/Preprocessing/TestDataset/CleanedColoradoData.csv')

# Convert to hourly data
hourly_df = convert_to_hourly(data=data, start_date=start_date, end_date=end_date)

# Add features
hourly_df = add_features(hourly_df, start_date, end_date)

print(hourly_df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17544 entries, 2021-11-30 00:00:00 to 2023-11-30 23:00:00
Freq: h
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Energy_Consumption          17544 non-null  float64
 1   Session_Count               17544 non-null  int64  
 2   Day_of_Week                 17544 non-null  int32  
 3   Hour_of_Day                 17544 non-null  int32  
 4   Month_of_Year               17544 non-null  int32  
 5   Year                        17544 non-null  int32  
 6   Day/Night                   17544 non-null  bool   
 7   IsHoliday                   17544 non-null  int64  
 8   Weekend                     17544 non-null  int64  
 9   HourSin                     17544 non-null  float64
 10  HourCos                     17544 non-null  float64
 11  DayOfWeekSin                17544 non-null  float64
 12  DayOfWeekCos                17544 non-null  f

## Scaling Dataset and Creating Sequences

In [21]:
scaler = MinMaxScaler()
features = ['Session_Count', 'HourSin', 'HourCos', 'DayOfWeekSin', 'DayOfWeekCos',
            'MonthOfYearSin', 'MonthOfYearCos', 'IsHoliday', 'Weekend',
            'Energy_Consumption_1h', 'Energy_Consumption_6h', 'Energy_Consumption_12h',
            'Energy_Consumption_24h']
target = 'Energy_Consumption'

hourly_df = scaler.fit_transform(hourly_df[features + [target]])

# Convert the numpy array to a PyTorch tensor
hourly_tensor = torch.tensor(hourly_df, dtype=torch.float32)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Create Sequences, Split dataset and Dataloaders

In [23]:
def create_sequences(data, target_idx, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        # Append all the features except target
        X.append(data[i:i+seq_length, :-1])
        y.append(data[i+seq_length, target_idx])  # Target feature
    return torch.stack(X), torch.tensor(y)

# Number of past hours used for prediction
seq_length = 24
target_idx = target
X, y = create_sequences(hourly_tensor, target_idx=target_idx, seq_length=seq_length)

TypeError: new(): invalid data type 'str'

## DataLoaders 

## LSTM Model

In [11]:
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer_size), torch.zeros(1, 1, self.hidden_layer_size))
        self.name = "LSTM"

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(
            input_seq.view(len(input_seq), 1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

## Training and Validation

## Testing Model