## GRU preprocessing

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/TCQ_DemandSum_DST.csv')
df['Actual'] = df['Sum of DST']
print(df)

       Year  Month  Day  Period  Sum of Demand_Sum   Sum of DST  Sum of TCQ  \
0      2023   July    1       1               6120  5633.853332  486.146668   
1      2023   July    1       2               6020  5554.891608  465.108392   
2      2023   July    1       3               5911  5462.731620  448.268380   
3      2023   July    1       4               5843  5408.646667  434.353333   
4      2023   July    1       5               5798  5375.573341  422.426659   
...     ...    ...  ...     ...                ...          ...         ...   
12331  2024  March   13      44               6617  6094.329111  522.670889   
12332  2024  March   13      45               6468  5975.891026  492.108974   
12333  2024  March   13      46               6333  5844.349746  488.650254   
12334  2024  March   13      47               6198  5762.694101  435.305899   
12335  2024  March   13      48               6048  5653.727449  394.272551   

            Actual  
0      5633.853332  
1      55

In [2]:
import matplotlib.pyplot as plt

# Convert month names to numeric representation
month_dict = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
              'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}
df['Month'] = df['Month'].map(month_dict)

# Combine year, month, day and period to form datetime
df['datetime'] = pd.to_datetime(df[['Year', 'Month', 'Day']]) + pd.to_timedelta((df['Period'] - 1) * 30, unit='minutes')

# Set datetime column as the index
df.set_index('datetime', inplace=True)

In [3]:
from sklearn.preprocessing import StandardScaler

# Define sequence length
sequence_length = 5  # Define the length of each sequence

# Extract "Actual" column to form single feature seqeunce
demand_seq = df['Actual'].values

# Scale the values
scaler = StandardScaler()
scaled_demand_seq = scaler.fit_transform(demand_seq.reshape(-1, 1))

# Initialize lists to store input sequences and corresponding labels
input_sequences = []
labels = []

# Create overlapping sequences of length sequence_length
for i in range(len(scaled_demand_seq) - sequence_length):
    # Extract a sequence of length sequence_length
    sequence = scaled_demand_seq[i : i + sequence_length]
    # Append the sequence to the input_sequences list
    input_sequences.append(sequence)
    # Append the next value (label) after the sequence to the labels list
    labels.append(scaled_demand_seq[i + sequence_length])

# Convert the lists to numpy arrays
input_sequences = np.array(input_sequences)
labels = np.array(labels)

# Print the shapes of input_sequences and labels
print("Input sequences shape:", input_sequences.shape)
print("Labels shape:", labels.shape)

Input sequences shape: (12331, 5, 1)
Labels shape: (12331, 1)


In [4]:
# Splitting into train test set
from sklearn.model_selection import train_test_split

# Splitting into train and test sets. Set shuffle = False to preserve temporal order of the data
X_train, X_test, y_train, y_test = train_test_split(input_sequences, labels, test_size=0.2, shuffle=False)

# For validation set
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)


In [5]:
import tensorflow as tf

# Define the GRU model
model = tf.keras.Sequential([
    tf.keras.layers.GRU(units=64, input_shape=(sequence_length, 1)),
    tf.keras.layers.Dense(units=1)  # Output layer for regression
])

# Compile the model with RMSE as the evaluation metric
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['RootMeanSquaredError'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)



Epoch 1/10


  super().__init__(**kwargs)


[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - RootMeanSquaredError: 0.6301 - loss: 0.4361 - val_RootMeanSquaredError: 0.1605 - val_loss: 0.0258
Epoch 2/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 0.1574 - loss: 0.0248 - val_RootMeanSquaredError: 0.1309 - val_loss: 0.0172
Epoch 3/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 0.1327 - loss: 0.0176 - val_RootMeanSquaredError: 0.1114 - val_loss: 0.0124
Epoch 4/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 0.1128 - loss: 0.0127 - val_RootMeanSquaredError: 0.1043 - val_loss: 0.0109
Epoch 5/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 0.1018 - loss: 0.0104 - val_RootMeanSquaredError: 0.0952 - val_loss: 0.0091
Epoch 6/10
[1m247/247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/

<keras.src.callbacks.history.History at 0x2900b49d0>

In [6]:
# Evaluation
# Evaluate the model on test data
loss, rmse = model.evaluate(X_test, y_test)
print("Test RMSE:", rmse)

[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - RootMeanSquaredError: 0.1051 - loss: 0.0111
Test RMSE: 0.12274099141359329
