In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM

In [2]:
# Read in energy generation data as dataframe
generation_df = pd.read_csv('Data/energy_generation_data.csv', index_col=0, parse_dates=True)

generation_df.head()

Unnamed: 0_level_0,coal,natural gas,nuclear,petroleum,other,solar,hydro,wind
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-12-31,1870097,4270620,2245684,7293,221858,305983,632266,1074159
2023-12-30,1964710,4369981,2215492,7661,224151,322528,628762,1064144
2023-12-29,1948705,4482785,2228321,8311,216563,312185,718066,1109628
2023-12-28,1776641,4480968,2224448,9639,209483,309365,701614,1119395
2023-12-27,1679634,4397551,2215389,8236,218310,262265,723729,1093226


In [3]:
# Reorder to dates ascending
generation_df = generation_df.sort_index()

generation_df.head()

Unnamed: 0_level_0,coal,natural gas,nuclear,petroleum,other,solar,hydro,wind
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01,2596861,2799020,2476849,10747,179678,102279,804553,561924
2019-01-02,2815323,3652337,2487248,6718,197419,105653,905440,491233
2019-01-03,2702126,3595766,2485906,8444,198361,109039,869689,709083
2019-01-04,2664270,3543125,2469281,7251,193085,110691,849861,584117
2019-01-05,2482182,3043584,2442032,5209,185661,67261,824607,702299


In [4]:
# Create total column
generation_df['total_generated'] = generation_df.sum(axis=1)

# Isolate total column to use in univariate forecast
total_generated_df = generation_df[['total_generated']]

total_generated_df.head()

Unnamed: 0_level_0,total_generated
date,Unnamed: 1_level_1
2019-01-01,9531911
2019-01-02,10661371
2019-01-03,10678414
2019-01-04,10421681
2019-01-05,9752835


In [7]:
def split_time_dataset(df, test_size=0.2):
    '''
    Split a time-series dataframe with a datetime index into training
    and test sets.
    '''
    train_size = int(len(df) * (1 - test_size))
    
    train_data, test_data = df.iloc[:train_size], df.iloc[train_size:]
    
    return train_data, test_data

In [12]:
def create_windows(data, window_size=10):
    '''
    Return data segmented into windows of a set size as a numpy array.
    '''
    X, y = [], []
    
    for i in range(len(data) - window_size - 1):
        X.append(data[i:(i + window_size), 0])
        y.append(data[i + window_size, 0])
        
    return np.array(X), np.array(y)

In [10]:
def scale_data(train_data, test_data):
    '''
    Fit an instance of MinMaxScaler using training data and then
    transforming the training and test sets.
    '''
    scaler = MinMaxScaler()
    
    train_data_scaled = scaler.fit_transform(train_data)
    test_data_scaled = scaler.transform(test_data)
    
    return train_data_scaled, test_data_scaled

In [13]:
# Split total_generated_df into training and test sets
train_data, test_data = split_time_dataset(total_generated_df)

# Scale data
train_data_scaled, test_data_scaled = scale_data(train_data, test_data)

# Create windowed datasets
X_train, y_train = create_windows(train_data_scaled)
X_test, y_test = create_windows(test_data_scaled)

print(X_train.shape, X_train[:3], y_train[:3], X_test.shape, sep='\n')

(1449, 10)
[[0.20317011 0.38060215 0.38327951 0.34294815 0.23787607 0.20615883
  0.30500588 0.28446734 0.37684068 0.4728949 ]
 [0.38060215 0.38327951 0.34294815 0.23787607 0.20615883 0.30500588
  0.28446734 0.37684068 0.4728949  0.45130461]
 [0.38327951 0.34294815 0.23787607 0.20615883 0.30500588 0.28446734
  0.37684068 0.4728949  0.45130461 0.34619074]]
[0.45130461 0.34619074 0.35414428]
(355, 10)
