# Exercise 11

The purpose of this exercise is to develop as accurate as possible RNN (recurrent neural network) model that predicts electricity consumption in a property. 

Prediction:
1. Predict the next hour electricity consumption
2. Predict the next day (24h) electricity consumption 

The data set shows the hourly consumption of the property for the period from 1 January 2017 to 28 February 2022. The files are named as year-month.csv i.e. 2022-2.csv includes consumption from February 2022. Note: the timestamp 1.1.2017 00:00 is the electricity consumption from 1.1.2017 00:00 to 01:00. 

Zipped dataset file can be found from the Moodle: Electricity_consumption.zip

Dataset splitting:
1. Training dataset: 1.1.2017-30.9.2020
2. Test dataset: 1.10.2020-30.9.2021
3. "Another test dataset", which is used to when your RNN is ready: 1.10.2021-28.2.2022 


Enrich data using open data from (Finnish meteorological Institute)[https://en.ilmatieteenlaitos.fi/]. 
- The weather station location is: Jyväskylä lentoasema, 137208, longitude: 62.39, latitude: 25.69
- Enriched dataset have to contain AT LEAST the temperature, but you can use other information such as windy, humidity etc.


In [1]:
import glob
import os
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
tf.keras.backend.set_floatx('float64')

from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from sklearn.metrics import mean_squared_error
import math


In [20]:
# files are ANSI-encoded
# got code for reading from Suvi Tarkiainen, added skipfooter
files = glob.glob('electricity_consumption/*.csv')
df_consumption = pd.concat([pd.read_csv(fp,encoding='windows-1252',sep=';', decimal = ',', skipfooter = 1, engine='python').assign(Year=os.path.basename(fp).split('-')[0]) 
       for fp in files])

# TO DO: Is this UTC+2 time? Haven't used datetime's tzinfo-attribute.

# Create datetime-column and drop extra
df_consumption[['Date', 'Time']] = df_consumption['Time'].str.split(' ', 1, expand = True)
df_consumption['Datetime'] = df_consumption['Time'] + ' ' + df_consumption['Date'] + df_consumption['Year']   
df_consumption['Datetime'] = pd.to_datetime(df_consumption['Datetime'], dayfirst = True)
df_consumption.drop(['Time', 'Date', 'Year'], axis = 1, inplace = True)

#df_consumption.loc[:,'Datetime'] = pd.to_datetime(df_consumption.loc[:, 'Datetime'], format='%Y-%m-%d %H:%M:%S')
#df_consumption.sort_values(by="Datetime", inplace = True)
#df_consumption.set_index('Datetime', inplace = True)
df_consumption

## Checking datetime conversion
# df_consumption.loc['2022-03-01 00:00:00'] # March 1st returns empty
# df_consumption.loc['2022-01-03 00:00:00'] # January 3rd returns value

Unnamed: 0,Power,Datetime
0,1.39,2017-01-01 00:00:00
1,3.08,2017-01-01 01:00:00
2,0.84,2017-01-01 02:00:00
3,1.66,2017-01-01 03:00:00
4,0.76,2017-01-01 04:00:00
...,...,...
667,3.61,2022-02-28 19:00:00
668,2.42,2022-02-28 20:00:00
669,5.97,2022-02-28 21:00:00
670,1.12,2022-02-28 22:00:00


In [21]:
# Enrich data using open data from (Finnish meteorological Institute)[https://en.ilmatieteenlaitos.fi/].
# The weather station location is: Jyväskylä lentoasema, 137208, longitude: 62.39, latitude: 25.69
# Enriched dataset have to contain AT LEAST the temperature, but you can use other information such as windy, humidity etc.
# I chose Precipitation amount, Air temperature, Wind speed

df_weather = pd.read_csv('data/ex-11-weather-data-jkl-lentoasema.csv')

df_weather['Datetime'] = df_weather['Year'].astype(str) + '.' + df_weather['m'].astype(str) + '.' + df_weather['d'].astype(str) + ' ' + df_weather['Time'] + ':00'
df_weather['Datetime'] = pd.to_datetime(df_weather['Datetime'])

# Data UTC-ajassa, joten oikeasti +2h, toisen datan osalta ei tietoa onko UTC vai UTC+2. 
# TO DO: Pitäisikö lisätä timedate-objectiin tzinfo (https://docs.python.org/3/library/datetime.html) tai jollain muulla tavalla mätsätä

df_weather.drop(['Year', 'm', 'd', 'Time', 'Time zone' ], axis = 1, inplace = True)

# df_weather.dtypes
# df_weather.set_index('Datetime', inplace = True)
# print(df_weather.loc['2022-03-01 00:00:00']) # March 1st returns value - mutta tätä ei tarvita
# print(df_weather.loc['2022-01-03 00:00:00']) # January 3rd returns value

df_weather

Unnamed: 0,Precipitation amount (mm),Air temperature (degC),Wind speed (m/s),Datetime
0,0.0,2.2,4.9,2017-01-01 00:00:00
1,0.0,1.9,4.1,2017-01-01 01:00:00
2,0.0,1.7,4.1,2017-01-01 02:00:00
3,0.0,1.3,4.8,2017-01-01 03:00:00
4,0.0,0.7,4.4,2017-01-01 04:00:00
...,...,...,...,...
45236,0.0,-1.1,2.0,2022-02-28 20:00:00
45237,0.0,-0.9,2.3,2022-02-28 21:00:00
45238,0.0,-1.6,2.1,2022-02-28 22:00:00
45239,0.0,-2.2,1.9,2022-02-28 23:00:00


In [4]:
#df_consumption_weather = pd.merge(df, df_weather, how='inner', left_on='Datetime', right_on='Datetime')
df_consumption_weather = pd.merge(df_consumption, df_weather, how='inner', on='Datetime')

df_consumption_weather

Unnamed: 0,Power,Datetime,Precipitation amount (mm),Air temperature (degC),Wind speed (m/s)
0,1.39,2017-01-01 00:00:00,0.0,2.2,4.9
1,3.08,2017-01-01 01:00:00,0.0,1.9,4.1
2,0.84,2017-01-01 02:00:00,0.0,1.7,4.1
3,1.66,2017-01-01 03:00:00,0.0,1.3,4.8
4,0.76,2017-01-01 04:00:00,0.0,0.7,4.4
...,...,...,...,...,...
45235,3.61,2022-02-28 19:00:00,0.0,-1.0,1.6
45236,2.42,2022-02-28 20:00:00,0.0,-1.1,2.0
45237,5.97,2022-02-28 21:00:00,0.0,-0.9,2.3
45238,1.12,2022-02-28 22:00:00,0.0,-1.6,2.1


In [5]:
df_consumption_weather.isnull().sum()

Power                          0
Datetime                       0
Precipitation amount (mm)    233
Air temperature (degC)       161
Wind speed (m/s)             409
dtype: int64

In [6]:
df_consumption_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45240 entries, 0 to 45239
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Power                      45240 non-null  float64       
 1   Datetime                   45240 non-null  datetime64[ns]
 2   Precipitation amount (mm)  45007 non-null  float64       
 3   Air temperature (degC)     45079 non-null  float64       
 4   Wind speed (m/s)           44831 non-null  float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 2.1 MB


In [7]:
df_consumption_weather.set_index('Datetime', inplace = True)

# Seems like the longest period without some enriched data is 52h
# for now I'll just interpolate it, but it is not the best way for this sort of data
# TO DO if time: when filling, take into account time of day etc.
df_consumption_weather.interpolate(method= 'time', limit_direction='both', limit = 26, inplace = True) 

# https://www.kaggle.com/code/juejuewang/handle-missing-values-in-time-series-for-beginners/report
# https://machinelearningmastery.com/resample-interpolate-time-series-data-python/

In [8]:
df_consumption_weather.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 45240 entries, 2017-01-01 00:00:00 to 2022-02-28 23:00:00
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Power                      45240 non-null  float64
 1   Precipitation amount (mm)  45240 non-null  float64
 2   Air temperature (degC)     45240 non-null  float64
 3   Wind speed (m/s)           45240 non-null  float64
dtypes: float64(4)
memory usage: 1.7 MB


In [9]:
df_consumption_weather.shape[0]

45240

In [10]:
#sns.lineplot(y='Power', data=df_consumption_weather) #hue='Wind speed (m/s)'

In [23]:
# Dataset splitting:

# Training dataset: 1.1.2017-30.9.2020
# Test dataset: 1.10.2020-30.9.2021
# "Another test dataset", which is used to when your RNN is ready: 1.10.2021-28.2.2022

df_train = df_consumption_weather.loc['2017-01-01 00:00:00':'2020-09-30 23:00:00']
df_test = df_consumption_weather.loc['2020-10-01 00:00:00':'2021-09-30 23:00:00']
df_test2 = df_consumption_weather.loc['2021-10-01 00:00:00':'2022-02-28 23:00:00'] 


In [12]:
df_train.shape

(35064, 4)

In [24]:
print(f'{df_train.shape} ja {1369 * 24}')
print(f'{df_test.shape} ja {365 * 24}')
print(f'{df_test2.shape} ja {151 * 24}')

(32855, 4) ja 32856
(8760, 4) ja 8760
(3625, 4) ja 3624


In [27]:
# Saving prepared data to csv-files to be used for rest of the training of Exercise 11 and Exercise 12

df_train.to_csv('data/df_11_train.csv')
df_test.to_csv('data/df_11_test.csv')
df_test2.to_csv('data/df_11_test2.csv')

# Rest of the training is done on another workbook, because my local environment run into this issue 
https://github.com/tensorflow/models/issues/9706

In [14]:
def split_to_features_and_target(data):
    y = data[['Power']].copy()
    X = data.drop(['Power'], axis = 1).copy()
    return X, y

X_train, y_train = split_to_features_and_target(df_train)
X_test, y_test = split_to_features_and_target(df_test)
X_test2, y_test2 = split_to_features_and_target(df_test2)

# Scaling

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test2_scaled = scaler.transform(X_test2)

scalero = MinMaxScaler()
scalero.fit(y_train)
y_train_scaled = scalero.transform(y_train)
y_test_scaled = scalero.transform(y_test)
y_test2_scaled = scalero.transform(y_test2)

In [15]:
X_train_scaled

array([[0.        , 0.5264    , 0.32885906],
       [0.        , 0.5216    , 0.27516779],
       [0.        , 0.5184    , 0.27516779],
       ...,
       [0.        , 0.6624    , 0.0738255 ],
       [0.        , 0.6528    , 0.04697987],
       [0.        , 0.6528    , 0.06711409]])

In [16]:
# Stepify Satu Sadulta! Muokattu hieman.

def stepify(data):
    lookback = 72
    X=[]

    for i in range(len(data) - lookback - 1):
        t=[]
        for j in range(0, lookback):
            t.append(data[[(i + j)], :])
        X.append(t)
        
    X = np.array(X)
    X = X.reshape(X.shape[0], lookback, data.shape[1]) # data.shape[1] is used for getting no, features/targets
    print(X.shape)
    return X

X_train_scaled_steps = stepify(X_train_scaled)
X_test_scaled_steps = stepify(X_test_scaled)
X_test2_scaled_steps = stepify(X_test2_scaled)

y_train_scaled_steps = stepify(y_train_scaled)
y_test_scaled_steps = stepify(y_test_scaled)
y_test2_scaled_steps = stepify(y_test2_scaled)



(34991, 72, 3)
(16727, 72, 3)
(9359, 72, 3)
(34991, 72, 1)
(16727, 72, 1)
(9359, 72, 1)


In [17]:
print(X_train_scaled_steps)

[[[0.         0.5264     0.32885906]
  [0.         0.5216     0.27516779]
  [0.         0.5184     0.27516779]
  ...
  [0.01360544 0.3008     0.37583893]
  [0.         0.2944     0.3557047 ]
  [0.         0.288      0.34899329]]

 [[0.         0.5216     0.27516779]
  [0.         0.5184     0.27516779]
  [0.         0.512      0.32214765]
  ...
  [0.         0.2944     0.3557047 ]
  [0.         0.288      0.34899329]
  [0.         0.2816     0.26845638]]

 [[0.         0.5184     0.27516779]
  [0.         0.512      0.32214765]
  [0.         0.5024     0.29530201]
  ...
  [0.         0.288      0.34899329]
  [0.         0.2816     0.26845638]
  [0.         0.2688     0.2885906 ]]

 ...

 [[0.         0.6368     0.03355705]
  [0.         0.6128     0.04026846]
  [0.         0.6112     0.05369128]
  ...
  [0.         0.6704     0.0738255 ]
  [0.         0.6704     0.0738255 ]
  [0.         0.6672     0.10067114]]

 [[0.         0.6128     0.04026846]
  [0.         0.6112     0.05369128]


In [19]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from sklearn.metrics import mean_squared_error
import math

# 1 hour prediction

input_layer = tf.keras.Input(shape=(72,3))
rnn = tf.keras.layers.SimpleRNN(72)(input_layer)
output = tf.keras.layers.Dense(1)(rnn)
rnn_model = tf.keras.Model(inputs = input_layer,
                           outputs = output)

rnn_model.compile(optimizer='adam',
                   loss='mse',
                   metrics=['mean_squared_error'])

rnn_model.fit(X_train_scaled_steps, y_train_scaled_steps, epochs=10, verbose=1)
test_results = rnn_model.evaluate(X_test_scaled_steps, y_test_scaled_steps, verbose=0)
predictions = rnn_model.predict(X_test_scaled_steps)
print(f"Test loss {test_results[0]}")

NotImplementedError: Cannot convert a symbolic Tensor (simple_rnn_1/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

# Conclusion

Please see Exercise11-training-model-in-colab.ipynb



In [None]:
# own notes, please ignore

## split when column
# split_date_train = datetime(2020, 9, 30) + timedelta(days=1)
# split_date_test = datetime(2021, 9 ,30) + timedelta(days=1)

# df_training = df_consumption_weather.loc[df_consumption_weather['Datetime'] <= split_date_train ]
# df_test = df_consumption_weather.loc[(df_consumption_weather['Datetime'] > split_date_train) & (df_consumption_weather['Datetime'] <= split_date_test)]
# df_test2 = df_consumption_weather.loc[df_consumption_weather['Datetime'] > split_date_test]


# --

# split_date_train = datetime(2020, 9, 30) + timedelta(hours=23)
# split_date_test = datetime(2021, 9, 30) + timedelta(hours=23)
# df_train = df_consumption_weather[:split_date_train]
# df_test = df_consumption_weather[split_date_train + timedelta(hours=1):split_date_test]
# df_test2 = df_consumption_weather[split_date_test + timedelta(hours=1):]
