In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Bidirectional, Dense,Dropout,LSTM,Activation, RepeatVector, SimpleRNN

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

from datetime import datetime

import matplotlib.pyplot as plt

import os, glob


# Data Context

### The energy network of the Netherlands is managed by a few companies. The energy data is collected from the following companies.

- Enexis
- Liander
- Stedin
- Enduris
- Westlandinfra
- Rendo
- Coteq

## 1st use-case
### Time-series analysis on electricity data from Stedin company

---------
## To understand the data properly, we need to add a year column to the dataset.
## Since each file name states the year of its data, I will extract the year from the file name and store it in a separate column.

# Link to alternate model

# [Energy consumption in NL](https://www.kaggle.com/raaavan/cnn-lstm-time-series)

In [None]:
# Years to look for in the data file names, kept as strings for substring matching.
years = [str(calendar_year) for calendar_year in range(2009, 2021)]
# File-name prefixes of the grid operators; the order is relied on by the loading cell.
companies = [
    'stedin',
    'liander',
    'enduris',
    'enexis',
    'westland-infra',
    'rendo',
    'coteq',
]

In [None]:

path = r'../input/dutch-energy/Electricity/'

# One list of per-file DataFrames per grid operator; later cells concatenate them.
stedin = []
liander = []
enduris = []
enexis = []
westland_infra = []
rendo = []
coteq = []

# Map every company prefix (in `companies` order) to the list that collects its frames.
frames_by_company = dict(zip(
    companies,
    [stedin, liander, enduris, enexis, westland_infra, rendo, coteq],
))

for company in companies:
    # os.path.join avoids a doubled slash in the pattern, and sorting makes the
    # load order (and therefore the row order after concatenation) deterministic,
    # because glob returns files in arbitrary order.
    all_files = sorted(glob.glob(os.path.join(path, f"{company}*.csv")))
    for file in all_files:
        print(company, file)

        # Match against the file name only, and take the first listed year that
        # occurs in it, so a file can never be appended more than once.
        file_name = os.path.basename(file)
        year = next((candidate for candidate in years if candidate in file_name), None)

        if year is None:
            print(f"no known year found in {file}, skipping")
        else:
            print(f"adding column year {year} to {file}")
            print(f"processing for : {company}")
            comp_df = pd.read_csv(file, index_col=None, header=0)
            comp_df['year'] = year
            frames_by_company[company].append(comp_df)

        print('-------------------------------')

        


In [None]:
# Stack all per-file Stedin frames row-wise into one DataFrame with a fresh 0..n-1 index.
stedin_df = pd.concat(stedin, axis=0, ignore_index=True)
# The equivalent frames for the other operators are left disabled; uncomment to build them.
# liander_df = pd.concat(liander, axis=0, ignore_index=True)
# enduris_df = pd.concat(enduris, axis=0, ignore_index=True)
# enexis_df = pd.concat(enexis, axis=0, ignore_index=True)
# westland_infra_df = pd.concat(westland_infra, axis=0, ignore_index=True)
# rendo_df = pd.concat(rendo, axis=0, ignore_index=True)
# coteq_df = pd.concat(coteq, axis=0, ignore_index=True)

In [None]:
stedin_df.info()

In [None]:
stedin_df.isna().sum()

In [None]:
# Remove the STANDAARDDEVIATIE column. `columns=` replaces the positional axis
# argument (no longer accepted by pandas 2.x), and errors='ignore' keeps the
# cell re-runnable once the column is already gone.
stedin_df = stedin_df.drop(columns='STANDAARDDEVIATIE', errors='ignore')

In [None]:
stedin_df.head(5)

In [None]:
stedin_df.info()

### Let's focus on the top 10 cities based on their annual electricity consumption

In [None]:
# Pick the 10 cities that occur most often in the data (value_counts counts rows per city).
# NOTE(review): this ranks cities by number of rows, not by summed `annual_consume`,
# while the section heading talks about annual consumption — confirm which is intended.
top_10_cities = stedin_df['city'].value_counts().sort_values(ascending = False).nlargest(10)

In [None]:
# Keep only the rows of the selected cities. Take an explicit copy because a
# later cell adds a column to this frame; without it pandas raises
# SettingWithCopyWarning and the assignment may not stick.
top_10_df = stedin_df[stedin_df['city'].isin(top_10_cities.index)].copy()

In [None]:
top_10_df.head(2)

In [None]:
pd.DataFrame(top_10_df.year.value_counts())

In [None]:
# Bar chart of `annual_consume` summed per year over the selected cities.
fig, ax = plt.subplots(figsize=(10, 7))
yearly_consumption = top_10_df.groupby(['year'])['annual_consume'].sum()
yearly_consumption.plot.bar(ax=ax, title='Total annual consumption')

### Total consumption among 10 cities shows a slight decrease in year 2019
-------

In [None]:
# Bar chart of `smartmeter_perc` summed per year over the selected cities.
# NOTE(review): this adds up percentage values across rows; a mean may be the
# more meaningful aggregate — confirm before drawing conclusions from the scale.
fig, ax = plt.subplots(figsize=(10, 7))
yearly_smartmeter = top_10_df.groupby(['year'])['smartmeter_perc'].sum()
yearly_smartmeter.plot.bar(ax=ax, title='total smart meter usage')

### Smart meter usage has kept increasing since 2015, and within the last 5 years it has shown exponential growth

-----------

### total consumption per connection in top 10 cities

In [None]:
# Consumption divided by the number of active connections
# (num_connections scaled by the active percentage).
active_connections = top_10_df['num_connections'] * top_10_df['perc_of_active_connections'] / 100
per_connection = top_10_df['annual_consume'] / active_connections

# Rows with zero active connections divide to +inf; report those as 0.
# Using assign() builds a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
top_10_df = top_10_df.assign(
    consumption_per_connection=per_connection.mask(per_connection == np.inf, 0)
)

In [None]:
# Per-year sum of the consumption-per-connection column, shown as a bar chart.
cpc = top_10_df.groupby(['year'])['consumption_per_connection'].sum()
cpc.plot.bar(title='total consumption per connection', figsize=(10, 7))

In [None]:
# Summed annual consumption per city, largest first, limited to 10 entries.
consumption_by_city = top_10_df.groupby(['city'])['annual_consume'].sum()
city_conumption = consumption_by_city.sort_values(ascending=False).head(10)
city_conumption.plot.bar(title='total consumption per city (top 10 cities)', figsize=(10, 7))

### The graph shows that the first 3 cities have a huge consumption rate, while in the remaining cities the rate is much lower.
--------

In [None]:
# Summed annual consumption for every (year, city) pair.
# No sort/head here: unstack() lays the values out by year and city regardless
# of row order, and a fixed head(110) would silently drop the smallest pairs
# whenever there are more than 110 of them (12 years x 10 cities = 120).
t1c_ac = top_10_df.groupby(['year', 'city'])['annual_consume'].sum()
t1c_ac.unstack().plot(kind='bar', title='yearly total consumption per city (top 10 cities)', figsize=(15, 10))

----------
## Top 10 cities that use smart meters most often

In [None]:
# Rank cities by their summed `smartmeter_perc` and keep the 10 largest.
smartmeter_by_city = stedin_df.groupby(['city'])['smartmeter_perc'].sum()
top_10_cities_sm = smartmeter_by_city.sort_values(ascending=False).head(10)
top_10_cities_sm.plot.bar(title='total smart meter per city (top 10 cities)', figsize=(12, 8))

In [None]:
sm_10_df = stedin_df[stedin_df['city'].isin(top_10_cities_sm.index)] 

In [None]:
# Mean `smartmeter_perc` per (year, city), sorted descending, then pivoted so
# each city becomes one bar series per year.
sm_year_city_mean = sm_10_df.groupby(['year', 'city'])['smartmeter_perc'].mean()
top_10_sm_yearly = sm_year_city_mean.sort_values(ascending=False)
top_10_sm_yearly.unstack().plot.bar(title='average yearly smart-meter in top 10 cities', figsize=(15, 10))

## Time series analysis

In [None]:
stedin_df.annual_consume.sort_values(ascending = True)

In [None]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    """Draw `y` against `x` as a red line on a new 16x5 figure and show it.

    Args:
        df: Accepted for call compatibility; not read by this function.
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Figure title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        dpi: Resolution passed to the new figure.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    axes = plt.gca()
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.show()

plot_df(stedin_df, x=stedin_df.index, y=stedin_df.annual_consume, title='data distribution from 2009 to 2020.')    

In [None]:
# Column vector of raw consumption values, shape (n_rows, 1).
raw_consumption = stedin_df.annual_consume.values.reshape(-1, 1)

# Fit the scaler on the leading 80% of rows only (the same boundary load_data
# uses for its train/test split), so the min/max of the held-out rows does not
# leak into training. Held-out values may therefore fall slightly outside [0, 1].
n_fit_rows = int(0.8 * len(raw_consumption))

sclar = MinMaxScaler(feature_range=(0, 1))
sclar.fit(raw_consumption[:n_fit_rows])
dataset = sclar.transform(raw_consumption)
dataset.shape

In [None]:
def load_data(dataset, seq_len):
    """Cut a single-column 2-D array into sliding windows and split them.

    Each sample is `seq_len` consecutive values from column 0 and its target is
    the value that immediately follows the window.

    Args:
        dataset: 2-D array; only column 0 is read.
        seq_len: Number of consecutive values per input window.

    Returns:
        A list `[X_train, y_train, X_test, y_test]` of NumPy arrays. The first
        `int(0.8 * len(dataset))` windows form the training part and the rest
        the test part; note the boundary is derived from the number of rows,
        not from the number of windows.
    """
    window_ends = range(seq_len, len(dataset))
    windows = [dataset[end - seq_len:end, 0] for end in window_ends]
    targets = [dataset[end, 0] for end in window_ends]

    boundary = int(0.8 * len(dataset))

    return [
        np.array(windows[:boundary]),
        np.array(targets[:boundary]),
        np.array(windows[boundary:]),
        np.array(targets[boundary:]),
    ]

In [None]:
seq_len = 20  # number of consecutive values in each input window

X_train, y_train, X_test, y_test = load_data(dataset, seq_len)

# Add a trailing axis of size 1 so the inputs are (samples, seq_len, 1).
X_train = np.reshape(X_train, (len(X_train), seq_len, 1))
X_test = np.reshape(X_test, (len(X_test), seq_len, 1))

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Seed TensorFlow's global generator so weight initialisation is repeatable
# between runs (this helps reproducibility; it does not guarantee bit-identical
# results on every backend).
tf.random.set_seed(42)

model = Sequential()
# Declare the (time steps, features) input on the model itself instead of on
# the LSTM wrapped inside Bidirectional, so the input shape is explicit.
model.add(tf.keras.Input(shape=(X_train.shape[1], 1)))
# Two stacked bidirectional LSTMs: the first returns the full sequence for the
# second, which returns only its final output.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=False, activation='relu')))
# Dense head narrowing down to a single regression output.
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(loss='mse', optimizer='adam')
# 30% of the training samples are held out by Keras for validation.
hist = model.fit(X_train, y_train, epochs=5, validation_split=0.3, batch_size=1024)
model.summary()

In [None]:
# Training vs. validation loss per epoch. matplotlib.pyplot is already imported
# as `plt` in the notebook's import cell, so it is not re-imported here.
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(hist.history['loss'], 'b', label='train loss', linewidth=2)
ax.plot(hist.history['val_loss'], 'r', label='Validation loss', linewidth=2)
ax.set(title='model loss', ylabel='mse', xlabel='epoch')
ax.legend()
plt.show()

In [None]:
pred = model.predict(X_test)

In [None]:
# Compare the shapes of the predictions and the test targets
# (`y_test_in` is not defined anywhere in this notebook; `y_test` is the target array).
pred.shape, y_test.shape

In [None]:
def eval_score(y_test, y_true):
    """Return the MAE and MSE between two arrays as a formatted string.

    Both arguments are forwarded, in this order, to scikit-learn's
    `mean_absolute_error` and `mean_squared_error`.

    Args:
        y_test: First array of values.
        y_true: Second array of values.

    Returns:
        A string of the form "mae : <mae>, mse : <mse>".
    """
    mae, mse = (
        metric(y_test, y_true)
        for metric in (mean_absolute_error, mean_squared_error)
    )
    return f"mae : {mae}, mse : {mse}"

In [None]:
eval_score(y_test, pred)

In [None]:
def plot_predictions(test, predicted, title):
    """Overlay actual and predicted values on one 16x4 figure and show it.

    Args:
        test: Actual values, drawn in blue.
        predicted: Predicted values, drawn in semi-transparent red.
        title: Figure title.
    """
    plt.figure(figsize=(16, 4))
    curves = (
        (test, dict(color='blue', label='Actual power consumption data')),
        (predicted, dict(alpha=0.7, color='red', label='Predicted power consumption data')),
    )
    for values, line_style in curves:
        plt.plot(values, **line_style)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Normalized power consumption scale')
    plt.legend()
    plt.show()


plot_predictions(y_test, pred, "Predictions made by model")