# Wind Power forecasting for the day-ahead energy market - Data Challenge
by Compagnie Nationale du Rhône, ENS Paris & Collège de France

<p align="center"><img src="https://cap.img.pmdstatic.net/fit/http.3A.2F.2Fprd2-bone-image.2Es3-website-eu-west-1.2Eamazonaws.2Ecom.2Fcap.2F2019.2F10.2F04.2Fea495374-9115-4be7-a91a-e9bc5b305b0b.2Ejpeg/768x432/background-color/ffffff/focus-point/992%2C1086/quality/70/dangereuses-pour-la-sante-peu-ecolo-faut-il-en-finir-avec-les-eoliennes-1352031.jpg" width="600"/></p>

Challenge website: https://challengedata.ens.fr/participants/challenges/34/

## Notebook setup

In [None]:
# For personal Google Colab configuration only, please skip this cell.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/wind-power-forecasting-challenge
!pip install urllib3==1.25.4 folium==0.2.1 boto3 mlflow mpld3 --quiet
import pandas as pd
import os, mlflow
cred = pd.read_csv('aws_credentials.csv',index_col=0, squeeze=True)
os.environ['AWS_ACCESS_KEY_ID'] = cred.AWS_ACCESS_KEY_ID
os.environ['AWS_SECRET_ACCESS_KEY'] = cred.AWS_SECRET_ACCESS_KEY
mlflow.set_tracking_uri(f"http://{cred.AWS_USERNAME}:{cred.AWS_PASSWORD}@{cred.AWS_URL}")

In [3]:
# Load and configure libraries
import pandas as pd
import tensorflow as tf
import importlib
import core
import mlflow, mlflow.tensorflow
import mpld3
import os
import pandas as pd
# NOTE(review): later cells call `utilities.<helper>(...)`, but no `utilities`
# module is imported in this cell (only `core`) — confirm whether `core` was
# renamed or an `import utilities` is missing.
# Turn on MLflow autologging for TensorFlow/Keras training, with model logging disabled.
mlflow.tensorflow.autolog(every_n_iter=1,log_models=False)
# Enable mpld3 rendering inside the notebook.
mpld3.enable_notebook()

## Challenge presentation

The objective of this challenge is to design and train an ML/DL model to predict the hourly electrical production of six independent wind farms owned by CNR for the day ahead, using multiple Numerical Weather Predictions (NWP) models.

## Data
### First exploration

In this challenge, we are provided with a **training dataset** and a **test dataset**.

The **training dataset** is composed of different hourly weather forecasts (X) for a period of about 8 consecutive months (from May 1st, 2018 to January 15th, 2019), together with the associated observed power production in MW (Y). In the **test dataset**, only the weather forecasts are provided, for another period of 8 months (January 16th, 2019 to September 30th, 2019). The performance of our model is then evaluated online, by submitting its predictions on the test dataset.

In [7]:
# Load the data through the project's `core` module and preview five random rows.
df = core.load_data()
df.sample(n=5)

Unnamed: 0_level_0,WF,Time,NWP1_00h_D-2_U,NWP1_00h_D-2_V,NWP1_00h_D-2_T,NWP1_06h_D-2_U,NWP1_06h_D-2_V,NWP1_06h_D-2_T,NWP1_12h_D-2_U,NWP1_12h_D-2_V,...,NWP4_12h_D-1_U,NWP4_12h_D-1_V,NWP4_12h_D-1_CLCT,NWP4_00h_D_U,NWP4_00h_D_V,NWP4_00h_D_CLCT,NWP4_12h_D_U,NWP4_12h_D_V,NWP4_12h_D_CLCT,Production
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10017,2,2018-10-05 10:00:00,-2.4174,4.3334,292.91,-1.1047,4.2202,293.11,-1.4876,4.5239,...,-1.11628,3.61954,-2.3e-05,-0.951876,3.449261,-2.2e-05,,,,0.42
34684,6,2018-09-25 20:00:00,,,,,,,1.5862,-6.1266,...,0.721509,-1.190149,-2.2e-05,0.597763,-1.212407,-1.8e-05,0.554162,-1.325859,-2.3e-05,0.61
66449,5,2019-08-09 08:00:00,2.8611,6.9973,293.81,1.1254,8.53,294.39,0.4473,8.3214,...,,,,,,,,,,
34860,6,2018-10-03 04:00:00,-0.9959,-8.4966,284.96,-1.1464,-9.0395,284.31,-1.9634,-9.1864,...,-0.373555,-4.74417,86.14591,-1.13301,-4.015365,85.622509,,,,2.22
62832,5,2019-02-15 04:00:00,-2.9261,10.294,275.77,0.3523,7.1205,274.78,-1.4186,7.8341,...,,,,,,,,,,


A **training example** is thus associated with the power production (*Production* column) of one of the six considered wind farms (*WF* column) at a given date and time (*Time* column). For a given pair (*WF*, *Time*), we then have several weather forecasts (*NWP\<i>_\<HourOfTheRun>_\<DayOfTheRun>_\<Variable>* columns), each of them giving an estimate of a particular weather *Variable*, produced at a different time (*HourOfTheRun*, *DayOfTheRun*) before the target *Time*, and coming from a different NWP model (*i*). For instance, the run *NWP1_00h_D-2_U* estimates the weather variable *U* for a given target *Time* using the first NWP model, and is produced at midnight two days before this target *Time*.

The runs come from 4 different NWP models ($i\in[1,4]$), and forecast 4 weather variables at various times:
 
NWP Variable | Prediction description | NWP 1 (hourly) | NWP 2 (every 3 hours) | NWP 3 (every 3 hours) | NWP 4 (hourly)
------ | ----- | ----- | ----- | ----- | -----
Wind speed U,V (m/s) | 10min average [H-10min,H] | x (@100m) | x (@100m) | x (@100m) | x (@10m)
Temperature of air T (K) | 1-hour average [H-1,H] | x |  | x |
Total cloud cover CLCT (%) | instant value at H | | | | x

Further details about these forecasts can be found on the challenge webpage.

In [None]:
# Data parameters
# Passed to utilities.calculate_best_forecasts in the preprocessing cell.
# NOTE(review): presumably a weighting/decay factor applied across forecast
# runs — confirm against the utilities implementation.
FORECAST_MEMORY = 0.9

In [None]:
# Preprocess data
# Each step takes the current frame (plus optional extra arguments) and returns
# the frame handed to the next step; `df` is rebound after every step.
preprocessing_steps = (
    (utilities.calculate_best_forecasts, (FORECAST_MEMORY,)),
    (utilities.interpolate_nans, ()),
    (utilities.augment_data, ()),
    (utilities.normalize_data, ()),
)
for preprocessing_step, extra_args in preprocessing_steps:
    df = preprocessing_step(df, *extra_args)

# Model

In [None]:
# Model parameters
WINDOW_SIZE = 72  # In hours
BATCH_SIZE = 2000  # Batch size handed to utilities.get_windowed_dataset
EPOCHS = 10  # Number of epochs handed to model.fit
UNITS = 32  # Number of units of each GRU layer in train_model

In [None]:
# Training function
def train_model(t_train, x_train, y_train, t_valid, x_valid, y_valid):
    """Build, fit and inspect a three-layer GRU model on one data split.

    Reads the notebook-level WINDOW_SIZE, BATCH_SIZE, EPOCHS and UNITS, and
    logs the window size, units and layer type through mlflow.log_params.

    Args:
        t_train: Time values of the training samples, forwarded to
            utilities.predict and utilities.plot_predictions.
        x_train: Training inputs, forwarded to utilities.get_windowed_dataset.
        y_train: Training targets, forwarded to utilities.get_windowed_dataset
            and utilities.plot_predictions.
        t_valid: Validation counterpart of t_train, or None.
        x_valid: Validation counterpart of x_train, or None to train
            without validation data.
        y_valid: Validation counterpart of y_train, or None.

    Returns:
        A (model, history) tuple: the fitted tf.keras.Sequential model and the
        object returned by model.fit.
    """
    mlflow.log_params({'window_size': WINDOW_SIZE, 'units': UNITS, 'layer_type': 'GRU'})

    # Make learning datasets
    dataset_train = utilities.get_windowed_dataset(x_train, y_train, WINDOW_SIZE, BATCH_SIZE, shuffle=True)
    dataset_valid = None
    if x_valid is not None:
        dataset_valid = utilities.get_windowed_dataset(x_valid, y_valid, WINDOW_SIZE, BATCH_SIZE, shuffle=False)

    # Define model. The input shape is read from the first training batch,
    # dropping its leading (batch) axis.
    input_shape = next(iter(dataset_train))[0].shape[1:]
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=input_shape),
        tf.keras.layers.GRU(UNITS, return_sequences=True),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.GRU(UNITS, return_sequences=True),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.GRU(UNITS),
        tf.keras.layers.Dense(1, activation='relu'),
    ])
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam())

    # Train model (no extra callbacks; early stopping is not enabled)
    history = model.fit(dataset_train,
                        validation_data=dataset_valid,
                        epochs=EPOCHS,
                        verbose=1,
                        callbacks=[])
    utilities.plot_learning_curves(history)

    # Check predictions. The training dataset above is built with shuffle=True,
    # so predictions made from it would not be guaranteed to line up with the
    # ordered t_train / y_train they are plotted against. Predict on an
    # unshuffled dataset instead, built exactly like the validation one.
    dataset_train_ordered = utilities.get_windowed_dataset(x_train, y_train, WINDOW_SIZE, BATCH_SIZE, shuffle=False)
    y_train_predict = utilities.predict(model, dataset_train_ordered, t_train)
    utilities.plot_predictions(t_train, y_train, y_train_predict, 'train')
    if dataset_valid is not None:
        y_valid_predict = utilities.predict(model, dataset_valid, t_valid)
        utilities.plot_predictions(t_valid, y_valid, y_valid_predict, 'valid')

    return model, history

# Holdout validation

In [None]:
# Forwarded to utilities.split_holdout_validation and logged as the 'split' parameter.
# NOTE(review): presumably the fraction of samples kept for training — confirm.
HOLDOUT_VAL_SPLIT = 0.85

In [None]:
# TRAIN ONLY ONE WIND FARM
# ================================
def train_holdout_validation(wf_num, nested_run=False):
    """Train one wind farm with a single holdout split, inside an MLflow run.

    Args:
        wf_num: Wind farm identifier, forwarded to utilities.extract_wf_data
            and logged as the 'wf' parameter.
        nested_run: Passed as `nested` to mlflow.start_run; set it to True
            when calling from inside an already active run.

    Returns:
        The (model, history) pair produced by train_model.
    """
    with mlflow.start_run(nested=nested_run):
        mlflow.log_params({'wf': wf_num, 'split': HOLDOUT_VAL_SPLIT})

        # Keep only the rows of the requested wind farm
        wf_frame = utilities.extract_wf_data(df, wf_num)

        # Split into train/validation parts, then fit on that split
        holdout_split = utilities.split_holdout_validation(wf_frame, HOLDOUT_VAL_SPLIT, WINDOW_SIZE)
        t_train, x_train, y_train, t_valid, x_valid, y_valid = holdout_split
        fitted_model, fit_history = train_model(t_train, x_train, y_train, t_valid, x_valid, y_valid)
        return fitted_model, fit_history

In [None]:
# Holdout training of a single wind farm (number 3), recorded under the
# 'holdout_validation' experiment.
mlflow.set_experiment('holdout_validation')
model, history = train_holdout_validation(wf_num=3)

In [None]:
# Holdout training of every wind farm found in the data: one parent MLflow run
# wrapping one nested run per wind farm.
mlflow.set_experiment('holdout_validation')
with mlflow.start_run():
    for wf_num in df['WF'].unique():
        train_holdout_validation(wf_num, nested_run=True)

# Forward chaining validation

In [None]:
# Forward chaining parameters
# Both values are forwarded to utilities.split_forward_chaining_validation and
# logged as 'nb_valid' and 'valid_size' respectively.
# NOTE(review): presumably the number of validation folds and the fraction of
# the data used by each fold — confirm against the utilities implementation.
FC_VAL_NB = 4
FC_VAL_SIZE = 0.05

In [None]:
def train_forward_chaining_validation(wf_num):
    """Train one wind farm on every forward-chaining split and log a summary.

    Each split is trained in its own nested MLflow run. The per-split training
    histories are then reduced by utilities.get_mean_std_metrics and logged
    with mlflow.log_metrics once all nested runs have finished.

    Args:
        wf_num: Wind farm identifier, forwarded to utilities.extract_wf_data.
    """
    # Keep only the rows of the requested wind farm and build its splits
    wf_frame = utilities.extract_wf_data(df, wf_num)
    folds = utilities.split_forward_chaining_validation(wf_frame, FC_VAL_SIZE, FC_VAL_NB, WINDOW_SIZE)

    # One nested run (and one model) per split
    fold_histories = []
    for fold in folds:
        t_train, x_train, y_train, t_valid, x_valid, y_valid = fold
        with mlflow.start_run(nested=True):
            _, fit_history = train_model(t_train, x_train, y_train, t_valid, x_valid, y_valid)
            fold_histories.append(fit_history.history)

    # Aggregate the per-split histories and log the result
    summary_metrics = utilities.get_mean_std_metrics(fold_histories)
    mlflow.log_metrics(summary_metrics)

In [None]:
# Forward-chaining validation of every wind farm: one parent run per farm,
# holding the shared parameters, with the per-split runs nested inside it.
mlflow.set_experiment('forward_chaining_validation')
for wf_num in df['WF'].unique():
    with mlflow.start_run():
        parent_params = {
            'wf': wf_num,
            'valid_size': FC_VAL_SIZE,
            'nb_valid': FC_VAL_NB,
            'nlayers': 3,
            'layer_type': 'GRU',
            'units': UNITS,
            'epochs': EPOCHS,
            'parent': True,
        }
        mlflow.log_params(parent_params)
        train_forward_chaining_validation(wf_num)

In [None]:
# Forward-chaining validation of wind farm 4 for several GRU sizes.
wf_num = 4
mlflow.set_experiment('forward_chaining_validation')
# NOTE: train_model reads the notebook-level UNITS, so the loop variable
# rebinds that global on each iteration. UNITS keeps the last value (64)
# once this cell has run, which affects any cell executed afterwards.
for UNITS in [32, 64]:
    with mlflow.start_run():
        # train_model stacks three GRU layers, so 'nlayers' is logged as 3,
        # matching the cell that validates all wind farms.
        mlflow.log_params({'wf': wf_num, 'valid_size': FC_VAL_SIZE, 'nb_valid': FC_VAL_NB, 'nlayers': 3,
                           'layer_type': 'GRU', 'units': UNITS, 'epochs': EPOCHS, 'parent': True})
        train_forward_chaining_validation(wf_num)

# Full training

In [None]:
# TRAIN ALL WIND FARMS AND PREDICT
# ================================
def train_full(wf_num, nested_run=False):
    """Train one wind farm without validation data and predict on its test data.

    Args:
        wf_num: Wind farm identifier, forwarded to utilities.extract_wf_data
            and logged as the 'wf' parameter.
        nested_run: Passed as `nested` to mlflow.start_run; set it to True
            when calling from inside an already active run.

    Returns:
        The test predictions returned by utilities.predict.
    """
    with mlflow.start_run(nested=nested_run):
        mlflow.log_param('wf', wf_num)

        # Keep only the rows of the requested wind farm
        wf_frame = utilities.extract_wf_data(df, wf_num)

        # Fit on the training data; no validation set is supplied
        t_train, x_train, y_train = utilities.get_train_dataset(wf_frame, WINDOW_SIZE)
        fitted_model, _ = train_model(t_train, x_train, y_train, None, None, None)

        # Build the unshuffled test windows (no targets), then predict and plot
        t_test, x_test = utilities.get_test_dataset(wf_frame, WINDOW_SIZE)
        test_windows = utilities.get_windowed_dataset(x_test, None, WINDOW_SIZE, BATCH_SIZE, shuffle=False)
        test_forecast = utilities.predict(fitted_model, test_windows, t_test)
        utilities.plot_predictions(t_test, None, test_forecast, 'test')
        return test_forecast

In [None]:
# Full training of every wind farm under one parent run (one nested run per
# farm), followed by saving the collected test predictions.
mlflow.set_experiment('Full training')
with mlflow.start_run():
    predictions = [
        train_full(wf_num, nested_run=True)
        for wf_num in df['WF'].unique()
    ]
    utilities.save_predictions(predictions)