# Initialization

In [2]:
# Google Colab config
# Mount Google Drive inside the Colab VM; the project lives on the drive.
from google.colab import drive
drive.mount('/content/gdrive')
# Work from the project folder: later cells rely on relative paths
# (`import utilities`, 'aws_credentials.csv').
%cd /content/gdrive/MyDrive/wind-power-forecasting-challenge
# Install the extra packages this notebook imports (mlflow, mpld3) plus boto3,
# with urllib3 and folium pinned to specific versions; --quiet trims pip output.
!pip install urllib3==1.25.4 folium==0.2.1 boto3 mlflow mpld3 --quiet

Mounted at /content/gdrive
/content/gdrive/MyDrive/wind-power-forecasting-challenge
[K     |████████████████████████████████| 133kB 16.0MB/s 
[K     |████████████████████████████████| 71kB 8.9MB/s 
[K     |████████████████████████████████| 102kB 12.0MB/s 
[K     |████████████████████████████████| 14.2MB 192kB/s 
[K     |████████████████████████████████| 890kB 51.3MB/s 
[K     |████████████████████████████████| 7.2MB 50.4MB/s 
[K     |████████████████████████████████| 71kB 11.9MB/s 
[K     |████████████████████████████████| 348kB 65.6MB/s 
[K     |████████████████████████████████| 81kB 12.0MB/s 
[K     |████████████████████████████████| 61kB 10.7MB/s 
[K     |████████████████████████████████| 1.1MB 50.6MB/s 
[K     |████████████████████████████████| 153kB 60.4MB/s 
[K     |████████████████████████████████| 163kB 59.6MB/s 
[K     |████████████████████████████████| 133kB 68.0MB/s 
[K     |████████████████████████████████| 92kB 13.1MB/s 
[K     |████████████████████████████

In [3]:
# Load libraries
import pandas as pd
import tensorflow as tf
import importlib
import utilities
import mlflow
import mlflow.tensorflow
import mpld3
import os
import pandas as pd
# Read the credentials file as a Series keyed by its first column, so every
# setting is reachable as an attribute (cred.AWS_URL, ...).
# `read_csv(..., squeeze=True)` is deprecated since pandas 1.4 and removed in 2.0;
# squeezing the resulting one-column frame is the documented replacement and
# behaves the same on older pandas versions.
cred = pd.read_csv('aws_credentials.csv', index_col=0).squeeze('columns')

# Expose the AWS keys through the environment (the credentials file stays out of the notebook).
# NOTE(review): presumably consumed by boto3 for MLflow artifact storage - confirm.
os.environ['AWS_ACCESS_KEY_ID'] = cred.AWS_ACCESS_KEY_ID
os.environ['AWS_SECRET_ACCESS_KEY'] = cred.AWS_SECRET_ACCESS_KEY

# Point MLflow at the remote tracking server using basic-auth credentials embedded in the URL.
# NOTE(review): a username/password containing URL-reserved characters would need
# percent-encoding here - confirm the stored values are URL-safe.
mlflow.set_tracking_uri(f"http://{cred.AWS_USERNAME}:{cred.AWS_PASSWORD}@{cred.AWS_URL}")

# Log Keras training metrics automatically at every iteration, without uploading the models.
mlflow.tensorflow.autolog(every_n_iter=1,log_models=False)

# Render matplotlib figures through mpld3 in the notebook.
mpld3.enable_notebook()

# Data

In [None]:
# Data parameters
# Second argument of utilities.calculate_best_forecasts in the preprocessing cell.
# NOTE(review): the name suggests a memory/decay weight used when combining
# forecasts - confirm its exact meaning in `utilities`.
FORECAST_MEMORY = 0.9

In [None]:
# Load data
# `df` is the notebook-wide dataset: the preprocessing cell and every training
# function below read this global.
df = utilities.load_data()

In [None]:
# Preprocess data
# Each step takes the current `df` and rebinds it, so the order below is the
# pipeline order. Because `df` is overwritten in place, re-running this cell
# without re-running the load cell applies the whole pipeline a second time.
df = utilities.calculate_best_forecasts(df, FORECAST_MEMORY)
df = utilities.interpolate_nans(df)
df = utilities.augment_data(df)
df = utilities.normalize_data(df)

# Model

In [None]:
# Model parameters
# These module-level constants are read directly by train_model and the
# split/dataset helpers; changing one affects every later training cell.
WINDOW_SIZE = 72  # In hours
BATCH_SIZE = 2000  # Passed to utilities.get_windowed_dataset
EPOCHS = 10  # Number of epochs given to model.fit
UNITS = 32  # Units of each GRU layer in train_model

In [None]:
# Training function
def train_model(t_train, x_train, y_train, t_valid, x_valid, y_valid):
    """Train a three-layer GRU regressor on one data split and plot its fit.

    Logs window size, unit count and layer type to the active MLflow run,
    builds windowed datasets with ``utilities.get_windowed_dataset``, fits the
    model for ``EPOCHS`` epochs (MSE loss, Adam optimizer), then plots the
    learning curves and the predictions on the training data and, when
    provided, on the validation data.

    Args:
        t_train, x_train, y_train: Training split, forwarded unchanged to the
            ``utilities`` helpers (``t_train`` is only used for prediction and
            plotting, ``x_train``/``y_train`` for building the datasets).
        t_valid, x_valid, y_valid: Validation split. Pass ``None`` for
            ``x_valid`` to train without validation data.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    mlflow.log_params({'window_size': WINDOW_SIZE, 'units': UNITS, 'layer_type': 'GRU'})

    # Make learning datasets
    dataset_train = utilities.get_windowed_dataset(x_train, y_train, WINDOW_SIZE, BATCH_SIZE, shuffle=True)
    dataset_valid = (
        utilities.get_windowed_dataset(x_valid, y_valid, WINDOW_SIZE, BATCH_SIZE, shuffle=False)
        if x_valid is not None else None
    )

    # Define model; the input shape is read from one batch, dropping the batch dimension.
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=next(iter(dataset_train))[0].shape[1:]),
        tf.keras.layers.GRU(UNITS, return_sequences=True),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.GRU(UNITS, return_sequences=True),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.GRU(UNITS),
        tf.keras.layers.Dense(1, activation='relu')
    ])
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam())

    # Train model.
    # Early stopping is currently disabled; the previously used callback was
    # tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=30).
    history = model.fit(dataset_train,
                        validation_data=dataset_valid,
                        epochs=EPOCHS,
                        verbose=1,
                        callbacks=[])
    utilities.plot_learning_curves(history)

    # Check predictions.
    # The fitting dataset is built with shuffle=True, so predictions drawn from it
    # would not line up with the ordered t_train / y_train they are plotted against.
    # Predict from an unshuffled dataset over the same data instead, exactly as is
    # done for the validation split.
    dataset_train_ordered = utilities.get_windowed_dataset(x_train, y_train, WINDOW_SIZE, BATCH_SIZE, shuffle=False)
    y_train_predict = utilities.predict(model, dataset_train_ordered, t_train)
    utilities.plot_predictions(t_train, y_train, y_train_predict, 'train')
    if dataset_valid is not None:
        y_valid_predict = utilities.predict(model, dataset_valid, t_valid)
        utilities.plot_predictions(t_valid, y_valid, y_valid_predict, 'valid')

    return model, history

# Holdout validation

In [None]:
# Split setting passed to utilities.split_holdout_validation and logged as the 'split' param.
# NOTE(review): presumably the fraction of samples kept for training - confirm in `utilities`.
HOLDOUT_VAL_SPLIT = 0.85

In [None]:
# HOLDOUT VALIDATION FOR A SINGLE WIND FARM
# =========================================
def train_holdout_validation(wf_num, nested_run=False):
    """Train and validate one wind farm with a single holdout split.

    Opens an MLflow run (nested when ``nested_run`` is true), records the wind
    farm number and the split setting, extracts that farm's data from the
    global ``df``, splits it with ``utilities.split_holdout_validation`` and
    hands the six resulting pieces to ``train_model``.

    Returns the ``(model, history)`` pair produced by ``train_model``.
    """
    with mlflow.start_run(nested=nested_run):
        mlflow.log_params({'wf': wf_num, 'split': HOLDOUT_VAL_SPLIT})

        # Data of the requested wind farm only
        wf_frame = utilities.extract_wf_data(df, wf_num)

        # The split comes back as (t_train, x_train, y_train, t_valid, x_valid, y_valid),
        # which is exactly the positional signature of train_model.
        split = utilities.split_holdout_validation(wf_frame, HOLDOUT_VAL_SPLIT, WINDOW_SIZE)
        fitted_model, fit_history = train_model(*split)

    return fitted_model, fit_history

In [None]:
# Train one wind farm
# Runs are recorded under the 'holdout_validation' MLflow experiment.
# NOTE(review): the saved output of this cell is a TypeError ("Could not build a
# TypeSpec for" a DataFrame) raised inside tensorflow's data structure utilities,
# which suggests a DataFrame reaches tf.data unconverted - confirm in
# utilities.get_windowed_dataset.
mlflow.set_experiment('holdout_validation')
model, history = train_holdout_validation(3)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/util/structure.py", line 106, in normalize_element
    spec = type_spec_from_value(t, use_fallback=False)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/util/structure.py", line 480, in type_spec_from_value
    (element, type(element).__name__))
TypeError: Could not build a TypeSpec for          NWP1_T    NWP1_U    NWP1_V  ...   NWP0_WD    NWP0_V    NWP0_T
ID                                   ...                              
12479 -1.615689  1.060732  0.753838  ...  0.502134  0.618127 -1.630719
12480 -1.604885  1.106177  0.770680  ...  0.482312  0.631559 -1.624942
12481 -1.607586  1.011376  0.509652  ...  0.516019  0.549842 -1.626386
12482 -1.621091  0.936556  0.395467  ...  0.574580  0.441365 -1.628775
12483 -1.637298  0.860163  0.230809  ...  0.638724  0.325042 -1.632608
...         ...       ...       ...  ...       ...       ...       ...
17777 -1.182329 

TypeError: ignored

In [None]:
# Train all wind farms
# One parent MLflow run wraps the whole sweep; each wind farm found in the WF
# column is trained in its own nested child run.
mlflow.set_experiment('holdout_validation')
with mlflow.start_run() :
    for wf_num in df.WF.unique(): 
        train_holdout_validation(wf_num, nested_run = True)

# Forward chaining validation

In [None]:
# Forward chaining parameters
# Both values are passed to utilities.split_forward_chaining_validation and
# logged as 'nb_valid' / 'valid_size' on the parent runs.
# NOTE(review): presumably the number of validation folds and the fraction of
# data in each fold - confirm in `utilities`.
FC_VAL_NB = 4
FC_VAL_SIZE = 0.05

In [None]:
def train_forward_chaining_validation(wf_num):
    """Run forward-chaining validation for one wind farm and log aggregate metrics.

    Every split yielded by ``utilities.split_forward_chaining_validation`` is
    trained in its own nested MLflow run. The per-split training histories are
    then reduced with ``utilities.get_mean_std_metrics`` and logged to the run
    that is active when this function is called, so the caller is expected to
    have opened the parent run. Returns nothing.
    """
    # Data of the requested wind farm only
    wf_frame = utilities.extract_wf_data(df, wf_num)

    # One nested run per (t_train, x_train, y_train, t_valid, x_valid, y_valid) split
    fold_histories = []
    folds = utilities.split_forward_chaining_validation(wf_frame, FC_VAL_SIZE, FC_VAL_NB, WINDOW_SIZE)
    for fold in folds:
        t_train, x_train, y_train, t_valid, x_valid, y_valid = fold
        with mlflow.start_run(nested=True):
            _, fold_history = train_model(t_train, x_train, y_train, t_valid, x_valid, y_valid)
            fold_histories.append(fold_history.history)

    # Mean / std of the metrics over all splits, logged on the enclosing run
    summary = utilities.get_mean_std_metrics(fold_histories)
    mlflow.log_metrics(summary)

In [None]:
# Forward-chaining validation of every wind farm.
# Each wind farm gets its own parent run (flagged with 'parent': True) holding
# the configuration; train_forward_chaining_validation adds one nested run per
# split and logs the aggregated metrics on this parent run.
mlflow.set_experiment('forward_chaining_validation')
for wf_num in df.WF.unique():
    with mlflow.start_run():    
        mlflow.log_params({'wf':wf_num,'valid_size':FC_VAL_SIZE,'nb_valid':FC_VAL_NB, 'nlayers':3, 
                        'layer_type':'GRU','units':UNITS,'epochs':EPOCHS, 'parent':True})
        train_forward_chaining_validation(wf_num)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Compare GRU widths on a single wind farm with forward-chaining validation.
wf_num = 4
mlflow.set_experiment('forward_chaining_validation')

# train_model reads the module-level UNITS, so the sweep has to rebind that
# global. Remember the configured value and restore it afterwards (even if a
# run fails); otherwise every later cell, e.g. the full training, would
# silently keep the last value of the sweep.
_configured_units = UNITS
try:
    for UNITS in [32, 64]:
        with mlflow.start_run():
            # 'nlayers' is 3 to match the three stacked GRU layers built by
            # train_model (and the value logged by the all-wind-farms cell).
            mlflow.log_params({'wf': wf_num, 'valid_size': FC_VAL_SIZE, 'nb_valid': FC_VAL_NB, 'nlayers': 3,
                               'layer_type': 'GRU', 'units': UNITS, 'epochs': EPOCHS, 'parent': True})
            train_forward_chaining_validation(wf_num)
finally:
    UNITS = _configured_units

In [None]:
# End the currently active MLflow run, if any.
# NOTE(review): presumably kept as manual cleanup for a run left open by an
# interrupted cell - confirm it is still needed.
mlflow.end_run()

# Full training

In [None]:
# TRAIN ONE WIND FARM WITHOUT VALIDATION AND PREDICT ITS TEST DATA
# ================================================================
def train_full(wf_num, nested_run=False):
    """Fit a model for wind farm ``wf_num`` without validation and predict its test data.

    Inside one MLflow run (nested when ``nested_run`` is true) the function logs
    the wind farm number, trains via ``train_model`` with no validation split,
    builds an unshuffled windowed dataset from the test inputs, predicts on it
    and plots the result.

    Returns the predictions produced by ``utilities.predict`` for the test data.
    """
    with mlflow.start_run(nested=nested_run):
        mlflow.log_param('wf', wf_num)

        # Data of the requested wind farm only
        wf_frame = utilities.extract_wf_data(df, wf_num)

        # Fit on the training data; the three None arguments disable validation
        train_parts = utilities.get_train_dataset(wf_frame, WINDOW_SIZE)
        t_train, x_train, y_train = train_parts
        fitted_model, _ = train_model(t_train, x_train, y_train, None, None, None)

        # Predict on the test data, which has no targets (hence y=None)
        t_test, x_test = utilities.get_test_dataset(wf_frame, WINDOW_SIZE)
        test_windows = utilities.get_windowed_dataset(x_test, None, WINDOW_SIZE, BATCH_SIZE, shuffle=False)
        test_forecast = utilities.predict(fitted_model, test_windows, t_test)
        utilities.plot_predictions(t_test, None, test_forecast, 'test')

    return test_forecast

In [None]:
# Full training of every wind farm under one parent MLflow run; each farm is
# trained in a nested child run and returns its test predictions, which are
# collected in wind-farm order and handed to utilities.save_predictions.
mlflow.set_experiment('Full training')
with mlflow.start_run():
    predictions = [train_full(wf_num, nested_run=True) for wf_num in df.WF.unique()]
    utilities.save_predictions(predictions)

Output hidden; open in https://colab.research.google.com to view.