In [None]:
import timeit
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Seaborn is an optional dependency here: it is only used to restyle the figures.
try:
    import seaborn as sns
    # Seaborn style (figure aesthetics only)
    sns.set(context='paper', style='whitegrid', font_scale=1.2)
    sns.set_style('ticks', {'xtick.direction':'in', 'ytick.direction':'in'})
except ImportError:
    # Only a missing package is tolerated; any other error still propagates.
    print('Seaborn not installed. Going without it.')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
from scipy import stats

## PV Data

MiRIS PV production data at 5-second resolution, from 13/05/2019 to 21/06/2019.

In [None]:
# Load the PV measurements; the first CSV column becomes the index and is parsed as dates
pv = pd.read_csv('miris_pv.csv', index_col=0, parse_dates=True)

In [None]:
# Resample the PV data from 5-second to 15-minute resolution (mean of each bin)
# so that it lines up with the 15-minute weather data it is joined with below.
pv = pv.resample('15min').mean()

## Weather Data

Weather data at 15-minute resolution.

The file contains forecasts of several weather variables:

    CD = low clouds (0 to 1)
    CM = medium clouds (0 to 1)
    CU = high clouds (0 to 1)
    PREC = precipitation (mm / 15 min)
    RH2m = relative humidity (%)
    SNOW = snow height (mm)
    ST = Surface Temperature (°C)
    SWD = Global Horizontal Irradiance (W/m2)
    SWDtop = Total Solar Irradiance at the top of the atmosphere (W/m2)
    TT2M = temperature 2 meters above the ground (°C)
    WS100m = Wind speed at 100m from the ground (m/s)
    WS10m = Wind speed at 10m from the ground (m/s)

In [None]:
# Load the weather forecasts; the first CSV column becomes the index and is parsed as dates
we = pd.read_csv('weather_data.csv', index_col=0, parse_dates=True)

### Cleaning data

In [None]:
# Dropping SNOW and SWDtop from the dataset.
# The result is re-assigned (no inplace=True) and already-missing columns are
# ignored, so re-running this cell does not raise a KeyError.
we = we.drop(columns=['SNOW', 'SWDtop'], errors='ignore')

In [None]:
# Combine PV production and weather data column-wise into one dataframe
# (rows are aligned on the shared time index).
df = pd.concat([pv, we], axis='columns')

In [None]:
# Keep only the timestamps for which every column has a value
df = df.dropna()

In [None]:
# Quick look at the first rows of the joined PV + weather frame
df.head()

In [None]:
# Plot the PV column over the whole joined time range
df[['PV']].plot(figsize=(12,4.5))
plt.show()

### Feature engineering from time-series data

In [None]:
def engineer_features(dataframe, window=24, copy_data=True, resample=True):
    """Build lagged, differenced, rolling and calendar features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a DatetimeIndex (``.hour``, ``.month`` and ``resample``
        are used) that contains at least a ``'PV'`` column.
    window : int, default 24
        Number of lagged copies created per original column, named
        ``<col>_1h`` ... ``<col>_<window>h``. Must be at least 1 because the
        rolling features are computed from ``PV_1h``.
    copy_data : bool, default True
        If True, work on a copy of ``dataframe``. If False and ``resample``
        is also False, the new columns are added to ``dataframe`` itself and
        its NaN rows are dropped in place.
    resample : bool, default True
        If True, aggregate to hourly means before building features
        (this always produces a new frame).

    Returns
    -------
    pandas.DataFrame
        Original columns followed by the lag columns (grouped per original
        column), the ``<col>_diff`` columns, ``roll_mean``, ``roll_max``,
        ``hr_sin``, ``hr_cos``, ``mnth_sin``, ``mnth_cos`` and ``sun``;
        rows containing any NaN are removed.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be >= 1, got {!r}'.format(window))

    # Bind `df` on every path: previously copy_data=False left it undefined
    # and the function failed with UnboundLocalError.
    df = dataframe.copy() if copy_data else dataframe
    if resample:
        # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
        # recent pandas releases.
        df = df.resample('1h').mean()

    # Snapshot the original column names before new columns are appended
    columns = list(df.columns)
    for col in columns:
        for i in range(1, window+1):
            # Shift data by a lag of 1 to `window` rows (hours after resampling)
            df[col+'_{:d}h'.format(i)] = df[col].shift(periods=i)  # time-lag
    for col in columns:
        # NOTE(review): the difference uses the current row's value, and 'PV'
        # is one of the columns. If PV is the prediction target, PV_diff
        # together with PV_1h reveals it -- confirm this is intended.
        df[col+'_diff'] = df[col].diff()  # first-difference

    # Rolling windows (24 rows) on time-shifted PV production
    df['roll_mean'] = df['PV_1h'].rolling(window=24, win_type='hamming').mean()
    df['roll_max'] = df['PV_1h'].rolling(window=24).max()

    # Hour-of-day indicators with cyclical transform
    dayhour_ind = df.index.hour
    df['hr_sin'] = np.sin(dayhour_ind*(2.*np.pi/24))
    df['hr_cos'] = np.cos(dayhour_ind*(2.*np.pi/24))

    # Month indicators with cyclical transform
    month_ind = df.index.month
    df['mnth_sin'] = np.sin((month_ind-1)*(2.*np.pi/12))
    df['mnth_cos'] = np.cos((month_ind-1)*(2.*np.pi/12))

    # Encoding sunshine hours
    # NOTE(review): derived from the current row's PV value, i.e. from the
    # quantity being predicted -- confirm this is intended.
    sun_ind = df['PV'] > 0.
    df['sun'] = sun_ind.astype(int)

    # Drop rows with NaN values (the leading rows lacking lag/rolling history)
    df.dropna(inplace=True)

    return df

In [None]:
# Build the feature table with the default settings
# (24 lags per column, hourly resampling, working on a copy of df)
df2 = engineer_features(df)
df2.head()

### Train, validation, and test datasets (time-series data)

In [None]:
weather_forecast = False

# The target is the PV column in both configurations
y = df2['PV']

if weather_forecast:
    # Hour-ahead weather forecast is being utilized:
    # keep the current-hour weather columns among the predictors
    X = df2.drop(columns=['PV'])
else:
    # Hour-ahead weather forecast is NOT being utilized:
    # remove the current-hour weather columns together with the target
    X = df2.drop(columns=['PV', 'CD', 'CM', 'CU', 'PREC', 'RH2m',
                          'ST', 'SWD', 'TT2M', 'WS100m', 'WS10m'])

In [None]:
# Train and test dataset split (w/o shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)
print(X_train.shape, X_test.shape)

#### MODEL: Pipeline from SelectKBest and RandomForest

In [None]:
# Pipeline: SelectKBest and RandomForest
# SelectKBest is used for features reduction
SEED = 42  # fixed seed for the forest and for the parameter sampling
# NOTE: mutual_info_regression is left unseeded here, so feature scores
# (and therefore the selected features) may still vary slightly between runs.
selectbest = SelectKBest(score_func=mutual_info_regression, k='all')
# The split criterion is left at its default (mean squared error): the
# explicit 'mse' spelling is rejected by recent scikit-learn releases, while
# the default works on old and new versions alike.
forest = RandomForestRegressor(bootstrap=True, random_state=SEED)
# Creating a pipeline
pipe = Pipeline(steps=[('preprocess', 'passthrough'),
                       ('kbest', selectbest),
                       ('forest', forest)])
# Parameters of pipeline for the randomized search with cross-validation
param_dists = {'preprocess': [None, StandardScaler()],
               'kbest__k': stats.randint(low=32, high=128),
               'forest__n_estimators': stats.randint(low=200, high=1000),
               'forest__max_depth': [1, 3, 5, None],
               'forest__max_samples': stats.uniform(loc=0.2, scale=0.8),
               }
NITER = 100  # number of random search iterations
time_start = timeit.default_timer()
# TimeSeriesSplit keeps every validation fold later in time than its training fold
search = RandomizedSearchCV(estimator=pipe, param_distributions=param_dists,
                            cv=TimeSeriesSplit(n_splits=3),
                            scoring='neg_mean_squared_error',
                            n_iter=NITER, refit=True, n_jobs=-1,
                            random_state=SEED)
search.fit(X_train, y_train)
time_end = timeit.default_timer()
time_elapsed = time_end - time_start
print('Execution time (hour:min:sec): {}'.format(str(dt.timedelta(seconds=time_elapsed))))
print('Best parameter (CV score = {:.3f}):'.format(search.best_score_))
print(search.best_params_)

In [None]:
# Feature importance analysis with random forests
best_params = {'n_estimators': search.best_params_['forest__n_estimators'],
               'max_depth': search.best_params_['forest__max_depth'],
               'max_samples': search.best_params_['forest__max_samples'],
               }
forest = RandomForestRegressor(criterion='mse', **best_params)
forest.fit(X_train, y_train)

In [None]:
TOP = 15
# Express importances relative to the strongest feature (= 100)
feature_importance = forest.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Indices of the TOP most important features, in ascending order of importance
sorted_idx = np.argsort(feature_importance)[-TOP:]
pos = np.arange(sorted_idx.shape[0]) + .25
top_scores = feature_importance[sorted_idx]
top_names = X_train.columns[sorted_idx]

# Plot relative feature importance
fig, ax = plt.subplots(figsize=(7,5))
ax.barh(pos, top_scores, align='center', color='magenta', alpha=0.6)
plt.yticks(pos, top_names)
ax.set_xlabel('Feature Relative Importance')
ax.grid(axis='x')
plt.tight_layout()
plt.show()

#### Prediction

In [None]:
# Quick look at the first rows of the held-out feature matrix
X_test.head()

In [None]:
# Make single-step predictions for 24 hours ahead.
# The DataFrame slice (not .values) is passed so that the column names seen
# during fitting are preserved at prediction time.
y_preds = search.predict(X_test.iloc[:24])

In [None]:
# Errors of the single-step predictions over the first 24 test hours
y_true_24h = y_test.values[:24]
mse = mean_squared_error(y_true_24h, y_preds)
mae = mean_absolute_error(y_true_24h, y_preds)
print('MSE:', mse)
print('MAE:', mae)

In [None]:
# True vs. predicted PV power for the first 24 test hours
first_day = y_test.index[:24]
mae_label = 'MAE: {:.3f}'.format(mae)

plt.figure(figsize=(6,4))
plt.plot(first_day, y_test.values[:24], lw=2, label='true values')
plt.plot(first_day, y_preds, ls='--', lw=1.5, marker='+', ms=10, label='predictions')
# Annotate the error near the right-hand side of the time axis
plt.text(y_test.index[20], 0.35, mae_label,
         horizontalalignment='center', fontweight='bold')
plt.legend(loc='upper right')
plt.grid(axis='y')
plt.xticks(rotation=45)
plt.xlabel('Day/Hour')
plt.ylabel('PV power')
plt.show()

### Walk-forward multi-step prediction

In [None]:
WALK = 12  # walk-forward for WALK hours (number of consecutive starting rows of X_test)
STEP = 24  # multi-step predict for STEP hours ahead (predictions made from each starting row)

In [None]:
# With STEP=24 and WALK=12, we make a 24-hour-ahead prediction after each
# hour and move forward in time for 12 hours in total. In other words, we
# walk forward for 12 hours and, each time we move forward (by one hour),
# we make a brand-new 24-hour-ahead prediction. Predicted values are used
# as past observations when making new predictions. Hence, the further we
# move away in time from the present moment, the more we rely on predicted
# values to make new predictions!

def walk_forward(X_values, y_predicted, window=24, n_vars=11, n_rest=18):
    """Advance one feature vector by a single time step.

    The vector is expected to be laid out as ``n_vars`` consecutive blocks of
    ``window`` lagged values (lag 1 first), followed by ``n_rest`` other
    features at the end. The first block is treated as the lagged PV
    production, the remaining blocks as lagged weather variables.

    Parameters
    ----------
    X_values : array-like, 1-D
        Current feature vector.
    y_predicted : float
        Prediction for the current step; becomes the new lag-1 PV value.
    window : int, default 24
        Number of lags stored per variable.
    n_vars : int, default 11
        Number of lagged variables (PV plus 10 weather variables by default).
    n_rest : int, default 18
        Number of trailing non-lag features.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_vars * window + n_rest``.

    Raises
    ------
    ValueError
        If ``X_values`` is not 1-D or is too short for the requested layout.
    """
    X_values = np.asarray(X_values)
    n_lagged = n_vars * window
    if X_values.ndim != 1 or X_values.shape[0] < n_lagged + n_rest:
        raise ValueError(
            'X_values must be 1-D with at least {:d} elements'.format(n_lagged + n_rest))

    X_parts = []
    for i in range(n_vars):
        X_part = pd.Series(X_values[i*window:(i+1)*window])
        if i == 0:
            # time-shifted PV production: the new prediction becomes lag 1
            X_part = X_part.shift(periods=1, fill_value=y_predicted)
        else:
            # time-shifted weather features: no new observation is available,
            # so the vacated lag-1 slot is back-filled with the value now at lag 2
            X_part = X_part.shift(periods=1).bfill()
        X_parts.append(X_part.values)

    # Other features are carried over unchanged.
    # Slicing from an explicit start index keeps n_rest=0 working.
    X_rest = X_values[X_values.shape[0] - n_rest:]
    return np.concatenate(X_parts + [X_rest])

def plot_predictions(walk, y_test, y_pred):
    """Plot one multi-step forecast against the true values.

    Parameters
    ----------
    walk : int
        Zero-based position in ``y_test`` at which the forecast starts;
        shown in the title as ``walk + 1``.
    y_test : pandas.Series
        True target values (``.values`` is sliced positionally).
    y_pred : sequence of float
        Predicted values; its length defines the plotted horizon, so the
        function no longer depends on a global STEP constant.
    """
    horizon = len(y_pred)
    y_true = y_test.values[walk:walk+horizon]
    mae = mean_absolute_error(y_true, y_pred)

    plt.figure(figsize=(6,4))
    plt.title('walk forward +{:2d} hours'.format(walk+1))
    plt.plot(y_true, lw=2.5, label='true values')
    plt.plot(y_pred, ls='--', lw=1.5, marker='+', ms=10, label='predictions')
    # Error annotation, right-aligned two steps before the end of the horizon
    plt.text(horizon-2, 0.35, 'MAE: {:.3f}'.format(mae),
             horizontalalignment='right',
             fontweight='bold')
    plt.legend(loc='upper right')
    plt.ylim(top=0.5)
    plt.grid(axis='y')
    plt.xlabel('Hour')
    plt.ylabel('PV power')
    plt.show()

In [None]:
# Column names seen by the model during fitting; reused for every one-row frame
feature_names = X_test.columns

for k in range(WALK):
    # Start from the k-th test row
    X_test_values = X_test.values[k,:]
    y_pred_values = []
    for i in range(STEP):
        # Predict next time-step value. The vector is wrapped in a one-row
        # DataFrame so the estimator receives the same feature names it was
        # fitted with.
        X_step = pd.DataFrame(X_test_values.reshape(1,-1), columns=feature_names)
        y_predict = search.predict(X_step)[0]
        y_pred_values.append(y_predict)
        # Walk-forward for a single time step (the prediction becomes lag-1 PV)
        X_test_values = walk_forward(X_test_values, y_predict)
    # Plot walk-forward predictions against true values
    plot_predictions(k, y_test, y_pred_values)