In [None]:
import timeit
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Optional dependency: seaborn is only used for figure aesthetics, so the
# notebook carries on without it when the import fails.
try:
    import seaborn as sns
    # Base theme: 'paper' context with fonts scaled by 1.2
    sns.set(context='paper', style='whitegrid', font_scale=1.2)
    # Then apply the 'ticks' style with tick marks pointing inwards on both axes
    inward_ticks = {'xtick.direction': 'in', 'ytick.direction': 'in'}
    sns.set_style('ticks', inward_ticks)
except ImportError:
    print('Seaborn not installed. Going without it.')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

## PV Data

MiRIS PV production data at 5-second resolution, from 13/05/2019 to 21/06/2019.

In [None]:
# Load the PV measurements; the first CSV column becomes the index and is
# parsed as dates
pv = pd.read_csv(
    'miris_pv.csv',
    index_col=0,
    parse_dates=True,
)

In [None]:
# Downsample PV from 5-second to 15-minute resolution: each 15-minute bin is
# replaced by the mean of the samples falling into it
pv = (
    pv
    .resample(rule='15min')
    .mean()
)

## Weather Data

15-minute resolution weather data

The file contains forecasts of several weather variables:

    CD = low clouds (0 to 1)
    CM = medium clouds (0 to 1)
    CU = high clouds (0 to 1)
    PREC = precipitation (mm / 15 min)
    RH2m = relative humidity (%)
    SNOW = snow height (mm)
    ST = Surface Temperature (°C)
    SWD = Global Horizontal Irradiance (W/m2)
    SWDtop = Total Solar Irradiance at the top of the atmosphere (W/m2)
    TT2M = temperature 2 meters above the ground (°C)
    WS100m = Wind speed at 100m from the ground (m/s)
    WS10m = Wind speed at 10m from the ground (m/s)

In [None]:
# Load the weather forecasts; the first CSV column becomes the index and is
# parsed as dates
we = pd.read_csv(
    'weather_data.csv',
    index_col=0,
    parse_dates=True,
)

#### Cleaning data

In [None]:
# Dropping SNOW and SWDtop from the dataset.
# The result is reassigned (rather than dropped in place) and columns that are
# already absent are ignored, so re-running this cell does not raise a KeyError.
we = we.drop(columns=['SNOW', 'SWDtop'], errors='ignore')

In [None]:
# Place PV production and weather data side by side in a single dataframe,
# aligned on their indexes
df = pd.concat((pv, we), axis='columns')

In [None]:
# Keep only the rows in which no column has a missing value
df = df.dropna()

In [None]:
# Preview the first five rows of the combined dataframe
df.head(n=5)

All data is numeric, so there is no need to handle text or categorical features.

In [None]:
# Split the dataframe into the target to predict and the model inputs
y = df['PV']                 # Labels: the PV column
X = df.drop(columns=['PV'])  # Features: every remaining column

Currently, the features are not scaled (tree-based models such as random forests do not require feature scaling).

#### Selecting and training a model

In [None]:
# Performing a chronological train-test split: the first 80% of the rows are
# used for training and the last 20% for testing.
# shuffle=False matters because the rows form a time series. The default
# shuffling would leak future samples into the training set, destroy the
# ordering that TimeSeriesSplit relies on during the grid search, and make a
# contiguous slice of the test set no longer correspond to consecutive
# time steps.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False)

In [None]:
# Using Grid Search for tuning Random Forest Regressor hyperparameters.
# Every combination in the grid below is scored on 3 time-series splits of the
# training data, with the work spread over all available cores (n_jobs=-1).
time_start = timeit.default_timer()
parameters = {'n_estimators': [100, 500, 1000], 'max_depth': [1, 3, None]}
# The criterion is left at its default (mean squared error). The explicit
# 'mse' spelling was deprecated in scikit-learn 1.0 and removed in 1.2, while
# the default selects the same criterion on every version.
# A fixed random_state makes the search reproducible between runs.
rfr_gs = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                      param_grid=parameters, cv=TimeSeriesSplit(n_splits=3),
                      n_jobs=-1)
rfr_gs.fit(X_train, y_train)
print(rfr_gs.best_params_)
time_end = timeit.default_timer()
time_elapsed = time_end - time_start
# A timedelta formats as hour:min:sec(.microseconds)
print('Execution time (hour:min:sec): {}'.format(dt.timedelta(seconds=time_elapsed)))

In [None]:
# Choosing the model with the optimal hyperparameters from the Grid Search.
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training set, so that fitted estimator is reused
# instead of training a second forest. This also avoids passing
# criterion='mse', which was removed in scikit-learn 1.2.
rfr = rfr_gs.best_estimator_
rfr

#### Predicting on test set

In [None]:
# Run the fitted forest on the test features to obtain the PV predictions
y_preds = rfr.predict(X=X_test)

In [None]:
# Mean absolute error between the true test labels and the predictions
mean_absolute_error(y_true=y_test, y_pred=y_preds)

In [None]:
# Visualize the predicted labels against the actual ones on a window of the
# test set
start = 1
end = start + 24*4  # 96 samples, i.e. one day's worth of 15-minute samples
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(y_preds[start:end], ls='--', label='predictions')
ax.plot(y_test.iloc[start:end].values, label='actual')
# Title and axis labels let the figure be read on its own
ax.set_title('Predicted vs. actual PV on the test set')
ax.set_xlabel('Sample position within the plotted window')
ax.set_ylabel('PV')
# Enable the grid explicitly; a bare grid() call toggles its current state
ax.grid(True)
ax.legend();