In [None]:
import timeit
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
try:
    import seaborn as sns
except ImportError:
    # Seaborn is optional; without it the default matplotlib look is used
    print('Seaborn not installed. Going without it.')
else:
    # Seaborn style (figure aesthetics only)
    sns.set(context='paper', style='whitegrid', font_scale=1.2)
    sns.set_style('ticks', {'xtick.direction': 'in', 'ytick.direction': 'in'})

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.pipeline import Pipeline

In [None]:
from scipy import stats
from scipy.stats import uniform

## PV Data

MiRIS PV production data at 5-second resolution, from 13/05/2019 to 21/06/2019.

In [None]:
# Load the PV measurements; the first CSV column becomes the index and is parsed as dates
pv = pd.read_csv('miris_pv.csv', index_col=0, parse_dates=True)

In [None]:
# Resampling the dataset from 5-seconds to 15-minutes resolution by averaging
# all samples that fall into each 15-minute bin
pv = pv.resample('15min').mean()

## Weather Data

15-minute resolution weather data

The file contains forecasts of several weather variables:

    CD = low clouds (0 to 1)
    CM = medium clouds (0 to 1)
    CU = high clouds (0 to 1)
    PREC = precipitation (mm / 15 min)
    RH2m = relative humidity (%)
    SNOW = snow height (mm)
    ST = Surface Temperature (°C)
    SWD = Global Horizontal Irradiance (W/m2)
    SWDtop = Total Solar Irradiance at the top of the atmosphere (W/m2)
    TT2M = temperature 2 meters above the ground (°C)
    WS100m = Wind speed at 100m from the ground (m/s)
    WS10m = Wind speed at 10m from the ground (m/s)

In [None]:
# Load the weather forecasts; the first CSV column becomes the index and is parsed as dates
we = pd.read_csv('weather_data.csv', index_col=0, parse_dates=True)

#### Cleaning data

In [None]:
# Dropping SNOW and SWDtop from the dataset.
# `we` is rebound (no in-place mutation) and errors='ignore' is used so that
# re-running this cell after the columns are already gone does not raise.
we = we.drop(columns=['SNOW', 'SWDtop'], errors='ignore')

In [None]:
# Joining pv production and weather data into single dataframe.
# Columns are placed side by side and rows are aligned on the index; an index
# value present in only one of the two frames yields NaN in the other's columns.
df = pd.concat([pv, we], axis=1)

In [None]:
# Keep only the rows in which every column has a value
df = df.dropna()

In [None]:
# Preview the first rows of the merged PV + weather dataframe
df.head()

In [None]:
# Quick look at the PV series over the whole period
df['PV'].plot()
plt.show()

In [None]:
def engineer_features(dataframe, resample=False):
    """Build lagged, differenced and calendar features from a time-indexed frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data with a DatetimeIndex and a 'PV' column (the forecasting target).
        The input frame is copied and never modified.
    resample : bool, default False
        If True, the data is first averaged into 1-hour bins.

    Returns
    -------
    pandas.DataFrame
        The (optionally resampled) input columns, followed for each of them
        by 24 lagged copies ('<col>_1h' ... '<col>_24h') and a first
        difference ('<col>_diff'), then 'hr_sin', 'hr_cos', 'mnth_sin',
        'mnth_cos' and 'sun'. Rows containing any NaN are dropped.

    Notes
    -----
    * Lags are shifts by 1..24 *rows*; the 'h' suffix literally means hours
      only when the frame has hourly resolution (e.g. ``resample=True``).
    * Every feature derived from 'PV' uses past values only: 'PV_diff' is the
      last completed change (PV[t-1] - PV[t-2]) and 'sun' tells whether
      PV[t-1] was positive. Computing them from PV[t] would leak the target
      into the inputs, because every non-'PV' column is used as a feature.
    """
    df = dataframe.copy()
    if resample:
        # Lowercase 'h': the uppercase hourly alias is deprecated in recent pandas
        df = df.resample('1h').mean()

    target = 'PV'
    engineered = {}

    # Time-lag and difference features for every input column
    for col in list(df.columns):
        series = df[col]
        for lag in range(1, 25):
            engineered['{}_{:d}h'.format(col, lag)] = series.shift(periods=lag)
        change = series.diff()
        if col == target:
            # The current difference contains PV[t]; use the previous one instead
            change = change.shift(periods=1)
        engineered['{}_diff'.format(col)] = change

    # Hour-of-day indicators with cyclical transform
    hour = np.asarray(df.index.hour)
    engineered['hr_sin'] = np.sin(hour * (2. * np.pi / 24))
    engineered['hr_cos'] = np.cos(hour * (2. * np.pi / 24))

    # Month indicators with cyclical transform
    month = np.asarray(df.index.month)
    engineered['mnth_sin'] = np.sin((month - 1) * (2. * np.pi / 12))
    engineered['mnth_cos'] = np.cos((month - 1) * (2. * np.pi / 12))

    # Encoding sunshine hours from the last observed production value
    engineered['sun'] = (df[target].shift(periods=1) > 0.).astype(int)

    # Attach all new columns in one go: inserting hundreds of columns one by
    # one fragments the frame and triggers pandas' PerformanceWarning.
    engineered = pd.DataFrame(engineered, index=df.index)
    clashing = engineered.columns.intersection(df.columns)
    df = pd.concat([df.drop(columns=clashing), engineered], axis=1)

    # Drop rows with NaN values (mainly the leading rows lost to the lags)
    df = df.dropna()

    return df

In [None]:
# Build the feature matrix; resample=True averages the data to hourly
# resolution before the features are engineered
dfr = engineer_features(df, resample=True)
dfr.head()

In [None]:
def time_series_data_split(df, start_date, window_days, train_percent=0.8, print_dates=False):
    """Split a time-indexed frame into consecutive train/validation/test sets.

    The dataset window of `window_days` days starting at `start_date` is
    divided, preserving time ordering, into a training period followed by a
    validation period; the single day right after the window is the testing
    period. Every period is half-open, [start, end), so no sample belongs to
    two sets and training + validation span exactly `window_days` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a 'PV' column (the target); all other columns are features.
        Expected to carry a timezone-aware DatetimeIndex, since the period
        bounds are UTC timestamps.
    start_date : str or datetime-like
        Start of the training period, parsed with ``pd.to_datetime(utc=True)``.
    window_days : int
        Width in days of the window split into training and validation sets.
    train_percent : float, default 0.8
        Fraction of the window used for training.
    print_dates : bool, default False
        If True, print the first and last date covered by each period.

    Returns
    -------
    X_train, y_train : numpy.ndarray
        Features and target of the training period.
    X_test, y_test : numpy.ndarray
        Features and target of the validation period.
    X_new, y_true : numpy.ndarray
        Features and target of the testing period (one day after the window).
    """
    features = [col for col in df.columns if col != 'PV']

    # round() instead of int(): a product such as (1 - 0.8) * 30 evaluates to
    # 5.999... in floating point and int() would silently truncate it to 5.
    train_days = int(round(train_percent * window_days))
    valid_days = window_days - train_days

    # Period bounds; each end bound is exclusive
    st = pd.to_datetime(start_date, utc=True)   # training
    et = st + pd.Timedelta(days=train_days)
    sv = et                                     # validation
    ev = sv + pd.Timedelta(days=valid_days)
    sn = ev                                     # testing (one day after)
    en = sn + pd.Timedelta(days=1)

    def _select(start, end):
        # Rows with start <= timestamp < end
        part = df.loc[(df.index >= start) & (df.index < end)]
        return part[features].values, part['PV'].values

    X_train, y_train = _select(st, et)
    X_test, y_test = _select(sv, ev)    # validation
    X_new, y_true = _select(sn, en)

    if print_dates:
        # Show the last date actually covered by each half-open period
        step = pd.Timedelta(seconds=1)
        print('  Training period:', st.date(), '=>', (et - step).date())
        print('Validation period:', sv.date(), '=>', (ev - step).date())
        print('   Testing period:', sn.date(), '=>', (en - step).date())

    return X_train, y_train, X_test, y_test, X_new, y_true

In [None]:
# Dataset window handed to the splitter
start_date = '2019-05-15'  # first day of the training period
window_days = 30  # days shared between the training and validation sets
(X_train, y_train,
 X_test, y_test,
 X_new, y_true) = time_series_data_split(
    dfr, start_date, window_days, print_dates=True)

In [None]:
# Pipeline: SelectKBest and RandomForest
# SelectKBest is used for features reduction
selectbest = SelectKBest(score_func=mutual_info_regression, k='all')
# The split criterion is left at its default (squared error): its explicit
# name changed from 'mse' to 'squared_error' across scikit-learn releases,
# so passing either spelling fails on some versions.
forest = RandomForestRegressor()
# Creating a pipeline
pipe = Pipeline(steps=[('kbest', selectbest), ('forest', forest)])
# Parameters of pipeline for the randomized search with cross-validation
# (kbest__k is sampled up to 127, so X_train needs at least that many columns)
param_dists = {'kbest__k': stats.randint(low=32, high=128),
               'forest__n_estimators': stats.randint(low=200, high=1000),
               'forest__max_depth': [1, 3, 5, None]}
NITER = 100  # number of random search iterations
time_start = timeit.default_timer()
# TimeSeriesSplit keeps every validation fold after its training fold in time
search = RandomizedSearchCV(estimator=pipe, param_distributions=param_dists,
                            cv=TimeSeriesSplit(n_splits=3),
                            scoring='neg_mean_squared_error',
                            n_iter=NITER, refit=True, n_jobs=-1)
search.fit(X_train, y_train)
time_end = timeit.default_timer()
time_elapsed = time_end - time_start
print('Execution time (hour:min:sec): {}'.format(str(dt.timedelta(seconds=time_elapsed))))
print('Best parameter (CV score = {:.3f}):'.format(search.best_score_))
print(search.best_params_)

In [None]:
# Random Forest with grid-search hyperparameter optimization
parameters = {'n_estimators':[250, 500, 1000],
              'max_depth':[1, 5, None]}
time_start = timeit.default_timer()
# The forest uses its default split criterion (squared error); the explicit
# 'mse' spelling is no longer accepted by current scikit-learn releases.
search = GridSearchCV(estimator=RandomForestRegressor(),
                      param_grid=parameters, cv=TimeSeriesSplit(n_splits=3),
                      scoring='neg_mean_squared_error',
                      refit=True, n_jobs=-1)
search.fit(X_train, y_train)
time_end = timeit.default_timer()
time_elapsed = time_end - time_start
print('Execution time (hour:min:sec): {}'.format(str(dt.timedelta(seconds=time_elapsed))))
print('Best parameter (CV score = {:.3f}):'.format(search.best_score_))
print(search.best_params_)

In [None]:
# Feature importance analysis with random forests.
# A fresh forest is trained with the best hyperparameters found by the search;
# the split criterion stays at its default (squared error) because the 'mse'
# spelling is no longer accepted by current scikit-learn releases.
forest = RandomForestRegressor(**search.best_params_)
forest.fit(X_train, y_train)

In [None]:
TOP = 10  # number of most important features to display
# Names of the model inputs in the column order of X_train: every column of
# `dfr` except the target 'PV'. Indexing `dfr.columns` directly would shift
# the labels, because importances are computed over the feature columns only.
feature_names = np.array([col for col in dfr.columns if col != 'PV'])
feature_importance = forest.feature_importances_
# Express importances relative to the most important feature (= 100)
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Indices of the TOP largest importances, in ascending order
sorted_idx = np.argsort(feature_importance)[-TOP:]
pos = np.arange(sorted_idx.shape[0]) + .25
# Plot relative feature importance
fig, ax = plt.subplots(figsize=(7,4))
ax.barh(pos, feature_importance[sorted_idx], align='center', color='magenta', alpha=0.6)
plt.yticks(pos, feature_names[sorted_idx])
ax.set_xlabel('Feature Relative Importance')
ax.grid(axis='x')
plt.tight_layout()
plt.show()

In [None]:
# Make predictions for new data: the testing-period features X_new are passed
# to the fitted search object, which was created with refit=True
y_pred = search.predict(X_new)

In [None]:
# Forecast against the measured values over the testing period
fig, ax = plt.subplots()
ax.plot(y_pred, label='prediction')
ax.plot(y_true, lw=2, label='true value')
ax.legend()
ax.grid(axis='y')
ax.set_xlabel('Hours')
plt.show()

# Continuation (Nastavak)

All data are numeric, so there is no need to handle text or categorical features.

In [None]:
# Split the dataframe into the target series and the input columns
y = df['PV']  # Labels
X = df.drop(columns='PV')  # Features

Currently, the features are not scaled (random forests, used below, are insensitive to feature scaling).

#### Selecting and training a model

In [None]:
# Performing a chronological train-test split.
# shuffle=False keeps the time ordering: the last 20% of the samples form the
# test set, so the model is never trained on observations recorded after the
# ones it is evaluated on, and the TimeSeriesSplit cross-validation used for
# tuning receives the training data in temporal order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Using Grid Search for tuning Random Forest Regressor hyperparameters
time_start = timeit.default_timer()
parameters = {'n_estimators': [100, 500, 1000], 'max_depth': [1, 3, None]}
# The forest uses its default split criterion (squared error); the explicit
# 'mse' spelling is no longer accepted by current scikit-learn releases.
# No `scoring` is given, so the estimator's default score is used for ranking.
rfr_gs = GridSearchCV(estimator=RandomForestRegressor(),
                     param_grid=parameters, cv=TimeSeriesSplit(n_splits=3),
                     n_jobs=-1)
rfr_gs.fit(X_train, y_train)
# Stop the clock right after the fit so only the search itself is timed
time_end = timeit.default_timer()
time_elapsed = time_end - time_start
print(rfr_gs.best_params_)
print('Execution time (hour:min:sec): {}'.format(str(dt.timedelta(seconds=time_elapsed))))

In [None]:
# Choosing optimal hyperparameters from Grid Search output.
# The split criterion stays at its default (squared error) because the 'mse'
# spelling is no longer accepted by current scikit-learn releases.
rfr = RandomForestRegressor(**rfr_gs.best_params_)
rfr.fit(X_train, y_train)

#### Predicting on test set

In [None]:
# Predict the target for the held-out test features with the tuned forest
y_preds = rfr.predict(X_test)

In [None]:
# Mean absolute error (MAE) of the predictions on the test set
mean_absolute_error(y_test, y_preds)

In [None]:
# Predicted vs. actual labels for a slice of the test set
start = 1
end = start + 24*4  # 96 samples, i.e. one day at 15-minute resolution
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(y_preds[start:end], ls='--', label='predictions')
ax.plot(y_test.iloc[start:end].values, label='actual')
ax.grid()
ax.legend();