# Wind Generation Data EDA and Forecasting

The notebook below explores a dataset of wind generation reported by the four German Transmission System Operators (TSOs).


![German TSOs](https://www.cleanenergywire.org/sites/default/files/resize/styles/large/public/images/factsheet/130514-regelzonen-nep-800x535.jpg?itok=RE8S7NLK)

*Image credit: https://www.cleanenergywire.org*

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# File stems of the per-TSO CSV exports; each is read from `infileloc` + stem + '.csv'.
names = ['TenneTTSO','50Hertz','TransnetBW','Amprion']
infileloc = '/kaggle/input/wind-power-generation/'

# One raw frame per TSO, in the same order as `names`.
# `infer_datetime_format=True` was dropped: no `parse_dates` is requested, so it
# had no effect on the parsed result (and newer pandas releases warn about it).
# The date strings are parsed later, with an explicit format, in return_ts_df.
wind_data_df = [pd.read_csv(infileloc + filename + '.csv') for filename in names]

wind_data_df[0]

In [None]:
#Create a function to go through each df in wind data and return a time series dataframe in long format since they are currently in wide format

def return_ts_df(df, TSOname):
    """
    Reshape one wide table (a 'Date' column plus one column per time of day)
    into a long, chronologically sorted time series.

    df      : wide DataFrame with a 'Date' column; every other column name is a
              time string that combines with 'Date' as '%d/%m/%Y %H:%M:%S'
    TSOname : name given to the value column of the result

    returns : DataFrame with columns ['Dates', TSOname], sorted by 'Dates',
              with a fresh integer index
    """
    # wide -> long: one row per (Date, Time) pair
    long_df = pd.melt(df, id_vars='Date', var_name='Time', value_name=TSOname)

    # glue date and time text together, then parse with the explicit format
    stamp_text = long_df['Date'].astype(str) + ' ' + long_df['Time'].astype(str)
    long_df['Dates'] = pd.to_datetime(stamp_text, format='%d/%m/%Y %H:%M:%S')

    # keep only timestamp + value, order by time, and hand back a plain column layout
    series_df = long_df[['Dates', TSOname]].set_index('Dates').sort_index()
    return series_df.reset_index(drop=False)

# Convert each raw wide table (same order as it was loaded) into its own
# long-format series, naming the value column after the TSO.
ten_df, fiftyHz_df, transnet_df, amprion_df = (
    return_ts_df(wide_frame, tso_label)
    for wide_frame, tso_label in zip(
        wind_data_df[:4],
        ['TennetTSO', '50Hertz', 'TransnetBW', 'Amprion'],
    )
)


In [None]:
ten_df[:10]

In [None]:
# print(ten_df.info())

# Summary statistics for every TSO series, printed one after another
for tso_frame in (ten_df, fiftyHz_df, transnet_df, amprion_df):
    print(tso_frame.describe())

In [None]:
from functools import reduce

#combine dataframes

dataframes = [ten_df,fiftyHz_df,transnet_df,amprion_df]

# Fold the frames together left to right with an outer join on the timestamp,
# so a timestamp present in any one series is kept in the combined table.
wind_data_all = dataframes[0]
for tso_frame in dataframes[1:]:
    wind_data_all = pd.merge(wind_data_all, tso_frame, on='Dates', how='outer')

wind_data_all

In [None]:
wind_data_all.info()

# Resample to Monthly wind data

In [None]:
# Monthly wind data: bucket rows by calendar month of 'Dates' and average each TSO column.
# The result is indexed by the month bucket instead of carrying a 'Dates' column.
# NOTE(review): recent pandas releases reportedly prefer the 'ME' alias over 'M' for
# resampling - confirm against the pandas version this notebook runs on.
wind_data_all_M = wind_data_all.resample('M', on='Dates').mean()
wind_data_all_M


In [None]:
%matplotlib inline

# Apply the white-grid look through seaborn itself rather than through the
# matplotlib style name 'seaborn-whitegrid', which recent matplotlib releases
# no longer ship under that name.
sns.set_style('whitegrid')

# relplot is a figure-level function: it always builds its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# The intended 15 x 12 inch size is requested through height/aspect instead.
sns.relplot(data=wind_data_all_M, kind="line", height=12, aspect=15 / 12)

# plt.* acts on the current axes, i.e. the one relplot just created
plt.title("Total Monthly Wind Generation in Terra Watt Hour")
plt.xlabel("Date")
plt.ylabel("TWH")
plt.show()

# Daily data



In [None]:
#Daily wind data
wind_data_all_D = wind_data_all.resample('D', on='Dates').mean()
wind_data_all_D

In [None]:
# Bring 'Dates' back as a regular column so its datetime accessors can be used.
wind_data_all_D.reset_index(inplace=True)

day_stamps = wind_data_all_D['Dates'].dt
wind_data_all_D['month'] = day_stamps.month
wind_data_all_D['dayofweek_name'] = day_stamps.day_name()

# Seasons in Germany: spring = March-May, summer = June-August,
# autumn = September-November, winter = December-February.
# `seasons` lists the season of each month, January first.
seasons = ['Winter','Winter','Spring','Spring','Spring','Summer','Summer','Summer','Autumn','Autumn','Autumn','Winter']
month_to_season = dict(enumerate(seasons, start=1))

wind_data_all_D['Season'] = wind_data_all_D['month'].map(month_to_season)
wind_data_all_D

In [None]:
# All four daily series share one axis, each with a fixed colour.
fig, ax = plt.subplots(figsize=(15,9))

tso_colors = {'TennetTSO': 'r', '50Hertz': 'b', 'TransnetBW': 'g', 'Amprion': 'y'}

# Every line carries its own label, so the legend no longer depends on the
# order in which artists happen to be stored on the axes (a bare list of names
# passed to ax.legend is matched to artists purely by position).
for tso_name, line_color in tso_colors.items():
    sns.lineplot(x="Dates",
                 y=tso_name,
                 data=wind_data_all_D,
                 color=line_color,
                 label=tso_name,
                 dashes=False,
                 ax=ax)

# Rebuild the legend from the labelled lines, on a white background
ax.legend(facecolor='w')
plt.title("Total Daily Wind Generation in Terra Watt Hour")
plt.xlabel("Date")
plt.ylabel("TWH")
plt.show()

In [None]:
wind_data_all_D

In [None]:
wind_data_all_D.set_index('Dates',inplace=True)
wind_data_all_D.drop(['month'],axis=1, inplace=True)

### Let's look at the wind generation profile for one of the seasons

In [None]:
# Restrict the daily table to winter days and draw its columns as stacked
# subplots that share one x-axis.
winter_days = wind_data_all_D.loc[wind_data_all_D['Season'] == 'Winter']
winter_days.plot(subplots=True, sharex=True, figsize=(10, 10))
# Swap 'Winter' for 'Spring' (or another season) above to inspect a different season.

plt.show()

**Interesting tidbit from a google search:**

> California is approximately 403,882 sq km, while Germany is approximately 357,022 sq km, making Germany 88.4% the size of California. Meanwhile, the population of California is ~37.3 million people (42.9 million more people live in Germany).

Germany is a comparatively small country, which may be why wind generation in all four TSO areas appears to follow similar profiles.


# Hourly data

In [None]:
# Hourly wind data: average the readings that fall inside each hour.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in newer pandas releases.
wind_data_all_H = wind_data_all.resample('h', on='Dates').mean()
wind_data_all_H.reset_index(inplace=True)

# Save an independent copy of this dataframe for modeling later
ts = wind_data_all_H.copy(deep=True)

# Display the hourly table. In the original cell this expression sat in the
# middle of the cell, where a notebook does not render it.
wind_data_all_H

In [None]:
wind_data_all_H['month'] = wind_data_all_H['Dates'].dt.month
wind_data_all_H['dayofweek_name'] = wind_data_all_H['Dates'].dt.day_name()
wind_data_all_H.head()

In [None]:
#using seasons and month to season mapping from earlier

wind_data_all_H['Season'] = wind_data_all_H['Dates'].dt.month.map(month_to_season)

In [None]:
wind_data_all_H[500:505]

In [None]:
#Recover default matplotlib settings 
import matplotlib as mpl
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter

# Use white grid plot background from seaborn (notebook context, fonts scaled by 1.5)
sns.set(context='notebook',style='whitegrid', font_scale=1.5)

# Overwrite the current rcParams with matplotlib's default values.
# NOTE(review): this runs right after sns.set(...), so it presumably discards the
# seaborn theme chosen on the previous statement - confirm the intended order.
mpl.rcParams.update(mpl.rcParamsDefault)
%matplotlib inline

In [None]:
# wind_data_all_H.plot()

# All four hourly series share one wide axis, each with a fixed colour.
fig, ax = plt.subplots(figsize=(20,9))

tso_colors = {'TennetTSO': 'r', '50Hertz': 'b', 'TransnetBW': 'g', 'Amprion': 'y'}

# Every line carries its own label, so the legend no longer depends on the
# order in which artists happen to be stored on the axes (a bare list of names
# passed to ax.legend is matched to artists purely by position).
for tso_name, line_color in tso_colors.items():
    sns.lineplot(x="Dates",
                 y=tso_name,
                 data=wind_data_all_H,
                 color=line_color,
                 label=tso_name,
                 dashes=False,
                 ax=ax)

# Rebuild the legend from the labelled lines, on a white background
ax.legend(facecolor='w')
plt.title("Total Hourly Wind Generation in Terra Watt Hour")
plt.xlabel("Date")
plt.ylabel("TWH")
plt.show()

In [None]:
# Hourly TennetTSO generation, with the line coloured by season.
fig, ax = plt.subplots(figsize=(20,9))

season_plot_args = dict(x="Dates", y="TennetTSO", hue='Season', dashes=False)
sns.lineplot(data=wind_data_all_H, ax=ax, **season_plot_args)

plt.title("Total Hourly Wind Generation in Terra Watt Hour")
plt.xlabel("Date")
plt.ylabel("TWH")
plt.show()

### Smoothing

In [None]:
!pip install tsmoothie

In [None]:
from tsmoothie.smoother import *
from tsmoothie.utils_func import create_windows



In [None]:
## Use Kalman filter to smooth data for visualization
# The smoother is configured with a level component plus a long-season component;
# `component_noise` gives one noise value per component.
# NOTE(review): the frame being smoothed is hourly, so n_longseasons=365 presumably
# counts hourly samples rather than days - confirm the intended season length.

smoother = KalmanSmoother(component = 'level_longseason',
                         component_noise ={'level':0.5,
                                          'longseason':0.1},
                         n_longseasons=365)

# The four TSO columns are transposed so that each TSO becomes one row of the
# array handed to the smoother.
smoother.smooth(wind_data_all_H[['TennetTSO', '50Hertz','TransnetBW','Amprion']].T)


In [None]:
# Line colour per series position
color = {0:'red', 1:'orange', 2:'green', 3:'purple'}

# Same column order as the one passed to smoother.smooth(...), so position i
# in smoother.data / smoother.smooth_data belongs to cols[i].
cols = ['TennetTSO', '50Hertz','TransnetBW','Amprion']

for i, name in enumerate(cols):
    plt.figure(figsize=(8,4))
    # faint line: the data held by the smoother; solid line: its smoothed version
    plt.plot(wind_data_all_H.index, smoother.data[i], c=color[i], label=name, alpha=0.3)
    plt.plot(wind_data_all_H.index, smoother.smooth_data[i], c=color[i], label=name+'smooth')
    plt.legend()
    # The original wrote `plt.show` without parentheses, which only references
    # the function and never calls it.
    plt.show()



In [None]:

wind_data_all_H = wind_data_all_H.set_index('Dates')
wind_data_all_H


In [None]:
# One stacked panel per TSO. plt.subplots creates the figure itself, so the
# earlier standalone plt.figure(...) call (which only produced an empty extra
# figure) is gone.
df = wind_data_all_H.reset_index()
fig, (ax1,ax2,ax3,ax4) = plt.subplots(4,1, figsize=(15,7))

panel_specs = [
    (ax1, 'TennetTSO', 'olive'),
    (ax2, '50Hertz', 'green'),
    (ax3, 'TransnetBW', 'purple'),
    (ax4, 'Amprion', 'red'),
]

for panel_ax, tso_name, line_color in panel_specs:
    # x and y are given as column names and resolved against `data=df`
    panel_ax.plot('Dates', tso_name, data=df, marker='', color=line_color,
                  linewidth=2, linestyle='dashed', label=tso_name)
    # legend sits just above its panel, stretched across the panel width
    panel_ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left',
                    ncol=2, mode="expand", borderaxespad=0.)

plt.show()

# Look for any correlation

In [None]:
# wind_data_all_H.set_index('Dates',inplace=True)

data_v1 = wind_data_all_H[['TennetTSO', '50Hertz','TransnetBW','Amprion']]
data_v1

In [None]:
# Rank-based (Spearman) correlation between the four TSO series
df_corr = data_v1.corr(method='spearman')

# Boolean mask that is True on and above the diagonal; masked cells are not
# drawn, so only the lower triangle of the matrix is shown
mask = np.triu(np.ones(df_corr.shape, dtype=bool))

# Square canvas for the heatmap
f, ax = plt.subplots(figsize=(6, 6))

# Diverging palette returned as a colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated heatmap centred on zero, drawn on the axes created above
heatmap_style = dict(
    vmax=1.0,
    center=0,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(df_corr, mask=mask, cmap=cmap, ax=ax, **heatmap_style)

They all seem to be correlated with each other.



# Time Series Modeling

In [None]:
#import required python packages

import datetime
import lightgbm as lgb
import xgboost as xgb
import plotly.express as px

from typing import Optional, List, Dict
from fbprophet import Prophet
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

plt.style.use('fivethirtyeight')

Let's work on the first time series of TenneT TSO

In [None]:
# Hourly mean of the TennetTSO series. The lowercase 'h' alias is used because
# the uppercase 'H' spelling is deprecated in newer pandas releases.
ts1 = ten_df.resample('h', on = 'Dates').mean().reset_index()
# Generic target name used by the modelling cells that follow
ts1 = ts1.rename(columns={'TennetTSO':'TWH_Generation'})
ts1

### Generating date related features

In [None]:
    # Seasons in Germany: spring = March-May, summer = June-August,
    # autumn = September-November, winter = December-February.
    # NOTE(review): this cell's code is indented at top level; presumably the
    # notebook front-end tolerates that, but a plain .py export would not - confirm.

    # Season name of each calendar month, January first
    seasons = ['Winter','Winter','Spring','Spring','Spring','Summer','Summer','Summer','Autumn','Autumn','Autumn','Winter']
    # Numeric code of each month's season, January first
    # (lined up with `seasons`: 1 = Spring, 2 = Summer, 3 = Autumn, 4 = Winter)
    season_num = [4,4,1,1,1,2,2,2,3,3,3,4]
    # Lookup from month number (1-12) to season code
    month_to_season_num = dict(zip(range(1,13), season_num))

In [None]:
month_to_season_num

In [None]:
def return_date_features(df, datetime_col, season_map=None):
    """
    Create date related features for the datetime column that is passed.

    The new columns are written onto ``df`` itself (the caller's frame is
    modified in place) and that same frame is returned.

    df           : DataFrame holding a datetime column
    datetime_col : name of that datetime column
    season_map   : optional mapping of month number (1-12) to season code;
                   when omitted, the notebook-level ``month_to_season_num``
                   is used, exactly as before

    returns      : the input dataframe with these extra columns:
                   date_hour, date, hour_of_day, day_of_week, month_of_year,
                   day_of_year, week_of_year, Season
    """
    if season_map is None:
        # default lookup defined in an earlier notebook cell
        season_map = month_to_season_num

    # Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
    # newer pandas releases.
    df['date_hour'] = df[datetime_col].dt.floor('h')
    df['date'] = df[datetime_col].dt.floor('D')
    df['hour_of_day'] = df[datetime_col].dt.hour
    df['day_of_week'] = df[datetime_col].dt.dayofweek
    df['month_of_year'] = df[datetime_col].dt.month
    df['day_of_year'] = df[datetime_col].dt.dayofyear
    # ISO-8601 week number
    df['week_of_year'] = df[datetime_col].dt.isocalendar().week

    df['Season'] = df[datetime_col].dt.month.map(season_map)

    return df
    

In [None]:
ts1 = return_date_features(ts1, 'Dates')
ts1.head()

In [None]:
#plot the time series
fig = ts1.plot(x="Dates", y="TWH_Generation", style='.', figsize=(15,5))

In [None]:
#plotting additional aggregates for the features that were created

def lineplot_agg_grouped(df, group_col, target_col, y_label, agg_type = 'mean', x_ticks = 0):
    """
    Draw a line plot of one column aggregated per value of another column.

    df         : source dataframe
    group_col  : column whose distinct values form the x-axis
    target_col : column that is aggregated within each group
    y_label    : text for the y-axis label
    agg_type   : aggregation applied to target_col (default 'mean')
    x_ticks    : rotation, in degrees, of the x tick labels (default 0)

    Nothing is returned; a new 6x4 figure is created and drawn on.
    """
    # one row per group, holding the group key and the aggregated target
    grouped = df.groupby([group_col], as_index=False).agg({target_col: agg_type})

    plt.figure(figsize=(6,4))
    axis = sns.lineplot(data=grouped, x=grouped[group_col], y=target_col, marker='o')
    axis.set_ylabel(y_label)
    plt.xticks(rotation=x_ticks)

In [None]:
# Mean generation per hour of day, per weekday and per season - one figure each.
# agg_type and x_ticks are left at the function defaults ('mean' and 0).

ts_features = ['hour_of_day','day_of_week','Season']

for ts_feature in ts_features:
    lineplot_agg_grouped(ts1, ts_feature, 'TWH_Generation', 'Average Generation (TWH)')
    

#### Observations:
Average generation seems to be higher during the day, with the hours around sunrise showing the lowest generation.

Weekend generation is generally higher compared to weekdays. One guess is that this may be due to less curtailment on the weekends.

Generation in Summer (2) is least compared to all the other seasons. Autumn (3) is next followed by Spring (1) and the highest average generation is during the Winter (4) months.





In [None]:
# Chronological split: rows dated on or after `split_date` form the test set,
# everything earlier is used for training.

split_date = '2020-07-22'
is_test = ts1['date'] >= split_date

# keep the split label on ts1 as well; the train/test plot colours by it
ts1['type'] = np.where(is_test, 'test', 'train')

train = ts1.loc[~is_test].reset_index(drop=True)
test = ts1.loc[is_test].reset_index(drop=True)


In [None]:
#plotting time series grouped by train and test datasets
fig = px.scatter(ts1, x="Dates", y="TWH_Generation", hover_data=['day_of_week'], color='type')
fig.update_traces(mode='markers+lines', marker_size=3)
fig.show()

### Create an evaluation metric 

In [None]:
def eval_accuracy(y_true, y_pred):
    """
    Compute MAE, RMSE and MAPE for a forecast.

    y_true - actual values (pandas series or any 1-D array-like)
    y_pred - predicted values, in the same order as y_true
    return: dictionary with MAE, RMSE and MAPE metrics, each rounded to 3 decimals

    Values are compared position by position. Converting to plain arrays first
    keeps pandas from aligning the two series on their index labels, which
    would pair up the wrong rows (or produce NaN) when the indexes differ.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    #mean absolute error (MAE)
    mae = mean_absolute_error(actual, forecast)

    #root mean squared error (RMSE)
    rmse = np.sqrt(mean_squared_error(actual, forecast))

    #mean absolute percentage error (MAPE)
    # A percentage error is undefined where the actual value is zero, so those
    # points are left out instead of turning the whole metric into inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        pct_err = np.abs((actual[nonzero] - forecast[nonzero]) / actual[nonzero])
        mape = np.mean(pct_err) * 100
    else:
        # no usable denominator at all
        mape = np.nan

    return {'MAE': round(mae, 3),
           'RMSE': round(rmse, 3),
           'MAPE': round(mape, 3)}

    

## TS Forecasting with FB Prophet

Create a time series model with Facebook's Prophet package using the hourly generation data for Tennet TSO

In [None]:
# Prophet with two tuned (non-default) hyperparameters:
# changepoint_prior_scale=0.5 and seasonality_prior_scale=0.01
# https://facebook.github.io/prophet/docs/diagnostics.html#hyperparameter-tuning
# (previously tried: seasonality_prior_scale=1)
model_prophet = Prophet(changepoint_prior_scale= 0.5,seasonality_prior_scale=0.01)
# Prophet reads the timestamp from 'ds' and the target from 'y', hence the renames
model_prophet.fit(train.rename(columns={'Dates':'ds', 'TWH_Generation':'y'}))
# Forecast for the timestamps of the held-out test rows
test_prophet = model_prophet.predict(df = test.rename(columns={'Dates':'ds', 'TWH_Generation':'y'}))

In [None]:
#check the predictions

test_prophet.head()

In [None]:
# plot historical values and forecast values

fig = model_prophet.plot(test_prophet, figsize=(8,5))

The initial model is pretty terrible, so let's go back and figure out which parameters we can change.

In [None]:
fig = model_prophet.plot_components(test_prophet)

In [None]:
# Wide canvas (15 x 5 inches): actual test values as red dots, with Prophet's
# forecast plot drawn onto the same axes.
f, ax = plt.subplots(1, figsize=(15, 5))
ax.scatter(test["Dates"], test["TWH_Generation"], color='r')
fig = model_prophet.plot(test_prophet, ax=ax)

In [None]:
eval_accuracy(test["TWH_Generation"],test_prophet['yhat'])

Using default parameters for prophet model, we get: {'MAE': 52.079, 'RMSE': 86.857, 'MAPE': 179.253}

adding yearly_seasonality=20 to model gives {'MAE': 109.27, 'RMSE': 135.33, 'MAPE': 568.851}

reducing yearly_seasonality to 4 in the model gives {'MAE': 211.261, 'RMSE': 238.702, 'MAPE': 1226.523}

changing the model to use logistic growth with cap of 1000 and floor of 0 gives {'MAE': 56.184, 'RMSE': 96.313, 'MAPE': 110.319}

regular model with seasonality_prior_scale set to 1 gives {'MAE': 52.17, 'RMSE': 86.31, 'MAPE': 187.787}

regular model with changepoint_prior_scale set to 0.5 gives {'MAE': 53.286, 'RMSE': 84.874, 'MAPE': 223.09}

regular model with changepoint_prior_scale=0.5, seasonality_prior_scale=1 gives {'MAE': 53.269, 'RMSE': 84.898, 'MAPE': 222.523}

regular model with changepoint_prior_scale=0.5, seasonality_prior_scale=7 gives {'MAE': 53.372, 'RMSE': 84.818, 'MAPE': 225.125}

regular model with 'changepoint_prior_scale': 0.5, 'seasonality_prior_scale': 0.01 gives {'MAE': 53.049, 'RMSE': 84.512, 'MAPE': 223.666}


In [None]:
# Attach Prophet's output to the test rows so that actual and forecast values
# can be plotted together in the next cell.
# 'ds' is renamed back to 'Dates' for the join and 'yhat' becomes 'prediction_prophet';
# the left join keeps every test row.

test = pd.merge(left=test, right=test_prophet.rename(columns={'ds':"Dates",'yhat':'prediction_prophet'}), 
               on = "Dates",
               how="left")

In [None]:
# plot

fig = px.scatter(test, x="Dates", y=["TWH_Generation", "prediction_prophet"],
                hover_data=["day_of_week"])
fig.update_traces(mode='markers+lines', marker_size=3)
fig.show()

### Hyperparameter tuning using cross-validation

In [None]:
#Cross-validation 
import itertools
import time
import logging
logging.getLogger().setLevel(logging.ERROR)
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics

def run_prophet_cv(timeserie, param_grid=None, cutoffs=None, horizon='90 days'):
    """
    Grid-search Prophet hyperparameters and return the combination with the lowest MAPE.

    timeserie  : dataframe in Prophet's input layout ('ds' timestamps, 'y' values)
    param_grid : optional dict mapping a Prophet keyword argument to the list of
                 values to try; defaults to the 4 x 4 grid over
                 changepoint_prior_scale and seasonality_prior_scale used originally
    cutoffs    : optional cutoff timestamps passed to cross_validation;
                 defaults to the single cutoff 2020-04-22
    horizon    : forecast horizon passed to cross_validation (default '90 days')

    returns    : dict holding the best parameter combination

    Calling it with only `timeserie` behaves as before. Progress information
    (best parameters, elapsed seconds, start time) is printed.
    """
    start_time = time.time()
    # NOTE(review): the fixed 7-hour shift presumably converts the server clock
    # to the author's local time - confirm before relying on the printed value.
    cv_start_time = datetime.datetime.now() - datetime.timedelta(hours=7)

    if param_grid is None:
        param_grid = {
            'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
            'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
        }
    if cutoffs is None:
        cutoffs = pd.to_datetime(['2020-04-22'])

    # Generate all combinations of parameters
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
    mapes = []  # MAPE of each combination, in the same order as all_params

    # Use cross validation to evaluate all parameters
    # other performance metrics available like mse, rmse, mae, mape, mdape,smape,coverage
    for params in all_params:
        m = Prophet(**params).fit(timeserie)  # Fit model with given params
        df_cv = cross_validation(m, cutoffs=cutoffs, horizon=horizon, parallel='threads')
        # rolling_window=1 yields a single row of metrics; its 'mape' value is read below
        df_p = performance_metrics(df_cv, rolling_window=1)
        mapes.append(df_p['mape'].values[0])

    # Find the best parameters
    best_params = all_params[np.argmin(mapes)]
    print('best param:',best_params)
    print("---CV took %s seconds ----" % (time.time() - start_time))
    print("--- CV started at ----", cv_start_time)
    return best_params

In [None]:
f = run_prophet_cv(train.rename(columns={'Dates':'ds', 'TWH_Generation':'y'}))
f

best param: {'changepoint_prior_scale': 0.5, 'seasonality_prior_scale': 0.01}
---CV took 203.51009392738342 seconds ----

Some more tuning is needed for this model to get better accuracy. 

Insights and ideas are welcome!

---More to follow ---