In [1]:
import pandas as pd 
import numpy as np
import sklearn.metrics as metrics
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; None means "neither warn nor raise".
pd.options.mode.chained_assignment = None

In [2]:
# Location of the daily OPSD Germany CSV (hosted on GitHub).
url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'

# Fetch and parse the comma-separated file into a DataFrame.
data = pd.read_csv(url, sep=",")

# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


In [3]:
# Parse the Date column into datetimes and promote it to the index in one
# chained step, so the frame ends up indexed by date.
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')
data.head(n=3)

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-01,1069.184,,,
2006-01-02,1380.521,,,
2006-01-03,1442.533,,,


### Goals of Prediction
Our aim is to predict Consumption (ideally for future unseen dates) from this time series dataset.
### Training and Test set
We will use 11 years of data for training, i.e. 2006–2016, and the last year of data for testing, i.e. 2017.
### Performance Measure
To evaluate how good our model is, we will use R-squared and Root Mean Squared Error (RMSE); the other relevant metrics are printed as well so that you can make the final call.

In [4]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    The report contains, in this order: explained variance, mean squared
    log error, R-squared, MAE, MSE and RMSE. Every value is rounded to four
    decimal places. The function only prints; it returns None.

    Parameters
    ----------
    y_true : array-like
        Observed target values, forwarded unchanged to the
        ``sklearn.metrics`` scoring functions.
    y_pred : array-like
        Predicted target values, forwarded the same way.

    Notes
    -----
    ``mean_squared_log_error`` is not defined when the targets or the
    predictions contain negative values and scikit-learn raises
    ``ValueError`` in that case. It is then reported as ``nan`` so that the
    remaining metrics are still printed instead of the whole report failing.
    """
    # These scorers run first, so generic input problems (mismatched lengths,
    # invalid values) still propagate to the caller exactly as before.
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # By this point the inputs have already passed validation above, so a
    # ValueError here signals negative values rather than malformed input.
    try:
        mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        mean_squared_log_error = float('nan')

    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

### Feature Engineering

As a baseline, we choose a simple model that predicts today’s consumption value from:
* yesterday’s consumption value; and
* the difference between yesterday’s and the day before yesterday’s consumption values.

In [5]:
# Build the modelling frame from an explicit copy of the Consumption column,
# so adding feature columns below never writes into a slice of `data` and does
# not rely on SettingWithCopyWarning being silenced.
data_consumption = data[['Consumption']].copy()
# Lag feature: shift consumption down by one row so each date carries the
# previous row's (yesterday's) value; the first row has no predecessor -> NaN.
data_consumption['Yesterday'] = data_consumption['Consumption'].shift(1)

In [6]:
# Preview the first five rows to check the new 'Yesterday' lag column.
data_consumption.head(n=5)

Unnamed: 0_level_0,Consumption,Yesterday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-01-01,1069.184,
2006-01-02,1380.521,1069.184
2006-01-03,1442.533,1380.521
2006-01-04,1457.217,1442.533
2006-01-05,1477.131,1457.217


In [10]:
# Second feature: row-wise difference of the 'Yesterday' column, i.e.
# yesterday's consumption minus the day before yesterday's.
data_consumption.loc[:, 'Yesterday_Diff'] = data_consumption['Yesterday'].diff(periods=1)
# The dropna() call below is disabled, so the first two rows (which lack a
# full lag history) keep their NaN values in the frame.
#data_consumption = data_consumption.dropna()
data_consumption.head(n=5)

Unnamed: 0_level_0,Consumption,Yesterday,Yesterday_Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006-01-01,1069.184,,
2006-01-02,1380.521,1069.184,
2006-01-03,1442.533,1380.521,311.337
2006-01-04,1457.217,1442.533,62.012
2006-01-05,1477.131,1457.217,14.684


<img src='series diff.PNG'>