In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import seaborn as sns
import datetime
from group_lasso import GroupLasso
from sklearn.utils import resample, check_random_state
from sklearn.model_selection import cross_val_score, cross_validate

from extra_functions import *

# Silence warnings: 'ignore' is given without a category or module filter,
# so every warning is suppressed, including deprecation notices raised by
# the libraries imported above.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw CSV, parse the 'date' column into timestamps and use it as the
# index so that later cells can filter on df.index.
df = (
    pd.read_csv('energydata_complete.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# plot_data comes from extra_functions; keeping `fig` as the last expression
# makes the notebook render the figure.
fig = plot_data(df)
fig

### Generating extra features to describe time
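weekday: day of the week as a number in [0, 6] (Monday = 0)\
weekstatus (column `week status`): binary flag, weekend (1) or workday (0)\
NSM: Number of Seconds from Midnight\
month: month of the year as a number in [1, 12]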

These are used for filtering the data

In [None]:
# Derive the calendar features directly from the DatetimeIndex in vectorized
# form instead of looping over every timestamp in Python.
# All four arrays are kept as float64, matching the np.zeros buffers that the
# loop-based version filled.
timestamps = df.index

# Day of the week: Monday = 0 ... Sunday = 6
weekday = np.asarray(timestamps.weekday, dtype=float)

# 0.0 for workdays, 1.0 for Saturday/Sunday
weekstatus = (weekday >= 5).astype(float)

# Seconds elapsed since midnight of the same day
NSM = np.asarray((timestamps - timestamps.normalize()).total_seconds(), dtype=float)

# Calendar month, 1 ... 12
month = np.asarray(timestamps.month, dtype=float)

df['weekday'] = weekday
df['week status'] = weekstatus
df['NSM'] = NSM
df['month'] = month


## Add n previous timepoints to the data

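Here we add the response (`Appliances`) observed n timepoints earlier as an extra covariate, stored in a column named `t-n`. The first max(n) rows, which lack a complete history, are dropped afterwards.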


In [None]:
# Lags (in timepoints) to add as covariates; edit the list to skip some "n"
ns = [1, 10, 100]
y = df['Appliances'].values  # response values the lags are built from

for n in ns:
    # Shift the response n steps forward in time; the first n entries have no
    # predecessor, so they are filled with zeros (and dropped below).
    lagged = np.roll(y, n)
    lagged[:n] = 0
    df[f"t-{n}"] = lagged

# Drop the leading rows that lack history for the largest lag
n = max(ns)
df = df.iloc[n:]

In [None]:
# Sanity check for the lag feature: 't-1' moved one step ahead must reproduce
# 'Appliances', so the two curves should be identical.
lag1_head = df['t-1'].iloc[:10]

lag_fig, lag_ax = plt.subplots()
lag_ax.plot(lag1_head, df['Appliances'].iloc[:10], lw=3, label="real")
lag_ax.plot(lag1_head, df['t-1'].iloc[1:11], label="shifted t-1")
lag_ax.set_xlabel('t-1')
lag_ax.set_ylabel('Appliances')
lag_ax.legend()
plt.show()

### Filtering data and making training/validation/test set

In [None]:
# Split chronologically by calendar month:
#   January + February -> training, March -> validation, April -> test.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for this
# 1-D month array both return the same boolean mask.
indices = np.isin(df.index.month, (1, 2))
df_train = df[indices]
df_valid = df[(df.index.month==3)]
df_test = df[(df.index.month==4)]


# Training data
# The response is reshaped into a single column; every column after the first
# is used as a covariate.
# NOTE(review): this assumes 'Appliances' is the first column of df - confirm.
y = np.array(df_train['Appliances']).reshape(-1,1)
X = np.array(df_train[df_train.columns[1:]])
# standardize is provided by extra_functions
X, y = standardize(X,y)

In [None]:
# Check that the filter was correct: the number of January plus February rows
# in df should equal the size of the training frame.
row_months = df.index.month
n_jan_feb = len(df[row_months == 1]) + len(df[row_months == 2])
print(n_jan_feb)
print(len(df_train))

### Correlations of covariates

In [None]:
# Pairwise correlation of every column after the first in the training frame
cor = df_train.iloc[:, 1:].corr()

# Draw the matrix as a square heatmap with every tick label shown
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    cor,
    square=True,
    xticklabels=True,
    yticklabels=True,
    cmap='RdBu',
    ax=ax,
)
plt.show()

## Recurrent NN

In [None]:
# Martins code here

## Random forest

In [None]:
# Youngrung's code here

## Decision tree surrogacy

In [None]:
# Sander's code here