# Part 1: Setup <a id="Part1_REFIT"> </a>

## 1.1: Initial setup

**Step 1:** Import the relevant packages and set Seaborn/Matplotlib hyperparameters.

In [None]:
import datetime
import graphviz
import hdbscan
import holidays
import os
import sklearn
import tensorflow
import umap

import matplotlib.dates as md
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

from imblearn.over_sampling import SMOTENC
from scipy.signal import savgol_filter
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV, mutual_info_regression
from sklearn.inspection import permutation_importance
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler
from sklearn import metrics, tree
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import LSTM, Conv1D, ConvLSTM2D, Dense, Flatten, LeakyReLU, MaxPooling1D, MaxPooling2D, RepeatVector, TimeDistributed
from tensorflow.keras.models import Sequential, load_model
from tqdm import notebook

# Base Matplotlib style; the sns.set() call that follows then applies Seaborn's
# "whitegrid" style and "muted" palette.
plt.style.use("fivethirtyeight")
sns.set(style="whitegrid", palette="muted")

# Global figure defaults: large fonts and a 16 x 10 inch canvas at 300 dpi.
plt.rcParams["font.size"] = 24
plt.rcParams["axes.labelsize"] = 26
plt.rcParams["axes.titlesize"] = 26
plt.rcParams["figure.figsize"] = 16, 10
plt.rcParams["figure.dpi"] = 300
plt.rcParams["xtick.labelsize"] = 22
plt.rcParams["ytick.labelsize"] = 22
plt.rcParams["legend.fontsize"] = 22

# Print NumPy floats without scientific notation and pandas floats with two decimals.
np.set_printoptions(suppress=True)
pd.options.display.float_format = "{:.2f}".format

# Single seed for TensorFlow and NumPy; later cells also pass it as random_state.
RANDOM_SEED=3141589

tensorflow.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# IPython magic (not plain Python): switch off Jedi-based tab completion.
%config Completer.use_jedi = False

**Step 2:** Define the location of our data as well as the relevant columns that we would like to include.

In [None]:
# REFIT household to analyse; it selects the CSV file name below.
house_number = 12

# Columns to load: the timestamp, the whole-house aggregate and the nine
# appliance channels "Appliance1" .. "Appliance9".
cols_REFIT = ["Time", "Aggregate", *(f"Appliance{channel}" for channel in range(1, 10))]

# Input file names.
house = "CLEAN_House{}.csv".format(house_number)
solcast_15 = "Solcast_REFIT_15.csv"

# Input directories, relative to the working directory.
data_directory_REFIT = os.path.join("Data", "REFIT")
data_directory_Solcast = os.path.join("Data", "Solcast_REFIT")

# Full paths of the two CSV files.
file_destination_REFIT = os.path.join(data_directory_REFIT, house)
file_destination_Solcast = os.path.join(data_directory_Solcast, solcast_15)

**Step 3:** Read in the data and save it to a dataframe.

In [None]:
# Load both CSV files; column 0 of each (presumably the timestamp) is parsed
# as dates and used as the index. Only the columns in cols_REFIT are read from REFIT.
df_REFIT = pd.read_csv(file_destination_REFIT, index_col=0, parse_dates=True, usecols=cols_REFIT)
df_Solcast = pd.read_csv(file_destination_Solcast, index_col=0, parse_dates=True)

# Name the Solcast index "Time" and make it timezone-naive.
df_Solcast.index = df_Solcast.index.rename("Time")
df_Solcast.index = pd.to_datetime(df_Solcast.index).tz_localize(None)

# From here on cols_REFIT lists data columns only.
# NOTE: this mutates the list in place, so this cell cannot be re-run on its
# own; re-run the cell that defines cols_REFIT first.
cols_REFIT.remove("Time")

In [None]:
# Divide every reading by 1000 -- presumably watts to kilowatts, matching the
# "Kilowatts" axis label used in later plots (TODO confirm source units).
# Not idempotent: each re-run of this cell divides the data again.
df_REFIT = df_REFIT / 1000

## 1.2: Scale the data in the dataframe(s)

**Step 1.1:** Scale the data in a range between 0 and 1 (optional).

In [None]:
# minmax_REFIT = MinMaxScaler()
# minmax_Solcast = MinMaxScaler()

# df_REFIT[cols_REFIT] = minmax_REFIT.fit_transform(df_REFIT[cols_REFIT])
# df_Solcast[cols_Solcast] = minmax_Solcast.fit_transform(df_Solcast[cols_Solcast])

**Step 1.2:** Standardize the data by removing the mean and scaling to unit variance (optional).

In [None]:
# standardscale_REFIT = StandardScaler()
# standardscale_Solcast = StandardScaler()

# df_REFIT[cols_REFIT] = standardscale_REFIT.fit_transform(df_REFIT[cols_REFIT])
# df_Solcast[cols_Solcast] = standardscale_Solcast.fit_transform(df_Solcast[cols_Solcast])

## 1.3: Merge the dataframes

**Step 1:** Create a copy of our REFIT dataframe that is resampled into a resolution of 15 minutes and drop any days that contain an incomplete number of values.

In [None]:
# Average the readings into 15-minute bins, then drop every bin that has a
# missing value in any column.
df_REFIT_resampled = df_REFIT.resample("15min").mean()
df_REFIT_resampled = df_REFIT_resampled.dropna()

# Count the remaining bins per calendar day and list the days with fewer than
# 96 of them (96 = 24 hours x 4 bins per hour).
mask = df_REFIT_resampled.groupby(df_REFIT_resampled.index.date).size()
mask = mask[mask < 96].index.to_list()

# Keep only the rows whose day is not in that list of incomplete days.
df_REFIT_resampled = df_REFIT_resampled[~df_REFIT_resampled.index.floor("D").isin(mask)]

**Step 2:** Create a third dataframe that is the result of merging the Solcast dataframe with the REFIT dataframe.

In [None]:
# Join the Solcast frame and the resampled REFIT frame on their timestamps.
# The keys are passed as index arrays, so the merged key ends up in a column
# called "key_0" (renamed below).
df_Merged = pd.merge(left=df_Solcast, left_on=df_Solcast.index, right=df_REFIT_resampled, right_on=df_REFIT_resampled.index)

# Columns removed after the merge: "PeriodStart"/"Period" and the nine
# appliance channels, so "Aggregate" is the only REFIT column kept.
cols_Merged = [
    "PeriodStart",
    "Period",
    "Appliance1",
    "Appliance2",
    "Appliance3",
    "Appliance4",
    "Appliance5",
    "Appliance6",
    "Appliance7",
    "Appliance8",
    "Appliance9",
]

df_Merged.drop(cols_Merged, axis=1, inplace=True)
# Turn the merge key back into a datetime index named "Time".
df_Merged.rename(columns={"key_0": "Time"}, inplace=True)
df_Merged = df_Merged.set_index("Time")
df_Merged.index = pd.to_datetime(df_Merged.index)
# Preview the first rows (displayed as the cell output).
df_Merged.head()

## 1.4: Append temporal features to our merged dataframe

**Step 1:** Append public holidays to our merged dataframe.

In [None]:
# UK public-holiday calendar from the `holidays` package.
UK_holidays = holidays.UnitedKingdom()
# New first column: 1 when the row's calendar date is in the holiday calendar, else 0.
df_Merged.insert(0, "Holiday", [1 if str(val).split()[0] in UK_holidays else 0 for val in df_Merged.index.date])
# Store the flag as a categorical column.
df_Merged["Holiday"] = df_Merged["Holiday"].astype("category")

**Step 2:** Define day of the year ranges for each of the seasons.

In [None]:
# Day-of-year ranges of the seasons (end value excluded); every day outside
# these three ranges counts as winter.
spring = range(79, 172)
summer = range(172, 266)
fall = range(266, 355)

def season(doy):
    """Return the season code of a day of the year.

    Args:
        doy: Day of the year as an integer (1 = 1 January).

    Returns:
        "0" for spring, "1" for summer, "2" for fall and "3" for any other
        day (winter). The codes are strings, not integers.
    """
    for code, days in (("0", spring), ("1", summer), ("2", fall)):
        if doy in days:
            return code
    return "3"

**Step 3:** Append temporal data to our merged dataframe.

In [None]:
# Add calendar features derived from the timestamp index as leading columns.
# Position 2 is skipped on purpose: once "Year" and "Month" are inserted, the
# existing "Holiday" column sits there. Resulting order of the first eight
# columns: Year, Month, Holiday, Day, Hour, Minute, Weekday, Season.
df_Merged.insert(0, "Year", df_Merged.index.year)
df_Merged.insert(1, "Month", df_Merged.index.month)
df_Merged.insert(3, "Day", df_Merged.index.day)
df_Merged.insert(4, "Hour", df_Merged.index.hour)
df_Merged.insert(5, "Minute", df_Merged.index.minute)
df_Merged.insert(6, "Weekday", df_Merged.index.weekday)
# Season code ("0".."3") looked up from the day of the year via season().
df_Merged.insert(7, "Season", df_Merged.index.dayofyear.map(season))

## Miscellaneous functions

### 1) Augmented Dickey–Fuller test

In [None]:
def adfuller_test(series, signif=0.05, name=""):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    The null hypothesis is that the series has a unit root (is non-stationary).
    It is rejected when the p-value is at most ``signif``.

    Args:
        series: The time series to test; passed unchanged to ``adfuller``.
        signif: Significance level used for the decision.
        name: Label for the series, shown in the report header.

    Returns:
        None. The report is printed to standard output.
    """
    r = adfuller(series, autolag="AIC")
    test_statistic, p_value, n_lags = r[0], r[1], r[2]
    critical_values = r[4]

    print(f'      Augmented Dickey-Fuller Test on "{name}"', "\n   ", "-" * 47)
    print(" Null Hypothesis: Data has unit root. Non-Stationary.")
    print(f" Significance Level    = {signif}")
    print(f" Test Statistic        = {round(test_statistic, 4)}")
    print(f" No. Lags Chosen       = {round(n_lags, 4)}")

    for key, val in critical_values.items():
        print(f" Critical value {str(key).ljust(6)} = {round(val, 3)}")

    # Decide on the exact p-value; rounding is for display only. Comparing the
    # rounded value would treat e.g. p = 0.05004 as <= 0.05.
    shown_p_value = round(p_value, 4)
    if p_value <= signif:
        print(f" => P-Value = {shown_p_value}. Rejecting Null Hypothesis.")
        print(" => Series is Stationary.")
    else:
        print(f" => P-Value = {shown_p_value}. Weak evidence to reject the Null Hypothesis.")
        print(" => Series is Non-Stationary.")

### 2) Augmented Dickey–Fuller test w/plot

In [None]:
def test_stationarity(series, signif=0.05, name="", ylabel=""):
    """Plot a series with its rolling mean and standard deviation, then run the ADF test.

    Args:
        series: pandas Series to inspect.
        signif: Significance level forwarded to ``adfuller_test``.
        name: Legend label for the raw series and header label of the ADF report.
        ylabel: Label of the y axis.

    Returns:
        None. A figure is shown and the ADF report is printed.
    """
    # Rolling statistics over a 12-sample window.
    rolmean = series.rolling(12).mean()
    rolstd = series.rolling(12).std()

    _, ax = plt.subplots()

    series.plot(ax=ax, alpha=0.5)
    rolmean.plot(ax=ax, alpha=0.7)
    rolstd.plot(ax=ax, alpha=0.7)

    ax.set_xlabel("")
    ax.set_ylabel(ylabel)
    ax.set_xlim(left=0, right=len(series))
    plt.title("Rolling Mean & Standard Deviation")
    plt.setp(ax.get_xticklabels(), ha="right", rotation=60)

    # Create the legend once and relabel its three entries in plotting order.
    leg = plt.legend(loc="best")
    for text, label in zip(leg.get_texts(), (name, "Rolling Mean", "Rolling STD")):
        text.set_text(label)

    plt.tight_layout()
    plt.show(block=False)

    # Forward the caller's significance level (previously a hard-coded 0.05
    # was passed, so the `signif` argument had no effect).
    adfuller_test(series, signif, name)

### 3) Granger Causality test

In [None]:
def grangers_causation_matrix(data, variables, test="ssr_chi2test", maxlag=12, verbose=False):
    """Build a matrix of minimum Granger-causality p-values for every variable pair.

    Args:
        data: DataFrame containing one column per entry of ``variables``.
        variables: Column names to test against each other.
        test: Key of the test statistic to read from the statsmodels result.
        maxlag: Highest lag to test; lags 1..maxlag are evaluated.
        verbose: When True, print the per-lag p-values of every pair.

    Returns:
        DataFrame whose rows are labelled "<var>_y" and columns "<var>_x".
        Each cell holds the smallest p-value (rounded to 2 decimals) over all
        tested lags for the pair (row variable as Y, column variable as X).
    """
    size = len(variables)
    p_matrix = pd.DataFrame(np.zeros((size, size)), columns=variables, index=variables)
    for x_name in p_matrix.columns:
        for y_name in p_matrix.index:
            outcome = grangercausalitytests(data[[y_name, x_name]], maxlag=maxlag, verbose=False)
            p_values = []
            for lag in range(1, maxlag + 1):
                p_values.append(round(outcome[lag][0][test][1], 2))
            if verbose:
                print(f"Y = {y_name}, X = {x_name}, P Values = {p_values}")
            p_matrix.loc[y_name, x_name] = np.min(p_values)
    p_matrix.columns = [var + "_x" for var in variables]
    p_matrix.index = [var + "_y" for var in variables]
    return p_matrix

### 4) Determine which highly correlated independent variables have a stronger correlation with our target variable.

In [None]:
def correlation(df, threshold, target_variable):
    """Pick, from each highly correlated column pair, the column less related to the target.

    Args:
        df: DataFrame of candidate features.
        threshold: A pair is examined when the absolute value of its
            correlation is strictly greater than this value.
        target_variable: Series against which each column of the pair is correlated.

    Returns:
        Set of column names. For every examined pair it contains the column
        whose absolute correlation with ``target_variable`` is lower; on a
        tie, the column that appears later in ``df``.
    """
    pairwise = df.corr()
    names = list(pairwise.columns)
    weaker_columns = set()
    for i, later in enumerate(names):
        for j in range(i):
            # Written as "not >" so that NaN correlations are skipped too.
            if not abs(pairwise.iloc[i, j]) > threshold:
                continue
            earlier = pairwise.index[j]
            later_strength = abs(df[later].corr(target_variable))
            earlier_strength = abs(df[earlier].corr(target_variable))
            weaker_columns.add(earlier if later_strength > earlier_strength else later)
    return weaker_columns

### 5) Reshape correlation matrix

In [None]:
def reshape_corr(df):
    """Return the correlation matrix of ``df`` in long form, one row per distinct pair.

    Args:
        df: DataFrame whose column correlations are computed.

    Returns:
        DataFrame with columns "Feature 1", "Feature 2" and "Correlation".
        Self-pairs are removed, and of the two mirrored rows (A, B) / (B, A)
        only the first occurrence is kept.
    """
    long_form = df.corr().stack().reset_index()
    long_form.columns = ["Feature 1", "Feature 2", "Correlation"]

    # An unordered key per row makes (A, B) and (B, A) compare equal.
    pair_keys = long_form[["Feature 1", "Feature 2"]].apply(frozenset, axis=1)
    mirrored = pair_keys.duplicated()
    self_pair = long_form["Feature 1"] == long_form["Feature 2"]

    return long_form[~(mirrored | self_pair)]

### 6) Forecasting accuracy metrics

In [None]:
def mape(forecast, actual):
    """Return the mean absolute percentage error, in percent, rounded to 2 decimals.

    Args:
        forecast: Predicted values (array-like supporting element-wise arithmetic).
        actual: Observed values of the same shape; zeros here lead to division by zero.
    """
    relative_error = np.abs(forecast - actual) / np.abs(actual)
    return np.round(np.mean(relative_error) * 100, 2)

In [None]:
def forecast_accuracy(forecast, actual):
    """Print MAPE, MAE and RMSE of a forecast, each rounded to 2 decimals.

    Args:
        forecast: Predicted values (array-like supporting element-wise arithmetic).
        actual: Observed values of the same shape.

    Returns:
        None. The three metrics are printed to standard output.
    """
    error = forecast - actual
    abs_error = np.abs(error)

    mape = np.round(np.mean(abs_error / np.abs(actual)) * 100, 2)
    mae = np.round(np.mean(abs_error), 2)
    rmse = np.round(np.mean(error ** 2) ** 0.5, 2)

    report = (
        "Forecasting accuracy metrics:",
        f"\t - MAPE = {mape}%",
        f"\t - MAE = {mae}",
        f"\t - RMSE = {rmse}",
    )
    for line in report:
        print(line)

### 7) Reshape data into a suitable format for single step forecasting using our CNN-LSTM architecture

In [None]:
def to_Supervised(X, y, time_steps=1):
    """Turn a feature frame and a target series into sliding-window samples.

    Args:
        X: DataFrame of features, one row per time step.
        y: Series of targets aligned positionally with ``X``.
        time_steps: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays. ``Xs[k]`` holds rows
        ``k .. k + time_steps - 1`` of ``X`` and ``ys[k]`` is the target at
        position ``k + time_steps``, i.e. the step right after the window.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start : start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

### 8) Reshape data into a suitable format for multi-step forecasting using our CNN-LSTM architecture

In [None]:
def to_Supervised_ms(ts: np.ndarray, lag=1, n_ahead=1, target_index=0) -> tuple:
    """Turn a 2-D series into sliding-window samples for multi-step forecasting.

    Args:
        ts: 2-D array of shape (time steps, features).
        lag: Number of consecutive rows in each input window.
        n_ahead: Number of future steps to predict per sample.
        target_index: Column of ``ts`` that holds the target variable.

    Returns:
        Tuple ``(X, Y)``. ``X`` has shape (samples, lag, features); ``Y`` has
        one row per sample with the ``n_ahead`` target values that follow the
        window. If ``ts`` has no more than ``lag`` rows, the whole series is
        used as the only window and ``Y`` is empty.

    Raises:
        ValueError: From the final reshape when ``ts`` has fewer than ``lag`` rows.
    """
    n_features = ts.shape[1]
    X, Y = [], []

    if len(ts) - lag <= 0:
        X.append(ts)
    else:
        # "+ 1" keeps the final window, whose targets end on the last row of
        # ts. Without it the last valid sample was dropped, and the sample
        # count disagreed with to_Supervised for n_ahead == 1.
        for i in range(len(ts) - lag - n_ahead + 1):
            Y.append(ts[(i + lag) : (i + lag + n_ahead), target_index])
            X.append(ts[i : (i + lag)])
    X, Y = np.array(X), np.array(Y)

    X = np.reshape(X, (X.shape[0], lag, n_features))

    return X, Y

# Part 2: Clustering <a id="Part2_REFIT"> </a>

## 2.1: Dimensionality reduction

**Step 1:** Create a copy of our REFIT dataframe

In [None]:
# Work on a copy so the reshaping done for clustering leaves df_REFIT_resampled unchanged.
df_REFIT_resampled_c = df_REFIT_resampled.copy()

**Step 2:** Drop all columns except the `Aggregate` column.

In [None]:
# Every REFIT data column except "Aggregate" (the appliance channels) ...
cols_REFIT_c = cols_REFIT.copy()
cols_REFIT_c.remove("Aggregate")
# ... is dropped from the clustering copy, leaving only the aggregate load.
df_REFIT_resampled_c.drop(cols_REFIT_c, axis=1, inplace=True)

**Step 3:** Reshape our dataframe as 96 columns that represent the 96 15-minute chunks of each day.

In [None]:
# Split every timestamp into a (Date, Time) index pair, then move the Time
# level into the columns: one row per day, one column per time of day.
df_REFIT_resampled_c.index = pd.MultiIndex.from_arrays([df_REFIT_resampled_c.index.date, df_REFIT_resampled_c.index.time], names=["Date", "Time"])
df_REFIT_resampled_c = df_REFIT_resampled_c.unstack()

### 2.1.1: Statistical parameters

**Step 1:** Split our day into 5 periods:
 - `LEEM`: Late evening/early morning (23:30-06:00)
 - `MR`: Morning (06:00-11:00)
 - `LMAF`: Late morning/afternoon (11:00-15:00)
 - `LAEE`: Late afternoon/early evening (15:00-20:30)
 - `EV`: Evening (20:30-23:30)

In [None]:
# Slice the daily profile by column position. Assuming every day has all 96
# slots, column k is the k-th 15-minute slot (0 = 00:00, 95 = 23:45).
# NOTE(review): the slices do not match the period table above exactly.
#   - Neighbouring periods share their boundary slot (44, 60 and 82).
#   - LEEM starts at slot 92 (23:00), not slot 94 (23:30), so slots 92-94
#     belong to both LEEM and EV.
# Confirm whether these overlaps are intended.
LEEM = df_REFIT_resampled_c.iloc[:, np.r_[0:24, 92:96]]
MR = df_REFIT_resampled_c.iloc[:, 24:45]
LMAF = df_REFIT_resampled_c.iloc[:, 44:61]
LAEE = df_REFIT_resampled_c.iloc[:, 60:83]
EV = df_REFIT_resampled_c.iloc[:, 82:95]

**Step 2:** Create a new dataframe that consists of the mean, min, max and standard deviation of each of our 5 periods per day. We now represent each day with 20 variables rather than 96.

In [None]:
# Per-day summary statistics of the five periods. Columns are named
# "<PERIOD>_<Stat>" and ordered period by period (Mean, Min, Max, STD),
# which gives 5 x 4 = 20 features per day.
df_SP = pd.DataFrame(
    {
        f"{_period}_{_label}": getattr(_slots, _method)(axis=1)
        for _period, _slots in (("LEEM", LEEM), ("MR", MR), ("LMAF", LMAF), ("LAEE", LAEE), ("EV", EV))
        for _label, _method in (("Mean", "mean"), ("Min", "min"), ("Max", "max"), ("STD", "std"))
    }
)

### 2.1.2: PCA

**Step 1:** Determine the cumulative explained variance ratio as a function of the number of principal components retained.

In [None]:
# Fit PCA with all components on the per-day statistics to see how the
# explained variance accumulates.
pca = PCA().fit(df_SP)

# Cumulative share of variance; the x axis is the zero-based component index.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of variables")
plt.ylabel("Cumulative explained variance")
plt.tight_layout()
plt.show()

**Step 2:** Apply PCA to our data set.

In [None]:
# Refit PCA keeping 8 components and project the per-day statistics onto them.
# NOTE(review): df_SP1 is not read by any later cell visible here (the UMAP
# step below is fitted on df_SP) -- confirm whether it should be used.
pca = PCA(random_state=RANDOM_SEED, n_components = 8)
df_SP1 = pca.fit_transform(df_SP)

### 2.1.3: UMAP

**Step 1:** Apply UMAP to our data set, pass the resulting 2-dimensional embedding through t-SNE, and plot the final projection.

In [None]:
# 2-D UMAP embedding of the per-day statistics; n_neighbors is the integer
# part of the square root of the number of days.
projection = umap.UMAP(random_state=RANDOM_SEED, n_neighbors=np.power(len(df_REFIT_resampled_c), 0.5).astype(int), min_dist=0.1, n_components=2).fit_transform(df_SP)
# The UMAP output is then passed through t-SNE; this final embedding is what
# gets plotted here and clustered later.
projection = TSNE(random_state=RANDOM_SEED).fit_transform(projection)
plt.scatter(*projection.T)
plt.tight_layout()
plt.show()

In [None]:
# 80/20 split by row position: the first 80% of the days are clustered, the
# remaining days are held out (rows are presumably in date order -- confirm).
# NOTE(review): the embedding above was fitted on all days, including the
# held-out ones -- confirm this is acceptable.
Split_CL = int(len(projection) * 0.8)
projection = projection[0:Split_CL]
df_REFIT_resampled_CL = df_REFIT_resampled_c[0:Split_CL].copy()
df_REFIT_resampled_TE = df_REFIT_resampled_c[Split_CL:].copy()

## 2.2: HDBSCAN

**Step 1:** Define our HDBSCAN clusterer with the appropriate hyperparameters and fit it to our 2-dimensional projection.

In [None]:
# The minimum cluster size is 10% of the embedded days (integer division);
# min_samples is fixed at 15.
HDB = hdbscan.HDBSCAN(min_cluster_size=(len(projection) // 10), min_samples=15)
HDB = HDB.fit(projection)

In [None]:
# Plot HDBSCAN's condensed cluster tree with the selected clusters marked.
HDB.condensed_tree_.plot(select_clusters=True)
plt.show()

**Step 2:** Plot/visualize our clusters.

In [None]:
labels = HDB.labels_
# Shift every label up by one; with HDBSCAN's -1 noise label this makes
# noise 0 and the clusters 1, 2, ... Note that `labels` is now a Python list.
labels = [label + 1 for label in labels]
# Number of clusters, not counting the noise label 0.
n_clusters = len(set(labels)) - (1 if 0 in labels else 0)
n_noise = list(labels).count(0)
# All-False placeholder mask; nothing in this cell marks any core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)

In [None]:
unique_labels = set(labels)
# One colour row per label value 0..n_clusters, so colors[k] is always the
# colour of label k (row 0 = noise, forced to black). When a noise label is
# present this is the same table as before; when it is absent the table now
# still has a row for every cluster index that later cells look up.
colors = np.array(sns.color_palette("bright", n_clusters + 1))
colors[0] = [0, 0, 0]
# Iterate in sorted order and look the colour up by label, rather than relying
# on the iteration order of a set.
for k in sorted(unique_labels):
    col = colors[k]
    if k == 0:
        # Black used for noise.
        col = [0, 0, 0, 1]
    # `labels` is a plain Python list, so `labels == k` is the single value
    # False and no point would ever be selected. Compare element-wise instead.
    class_member_mask = np.asarray(labels) == k

    # Core samples (large markers); core_samples_mask is all False, so this
    # selection is empty unless the mask is filled in elsewhere.
    xy = projection[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], "o", markerfacecolor=tuple(col), markeredgecolor="k", markersize=14)

    # Remaining members of the cluster (small markers).
    xy = projection[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], "o", markerfacecolor=tuple(col), markeredgecolor="k", markersize=6)
# White tick labels hide the embedding's axis values.
plt.xticks(color="w")
plt.yticks(color="w")
plt.title("Estimated number of clusters: %d" % n_clusters)
plt.show()

print("Estimated number of clusters: %d" % n_clusters)
print("Estimated number of noise points: %d" % n_noise)
# The silhouette score is computed over all points; noise (label 0) counts as its own group.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(projection, labels))

**Step 3:** Plot/visualize the average pattern per cluster

In [None]:
# Tag every clustered day with its label (0 = noise, 1..n_clusters = clusters).
df_REFIT_resampled_CL["Labels"] = labels

# Legend entries in label order.
Cx = ["Noise" if i == 0 else "Cluster " + str(i) for i in range(0, n_clusters + 1)]

# For each label build its average daily profile and publish it as the global
# variable C0, C1, ... (a one-column "Aggregate" frame indexed by "HH:MM:SS").
for i in range(0, n_clusters + 1):
    _profile = (df_REFIT_resampled_CL.loc[df_REFIT_resampled_CL["Labels"] == i]).mean(axis=0)
    _profile = _profile.reset_index()
    _profile = _profile.drop("level_0", axis=1)
    # The last row is the mean of the "Labels" column itself, not a time slot.
    _profile = _profile.drop(_profile.tail(1).index)
    _profile["Time"] = pd.to_datetime(_profile["Time"].astype("str"))
    _profile = _profile.set_index("Time")
    _profile.index = _profile.index.strftime("%H:%M:%S")
    globals()[f"C{i}"] = _profile.rename(columns={0: "Aggregate"})
# Remove the loop temporary so only C0..Cn are left behind in the namespace.
del _profile

# Overlay all profiles, coloured like the clusters in the scatter plot.
fig, ax = plt.subplots()
for i in range(0, n_clusters + 1):
    globals()[f"C{i}"].plot(ax=ax, color=colors[i], linewidth=2)

ax.set_xlabel("Time")
ax.set_ylabel("Aggregate Power Consumption (Kilowatts)")
ax.set_xlim(left=0, right=95)
plt.legend([*Cx], loc="best", fontsize=18)
plt.setp(ax.get_xticklabels(), ha="right", rotation=60)
plt.tight_layout()
plt.show()

In [None]:
# Convert the date index to a DatetimeIndex so calendar accessors are available.
df_REFIT_resampled_CL.index = pd.DatetimeIndex(df_REFIT_resampled_CL.index)
# Both inserts go to position 0, so the frame now starts with
# "Month_name", "Day_name".
df_REFIT_resampled_CL.insert(0, "Day_name", df_REFIT_resampled_CL.index.day_name())
df_REFIT_resampled_CL.insert(0, "Month_name", df_REFIT_resampled_CL.index.month_name())

In [None]:
# Number of days per month, split by label (0 = noise), months in calendar order.
ax = sns.countplot(
    x="Month_name",
    hue="Labels",
    data=df_REFIT_resampled_CL,
    palette=colors,
    order=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
)

plt.setp(ax.get_xticklabels(), ha="right", rotation=60)
plt.xlabel("Month")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Number of days per weekday, split by label (0 = noise), Monday first.
ax = sns.countplot(
    x="Day_name",
    hue="Labels",
    data=df_REFIT_resampled_CL,
    palette=colors,
    order=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
)

plt.setp(ax.get_xticklabels(), ha="right", rotation=60)
plt.xlabel("Day")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Month histogram with a KDE overlay, noise days excluded.
x1 = df_REFIT_resampled_CL.copy()
# Re-index by month name so that .loc[order] below arranges the rows in calendar order.
x1.index = x1.index.month_name()
x1 = x1[~x1["Labels"].isin([0])]

kde_kws = {"bw_adjust": 1.0, "bw_method": "silverman"}
line_kws = {"linewidth": 2, "alpha": 1.0}
# Palette without row 0, the noise colour.
colors2 = np.delete(colors, 0, 0)
order = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# NOTE(review): x1.loc[order] raises KeyError in current pandas when a month
# has no rows left after the noise filter -- confirm all months are present.
ax = sns.histplot(
    data=x1.loc[order],
    x="Month_name",
    hue="Labels",
    alpha=0.75,
    palette=list(colors2),
    multiple="dodge",
    kde=True,
    kde_kws=kde_kws,
    shrink=0.75,
    line_kws=line_kws,
)
ax.set_xlim(-0.75, 11.75)
ax.get_legend().remove()
plt.setp(ax.get_xticklabels(), ha="right", rotation=60)
plt.xlabel("Month")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Weekday histogram with a KDE overlay, noise days excluded.
x1 = df_REFIT_resampled_CL.copy()
# Re-index by weekday name so that .loc[order] below arranges the rows Monday to Sunday.
x1.index = x1.index.day_name()
x1 = x1[~x1["Labels"].isin([0])]

kde_kws = {"bw_adjust": 1.0, "bw_method": "silverman"}
line_kws = {"linewidth": 2, "alpha": 1.0}
# Palette without row 0, the noise colour.
colors2 = np.delete(colors, 0, 0)
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# NOTE(review): x1.loc[order] raises KeyError in current pandas when a weekday
# has no rows left after the noise filter -- confirm all weekdays are present.
ax = sns.histplot(
    data=x1.loc[order],
    x="Day_name",
    hue="Labels",
    alpha=0.75,
    palette=list(colors2),
    multiple="dodge",
    kde=True,
    kde_kws=kde_kws,
    shrink=0.75,
    line_kws=line_kws,
)
ax.set_xlim(-0.75, 6.75)
ax.get_legend().remove()
plt.setp(ax.get_xticklabels(), ha="right", rotation=60)
plt.xlabel("Day")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Part 3: CART <a id="Part3_UCID"> </a>

In [None]:
# Reduce the clustered-days frame to its label column.
CART_CL = df_REFIT_resampled_CL.copy()
# Drop the two leading name columns (Month_name, Day_name) ...
CART_CL.drop(CART_CL.iloc[:, 0:2], axis=1, inplace=True)
# ... then the next 96 columns (presumably the 15-minute "Aggregate" slots).
CART_CL.drop(CART_CL.iloc[:, 0:96], axis=1, inplace=True)
# Flatten the two-level column index to its first level.
CART_CL.columns = CART_CL.columns.droplevel(1)
# Discard noise days (label 0).
CART_CL = CART_CL[~CART_CL["Labels"].isin([0])]
# Palette without row 0, the noise colour.
colors2 = np.delete(colors, 0, 0)

In [None]:
# Daily exogenous features: start from the merged 15-minute frame, remove the
# target and the first eight columns (the temporal features Year, Month,
# Holiday, Day, Hour, Minute, Weekday and Season) ...
df_Merged2 = df_Merged.copy()
df_Merged2.drop("Aggregate", axis=1, inplace=True)
df_Merged2.drop(df_Merged2.iloc[:, 0:8], axis=1, inplace=True)
# ... and summarise each remaining column per calendar day; days with any
# missing statistic are dropped.
df_Merged2 = df_Merged2.resample("1D").agg(["min", "max","std", "mean"]).dropna()

In [None]:
# Attach the cluster label of each day to its daily feature statistics, joining
# on the date. The merge key arrives as "key_0" and becomes the "Date" index.
CART_CL = pd.merge(left=df_Merged2, left_on=df_Merged2.index, right=CART_CL, right_on=CART_CL.index)
CART_CL.rename(columns={"key_0": "Date"}, inplace=True)
CART_CL = CART_CL.set_index("Date")
CART_CL.index = pd.to_datetime(CART_CL.index)

In [None]:
# Flatten the feature column keys by joining their parts with "_" (presumably
# (feature, statistic) tuples becoming "feature_statistic" -- confirm), then
# append "Labels" as the last column.
CART2 = CART_CL.copy()
CART2 = CART2.loc[:, CART2.columns != "Labels"].rename(columns='_'.join)
CART2.insert(len(CART2.columns), "Labels", CART_CL.Labels)
CART_CL = CART2.copy()
del CART2
CART_CL.dropna(inplace=True)

In [None]:
# Append calendar features derived from the date index (after "Labels").
CART_CL.insert(len(CART_CL.columns), "Day", CART_CL.index.day)
CART_CL.insert(len(CART_CL.columns), "Month", CART_CL.index.month)
CART_CL.insert(len(CART_CL.columns), "Year", CART_CL.index.year)
CART_CL.insert(len(CART_CL.columns), "Day Of Week", CART_CL.index.dayofweek)
CART_CL.insert(len(CART_CL.columns), "Day Of Year", CART_CL.index.dayofyear)

# Mark the calendar features and the label as categorical; the categorical
# feature positions are later collected by dtype for SMOTENC.
CART_CL["Day"] = CART_CL["Day"].astype("category")
CART_CL["Month"] = CART_CL["Month"].astype("category")
CART_CL["Year"] = CART_CL["Year"].astype("category")
CART_CL["Day Of Week"] = CART_CL["Day Of Week"].astype("category")
CART_CL["Day Of Year"] = CART_CL["Day Of Year"].astype("category")
CART_CL["Labels"] = CART_CL["Labels"].astype("category")

In [None]:
# Class balance before oversampling: number of days in each cluster.
ax = sns.countplot(x=CART_CL["Labels"], data=CART_CL, palette=colors2)
plt.title("Number of days per cluster")
plt.xlabel("Label")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Positional indices of the categorical columns of CART_CL.
cols = CART_CL.select_dtypes(include=["category"]).columns
cols = [CART_CL.columns.get_loc(col) for col in cols]
# Remove the first entry -- presumably "Labels", which precedes the calendar
# columns -- and shift the rest down by one, since "Labels" is not part of the
# feature matrix X that these indices are applied to.
del cols[0]
cols = [col - 1 for col in cols]

In [None]:
# Oversample with SMOTENC, treating the calendar columns as categorical.
smotenc = SMOTENC(random_state=RANDOM_SEED, categorical_features=[*cols])
X, y = (CART_CL.loc[:, CART_CL.columns != "Labels"], CART_CL["Labels"].to_frame())
X, y = smotenc.fit_resample(X, y)
# NOTE(review): the stratified 75/25 split is taken after oversampling, so the
# test part contains resampled rows too -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y["Labels"], random_state=RANDOM_SEED)

In [None]:
# Class balance after oversampling.
ax = sns.countplot(x=y["Labels"], data=y, palette=colors2)
plt.title("Number of days per cluster")
plt.xlabel("Label")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Random forest with default hyperparameters and a fixed seed (created again
# in the fitting cell further down).
RF = RandomForestClassifier(random_state=RANDOM_SEED)

## 3.1 - Random Forest

In [None]:
# n_estimators = [int(x) for x in np.linspace(200, 1600, num=8)]
# max_features = ['auto', 'log2']
# max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
# max_depth.append(None)
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4, 6, 8, 10]
# bootstrap = [True, False]

# random_grid = {
#     "n_estimators": n_estimators,
#     "max_features": max_features,
#     "max_depth": max_depth,
#     "min_samples_split": min_samples_split,
#     "min_samples_leaf": min_samples_leaf,
#     "bootstrap": bootstrap,
# }

# RF = RandomForestClassifier(random_state=RANDOM_SEED)
# RF_CV = RandomizedSearchCV(RF, random_grid, n_iter=50, verbose=2)
# RF_CV.fit(X, y.values.ravel())

# print(f"Tuned Random Forest Parameters: {RF_CV.best_params_}")
# print(f"Best score is {RF_CV.best_score_}")

In [None]:
# Fit a default random forest on the training split. The commented line would
# apply the parameters found by the (commented-out) randomized search.
RF = RandomForestClassifier(random_state=RANDOM_SEED)
#RF.set_params(**RF_CV.best_params_)
RF.fit(X_train, y_train.values.ravel())
# Cell output: (score on the test split, score on the training split).
RF.score(X_test, y_test), RF.score(X_train, y_train)

In [None]:
# Held-out days: drop the first 96 columns (presumably the 15-minute
# "Aggregate" slots) so that only the date index is carried forward.
CART_TE = df_REFIT_resampled_TE.copy()
CART_TE.drop(CART_TE.iloc[:, 0:96], axis=1, inplace=True)
CART_TE.columns = CART_TE.columns.droplevel(1)
# Palette without row 0, the noise colour.
colors2 = np.delete(colors, 0, 0)
CART_TE.index = pd.DatetimeIndex(CART_TE.index)
# Placeholder label column; the predicted labels are added further down.
CART_TE.insert(len(CART_TE.columns), "Labels", 0)

In [None]:
# Rebuild the daily exogenous feature statistics (same steps as for the
# clustered days): remove the target and the first eight (temporal) columns,
# then aggregate each remaining column per calendar day.
df_Merged2 = df_Merged.copy()
df_Merged2.drop("Aggregate", axis=1, inplace=True)
df_Merged2.drop(df_Merged2.iloc[:, 0:8], axis=1, inplace=True)
df_Merged2 = df_Merged2.resample("1D").agg(["min", "max","std", "mean"]).dropna()

In [None]:
# Keep the daily feature rows whose date is among the held-out days. The merge
# key arrives as "key_0" and becomes the "Date" index.
CART_TE = pd.merge(left=df_Merged2, left_on=df_Merged2.index, right=CART_TE, right_on=CART_TE.index)
CART_TE.rename(columns={"key_0": "Date"}, inplace=True)
CART_TE = CART_TE.set_index("Date")
CART_TE.index = pd.to_datetime(CART_TE.index)

In [None]:
# Flatten the feature column keys by joining their parts with "_". Unlike the
# training version, "Labels" is not re-attached here.
CART2 = CART_TE.copy()
CART2 = CART2.loc[:, CART2.columns != "Labels"].rename(columns='_'.join)
CART_TE = CART2.copy()
del CART2
CART_TE.dropna(inplace=True)

In [None]:
# Append the same calendar features, in the same order, as for the training frame.
CART_TE.insert(len(CART_TE.columns), "Day", CART_TE.index.day)
CART_TE.insert(len(CART_TE.columns), "Month", CART_TE.index.month)
CART_TE.insert(len(CART_TE.columns), "Year", CART_TE.index.year)
CART_TE.insert(len(CART_TE.columns), "Day Of Week", CART_TE.index.dayofweek)
CART_TE.insert(len(CART_TE.columns), "Day Of Year", CART_TE.index.dayofyear)

# Store the calendar features as categorical columns.
CART_TE["Day"] = CART_TE["Day"].astype("category")
CART_TE["Month"] = CART_TE["Month"].astype("category")
CART_TE["Year"] = CART_TE["Year"].astype("category")
CART_TE["Day Of Week"] = CART_TE["Day Of Week"].astype("category")
CART_TE["Day Of Year"] = CART_TE["Day Of Year"].astype("category")

In [None]:
# Predict a cluster label for every held-out day and store it as the last column.
Labels_test = RF.predict(CART_TE)
CART_TE.insert(len(CART_TE.columns), "Labels", Labels_test)

# Part 4: Forecasting <a id="Part3_REFIT"> </a>

## 4.1: Trend

In [None]:
# Build the 15-minute training frame for the days in cluster 1.
# Start from the labels kept in CART_CL.
C1_Merge_train = CART_CL.loc[:, "Labels"].to_frame()
df_REFIT_resampled_train = df_REFIT_resampled_CL.copy()
# Replace the label column with the CART_CL labels. Days missing from CART_CL
# presumably become NaN through index alignment and are dropped on the next line.
df_REFIT_resampled_train.Labels = C1_Merge_train.Labels
df_REFIT_resampled_train.dropna(inplace=True)
# Keep only the cluster-1 days.
C1_Merge_train = df_REFIT_resampled_train.loc[df_REFIT_resampled_train["Labels"] == 1].copy()
# Back to long form, then glue the date and time strings into one timestamp string.
C1_Merge_train = C1_Merge_train.stack().reset_index()
C1_Merge_train["Date"] = C1_Merge_train["Date"].astype("str")
C1_Merge_train["Time"] = C1_Merge_train["Time"].astype("str")
C1_Merge_train["DT"] = C1_Merge_train["Date"].str.cat(C1_Merge_train["Time"], sep=" ")

# Helper columns that are no longer needed once "DT" exists.
cols_NA = [
    "Date",
    "Time",
    "Labels",
    "Day_name",
    "Month_name"
]

C1_Merge_train.drop(cols_NA, axis=1, inplace=True)
C1_Merge_train["DT"] = pd.to_datetime(C1_Merge_train["DT"])
C1_Merge_train = C1_Merge_train.set_index("DT")
C1_Merge_train.dropna(inplace=True)
# Only the timestamps are used from here: pull the matching rows (all merged
# columns) out of df_Merged and drop timestamps that have no complete row there.
C1_Merge_train = df_Merged.reindex(C1_Merge_train.index)
C1_Merge_train = C1_Merge_train.dropna()
# Convert the categorical holiday flag and the string season code to int32.
C1_Merge_train["Holiday"] = C1_Merge_train["Holiday"].astype('int32')
C1_Merge_train["Season"] = C1_Merge_train["Season"].astype('int32')

In [None]:
# Build the 15-minute test frame for the held-out days predicted as cluster 1.
C1_Merge_test = CART_TE.loc[:, "Labels"].to_frame()
df_REFIT_resampled_test = df_REFIT_resampled_TE.copy()
# Attach the predicted labels. Days without a prediction presumably become NaN
# through index alignment and are dropped on the next line.
df_REFIT_resampled_test.insert(len(df_REFIT_resampled_test.columns), "Labels", C1_Merge_test.Labels)
df_REFIT_resampled_test.dropna(inplace=True)
# Keep only the cluster-1 days.
C1_Merge_test = df_REFIT_resampled_test.loc[df_REFIT_resampled_test["Labels"] == 1].copy()
# Back to long form, then glue the date and time strings into one timestamp string.
C1_Merge_test = C1_Merge_test.stack().reset_index()
C1_Merge_test["Date"] = C1_Merge_test["Date"].astype("str")
C1_Merge_test["Time"] = C1_Merge_test["Time"].astype("str")
C1_Merge_test["DT"] = C1_Merge_test["Date"].str.cat(C1_Merge_test["Time"], sep=" ")

# Helper columns that are no longer needed once "DT" exists.
cols_NA = [
    "Date",
    "Time",
    "Labels",
]

C1_Merge_test.drop(cols_NA, axis=1, inplace=True)
C1_Merge_test["DT"] = pd.to_datetime(C1_Merge_test["DT"])
C1_Merge_test = C1_Merge_test.set_index("DT")
C1_Merge_test.dropna(inplace=True)
# Only the timestamps are used from here: pull the matching rows (all merged
# columns) out of df_Merged and drop timestamps that have no complete row there.
C1_Merge_test = df_Merged.reindex(C1_Merge_test.index)
C1_Merge_test = C1_Merge_test.dropna()
# Convert the categorical holiday flag and the string season code to int32.
C1_Merge_test["Holiday"] = C1_Merge_test["Holiday"].astype('int32')
C1_Merge_test["Season"] = C1_Merge_test["Season"].astype('int32')

In [None]:
# Period lengths in seconds.
day = 24 * 60 * 60
year = (365.2425) * day
# Seconds since the epoch for every row. pandas' Timestamp.timestamp treats the
# timezone-naive index values as UTC, so the result is the same on every
# machine. datetime.datetime.timestamp would interpret naive values in the
# host's local timezone, making the features depend on where the notebook runs.
timestamp_train = C1_Merge_train.index.map(pd.Timestamp.timestamp)
timestamp_test = C1_Merge_test.index.map(pd.Timestamp.timestamp)

# Encode time of day and time of year as sine/cosine pairs (training rows).
day_sin_tr = np.sin(timestamp_train * (2 * np.pi / day))
day_cos_tr = np.cos(timestamp_train * (2 * np.pi / day))
year_sin_tr = np.sin(timestamp_train * (2 * np.pi / year))
year_cos_tr = np.cos(timestamp_train * (2 * np.pi / year))

# Same encoding for the test rows.
day_sin_te = np.sin(timestamp_test * (2 * np.pi / day))
day_cos_te = np.cos(timestamp_test * (2 * np.pi / day))
year_sin_te = np.sin(timestamp_test * (2 * np.pi / year))
year_cos_te = np.cos(timestamp_test * (2 * np.pi / year))

In [None]:
# Replace the discrete calendar columns with the cyclical encodings.
cols_NA = ["Year", "Month", "Holiday", "Day", "Hour", "Minute", "Weekday", "Season"]

C1_Merge_train.drop(cols_NA, axis=1, inplace=True)
C1_Merge_test.drop(cols_NA, axis=1, inplace=True)

# Every insert goes to position 0, so the final leading order is
# Year_Cos, Year_Sin, Day_Cos, Day_Sin.
C1_Merge_train.insert(0, "Day_Sin", day_sin_tr)
C1_Merge_train.insert(0, "Day_Cos", day_cos_tr)
C1_Merge_train.insert(0, "Year_Sin", year_sin_tr)
C1_Merge_train.insert(0, "Year_Cos", year_cos_tr)

C1_Merge_test.insert(0, "Day_Sin", day_sin_te)
C1_Merge_test.insert(0, "Day_Cos", day_cos_te)
C1_Merge_test.insert(0, "Year_Sin", year_sin_te)
C1_Merge_test.insert(0, "Year_Cos", year_cos_te)

In [None]:
# Seasonal period for STL: the number of 15-minute samples per day (96).
freq = (24 * 60) // 15
# Decompose the aggregate load and append its trend component as the last
# column; "Trend" is the forecasting target of this section.
# NOTE(review): the frames hold cluster-1 days only, so neighbouring rows are
# not necessarily contiguous in time, and the test trend is derived from the
# test data itself -- confirm both are acceptable.
stl_decompose_result = STL(C1_Merge_train.Aggregate, period=freq).fit()
C1_Merge_train.insert(len(C1_Merge_train.columns), "Trend", stl_decompose_result.trend.values)

stl_decompose_result = STL(C1_Merge_test.Aggregate, period=freq).fit()
C1_Merge_test.insert(len(C1_Merge_test.columns), "Trend", stl_decompose_result.trend.values)

In [None]:
# Separate MaxAbs scalers are fitted on the training and on the test frame.
# NOTE(review): the test frame is scaled with its own statistics rather than
# with the training scaler -- confirm this is intended.
scaler_tr = MaxAbsScaler()
scaler_te = MaxAbsScaler()

C1_Merge_train[C1_Merge_train.columns] = scaler_tr.fit_transform(C1_Merge_train[C1_Merge_train.columns])
# Replace the fitted per-column arrays with the single value of the last column
# ("Trend"), so that inverse_transform can later be applied to a lone
# Trend/prediction column.
scaler_tr.max_abs_, scaler_tr.scale_ = scaler_tr.max_abs_[len(C1_Merge_train.columns) - 1], scaler_tr.scale_[len(C1_Merge_train.columns) - 1]

C1_Merge_test[C1_Merge_test.columns] = scaler_te.fit_transform(C1_Merge_test[C1_Merge_test.columns])
scaler_te.max_abs_, scaler_te.scale_ = scaler_te.max_abs_[len(C1_Merge_test.columns) - 1], scaler_te.scale_[len(C1_Merge_test.columns) - 1]

In [None]:
# Model input dimensions: every column is a feature; each sample is a window
# of 24 consecutive rows.
n_features = C1_Merge_train.shape[1]
time_steps = 24
Split = len(C1_Merge_train)

# First 80% of the training rows for fitting, last 20% for validation (no shuffling).
s_Train = C1_Merge_train.iloc[0 : int(Split * 0.8)]
s_Val = C1_Merge_train.iloc[int(Split * 0.8) :]

In [None]:
# Build sliding-window samples; the target is the "Trend" value right after
# each window. This rebinds X_train / X_test, which held the random-forest
# feature matrices before.
X_train, Y_train = to_Supervised(s_Train, s_Train.Trend, time_steps)
X_val, Y_val = to_Supervised(s_Val, s_Val.Trend, time_steps)
X_test, Y_test = to_Supervised(C1_Merge_test, C1_Merge_test.Trend, time_steps)

In [None]:
# CNN-LSTM regressor: three Conv1D + MaxPooling1D stages, two stacked LSTM
# layers, then two Dense layers ending in a single output value.
# Each MaxPooling1D(2) halves the time axis.
model = Sequential(
    [
        Conv1D(filters=64, kernel_size=3, activation=LeakyReLU(), padding="same", input_shape=(time_steps, n_features)),
        MaxPooling1D(2),
        Conv1D(filters=32, kernel_size=2, activation=LeakyReLU(), padding="same"),
        MaxPooling1D(2),
        Conv1D(filters=16, kernel_size=2, activation=LeakyReLU(), padding="same"),
        MaxPooling1D(2),
        LSTM(256, activation=LeakyReLU(), return_sequences=True),
        LSTM(128, activation=LeakyReLU()),
        Dense(64),
        Dense(1),
    ]
)

model.summary()
# Train with the NAdam optimizer and the log-cosh loss.
model.compile(optimizer="NAdam", loss="logcosh")

In [None]:
# Multiply the learning rate by 0.2 when val_loss has not improved for 3
# epochs, with a 4-epoch cooldown between reductions.
reduce_learning = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=3, verbose=1, mode="auto", min_delta=0.000025, cooldown=4, min_lr=0)
# Stop training when val_loss has not improved for 7 epochs.
# (The variable name contains a typo, "eary"; it is left as is because it is a
# notebook-level name.)
eary_stopping = EarlyStopping(monitor="val_loss", min_delta=0, patience=7, verbose=1, mode="auto")

callbacks = [reduce_learning, eary_stopping]

In [None]:
# Train for at most 50 epochs; shuffle=False keeps the windows in their
# original order. The callbacks may lower the learning rate or stop early.
history = model.fit(
    X_train,
    Y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, Y_val),
    verbose=1,
    shuffle=False,
    callbacks=callbacks,
)

In [None]:
# Plot training vs. validation loss per epoch.
loss = history.history["loss"]
val_loss = history.history["val_loss"]
n_epochs = len(loss)
epochs_graph = np.arange(1, n_epochs + 1)

# Ticks every 5 epochs, plus explicit ticks at the first and the last epoch.
x_ticks = np.append(np.insert(np.arange(0, n_epochs + 1, 5), 1, 1), n_epochs)

plt.plot(epochs_graph, loss, "red", label="Training loss")
plt.plot(epochs_graph, val_loss, "blue", label="Validation loss")
plt.xticks(x_ticks)
plt.xlim(1, n_epochs)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Write the fitted single-step trend model to disk.
# NOTE(review): presumably the "Models" directory must already exist - confirm.
model.save("Models/REFIT-Trend.h5")

In [None]:
# model = load_model("Models/REFIT-Trend.h5", custom_objects={"LeakyReLU": tensorflow.keras.layers.LeakyReLU})
# model.summary()

In [None]:
# Predict on the test windows and map the outputs back to the original scale.
Predictions = model.predict(X_test)
Predictions = scaler_te.inverse_transform(Predictions.reshape(-1, 1))

# Ground truth aligned with the predictions: the first `time_steps` rows are
# skipped. Copy AFTER slicing so the assignments below act on an independent
# frame rather than on a slice of another DataFrame (avoids
# SettingWithCopyWarning).
s_Pred = C1_Merge_test["Trend"].to_frame().iloc[time_steps:].copy()
s_Pred[s_Pred.columns] = scaler_te.inverse_transform(s_Pred[s_Pred.columns])
s_Pred.insert(len(s_Pred.columns), "Predictions", Predictions)

In [None]:
# Score the single-step trend forecast. Series.ravel() is deprecated in recent
# pandas releases; .to_numpy() yields the same 1-D array.
# NOTE(review): the arguments are passed as (predicted, actual) here, whereas
# the raw-data evaluation passes (actual, predicted). forecast_accuracy is
# defined outside this chunk - confirm which order it expects.
forecast_accuracy(s_Pred.Predictions.to_numpy(), s_Pred.Trend.to_numpy())

## 4.2: Raw data

In [None]:
# Build the training frame: the rows of df_Merged whose timestamps belong to
# the rows of df_REFIT_resampled_CL that carry label 1 in CART_CL.
C1_Merge_train = CART_CL.loc[:, "Labels"].to_frame()
df_REFIT_resampled_train = df_REFIT_resampled_CL.copy()
# Item assignment creates the column when it is missing and overwrites it when
# it exists; attribute assignment (`df.Labels = ...`) would only set a plain
# Python attribute if no "Labels" column were present.
df_REFIT_resampled_train["Labels"] = C1_Merge_train["Labels"]
df_REFIT_resampled_train.dropna(inplace=True)
C1_Merge_train = df_REFIT_resampled_train.loc[df_REFIT_resampled_train["Labels"] == 1].copy()

# Reshape to long format and build one "Date Time" string per row.
C1_Merge_train = C1_Merge_train.stack().reset_index()
C1_Merge_train["Date"] = C1_Merge_train["Date"].astype("str")
C1_Merge_train["Time"] = C1_Merge_train["Time"].astype("str")
C1_Merge_train["DT"] = C1_Merge_train["Date"].str.cat(C1_Merge_train["Time"], sep=" ")

cols_NA = [
    "Date",
    "Time",
    "Labels",
    "Day_name",
    "Month_name",
]

C1_Merge_train.drop(cols_NA, axis=1, inplace=True)
C1_Merge_train["DT"] = pd.to_datetime(C1_Merge_train["DT"])
C1_Merge_train = C1_Merge_train.set_index("DT")
C1_Merge_train.dropna(inplace=True)

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_train = df_Merged.reindex(C1_Merge_train.index)
C1_Merge_train = C1_Merge_train.dropna()
C1_Merge_train["Holiday"] = C1_Merge_train["Holiday"].astype('int32')
C1_Merge_train["Season"] = C1_Merge_train["Season"].astype('int32')

In [None]:
# Build the test frame: the rows of df_Merged whose timestamps belong to the
# rows of df_REFIT_resampled_TE that carry label 1 in CART_TE.
df_REFIT_resampled_test = df_REFIT_resampled_TE.copy()
df_REFIT_resampled_test.insert(df_REFIT_resampled_test.shape[1], "Labels", CART_TE.loc[:, "Labels"])
df_REFIT_resampled_test = df_REFIT_resampled_test.dropna()

# Keep label-1 rows, reshape to long format, build one "Date Time" string per row.
label_is_one = df_REFIT_resampled_test["Labels"] == 1
C1_Merge_test = df_REFIT_resampled_test.loc[label_is_one].copy().stack().reset_index()
C1_Merge_test["Date"] = C1_Merge_test["Date"].astype("str")
C1_Merge_test["Time"] = C1_Merge_test["Time"].astype("str")
C1_Merge_test["DT"] = C1_Merge_test["Date"].str.cat(C1_Merge_test["Time"], sep=" ")

cols_NA = ["Date", "Time", "Labels"]

C1_Merge_test = C1_Merge_test.drop(columns=cols_NA)
C1_Merge_test["DT"] = pd.to_datetime(C1_Merge_test["DT"])
C1_Merge_test = C1_Merge_test.set_index("DT").dropna()

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_test = df_Merged.reindex(C1_Merge_test.index).dropna()
C1_Merge_test = C1_Merge_test.astype({"Holiday": 'int32', "Season": 'int32'})

In [None]:
# Cyclical encodings of time of day and time of year.
day = 24 * 60 * 60
year = (365.2425) * day
day_omega = 2 * np.pi / day
year_omega = 2 * np.pi / year

# NOTE(review): datetime.datetime.timestamp interprets naive datetimes as local
# time, so these values may depend on the machine's timezone if the index is
# tz-naive - confirm.
timestamp_train = C1_Merge_train.index.map(datetime.datetime.timestamp)
timestamp_test = C1_Merge_test.index.map(datetime.datetime.timestamp)

day_sin_tr, day_cos_tr = np.sin(timestamp_train * day_omega), np.cos(timestamp_train * day_omega)
year_sin_tr, year_cos_tr = np.sin(timestamp_train * year_omega), np.cos(timestamp_train * year_omega)

day_sin_te, day_cos_te = np.sin(timestamp_test * day_omega), np.cos(timestamp_test * day_omega)
year_sin_te, year_cos_te = np.sin(timestamp_test * year_omega), np.cos(timestamp_test * year_omega)

In [None]:
# Replace the calendar columns with the sine/cosine encodings computed above.
cols_NA = ["Year", "Month", "Holiday", "Day", "Hour", "Minute", "Weekday", "Season"]

cyclical_features = (
    (C1_Merge_train, (("Day_Sin", day_sin_tr), ("Day_Cos", day_cos_tr), ("Year_Sin", year_sin_tr), ("Year_Cos", year_cos_tr))),
    (C1_Merge_test, (("Day_Sin", day_sin_te), ("Day_Cos", day_cos_te), ("Year_Sin", year_sin_te), ("Year_Cos", year_cos_te))),
)

for frame, encodings in cyclical_features:
    frame.drop(cols_NA, axis=1, inplace=True)
    # Each insert goes to position 0, so the final column order starts with
    # Year_Cos, Year_Sin, Day_Cos, Day_Sin.
    for col_name, col_values in encodings:
        frame.insert(0, col_name, col_values)

In [None]:
# Append the STL trend component of "Aggregate" as a new last column "Trend".
# The period is the number of 15-minute slots in a day.
freq = (24 * 60) // 15

for frame in (C1_Merge_train, C1_Merge_test):
    stl_decompose_result = STL(frame.Aggregate, period=freq).fit()
    frame.insert(len(frame.columns), "Trend", stl_decompose_result.trend.values)

In [None]:
# Keep an unsmoothed copy of "Aggregate", then smooth the working column with a
# Savitzky-Golay filter (window of 5 samples, cubic polynomial).
C1_GAP_train = C1_Merge_train[["Aggregate"]].copy()
C1_Merge_train["Aggregate"] = savgol_filter(C1_Merge_train["Aggregate"], window_length=5, polyorder=3)

C1_GAP_test = C1_Merge_test[["Aggregate"]].copy()
C1_Merge_test["Aggregate"] = savgol_filter(C1_Merge_test["Aggregate"], window_length=5, polyorder=3)

In [None]:
# Scale every column of each split with its own MinMaxScaler.
# NOTE(review): the test frame is fitted on its own statistics instead of
# reusing the training scaler - confirm this is intended.
scaler_tr = MinMaxScaler()
scaler_te = MinMaxScaler()

for frame, scaler in ((C1_Merge_train, scaler_tr), (C1_Merge_test, scaler_te)):
    frame[frame.columns] = scaler.fit_transform(frame[frame.columns])
    # Keep only the statistics of the second-to-last column (presumably
    # "Aggregate", the column that is inverse-transformed later) so
    # inverse_transform can be applied to a single-column array.
    target_col = len(frame.columns) - 2
    scaler.min_, scaler.scale_ = scaler.min_[target_col], scaler.scale_[target_col]

In [None]:
# Window length and chronological 80/20 train/validation split (no shuffling).
time_steps = 24
Split, n_features = C1_Merge_train.shape

split_at = int(Split * 0.8)
s_Train = C1_Merge_train.iloc[:split_at]
s_Val = C1_Merge_train.iloc[split_at:]
# Unsmoothed test "Aggregate" values without the first `time_steps` rows.
o_Test = C1_GAP_test.iloc[time_steps:]

In [None]:
# Turn each split into supervised (X, Y) pairs with "Aggregate" as the target.
# to_Supervised is defined outside this chunk; presumably it builds sliding
# windows of length time_steps.
X_train, Y_train = to_Supervised(s_Train, s_Train["Aggregate"], time_steps)
X_val, Y_val = to_Supervised(s_Val, s_Val["Aggregate"], time_steps)
X_test, Y_test = to_Supervised(C1_Merge_test, C1_Merge_test["Aggregate"], time_steps)

In [None]:
# CNN-LSTM regressor: three Conv1D/MaxPooling1D stages, two stacked LSTM
# layers and a dense head with a single output.
model = Sequential()

# Convolutional feature extractor; the first layer fixes the input shape.
model.add(Conv1D(filters=64, kernel_size=3, activation=LeakyReLU(), padding="same", input_shape=(time_steps, n_features)))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=32, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=16, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))

# Recurrent part: the first LSTM returns the full sequence for the second one.
model.add(LSTM(256, activation=LeakyReLU(), return_sequences=True))
model.add(LSTM(128, activation=LeakyReLU()))

# Regression head.
model.add(Dense(64))
model.add(Dense(1))

model.summary()
model.compile(optimizer="NAdam", loss="logcosh")

In [None]:
# Multiply the learning rate by 0.2 when val_loss has not improved for 3 epochs.
reduce_learning = ReduceLROnPlateau(
    monitor="val_loss",
    mode="auto",
    factor=0.2,
    patience=3,
    cooldown=4,
    min_delta=0.000025,
    min_lr=0,
    verbose=1,
)
# Stop training when val_loss has not improved for 7 epochs.
eary_stopping = EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=7,
    min_delta=0,
    verbose=1,
)

callbacks = [reduce_learning, eary_stopping]

In [None]:
# Train for at most 50 epochs; shuffle=False keeps the samples in their
# original (time) order within each epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=50,
    batch_size=32,
    shuffle=False,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Plot training vs. validation loss per epoch.
loss = history.history["loss"]
val_loss = history.history["val_loss"]
n_epochs = len(loss)
epochs_graph = np.arange(1, n_epochs + 1)

# Ticks every 5 epochs, plus explicit ticks at the first and the last epoch.
x_ticks = np.append(np.insert(np.arange(0, n_epochs + 1, 5), 1, 1), n_epochs)

plt.plot(epochs_graph, loss, "red", label="Training loss")
plt.plot(epochs_graph, val_loss, "blue", label="Validation loss")
plt.xticks(x_ticks)
plt.xlim(1, n_epochs)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Write the fitted single-step raw-data model to disk.
# NOTE(review): presumably the "Models" directory must already exist - confirm.
model.save("Models/REFIT-Raw.h5")

In [None]:
# model = load_model("Models/REFIT-Raw.h5", custom_objects={"LeakyReLU": tensorflow.keras.layers.LeakyReLU})
# model.summary()

In [None]:
# Predict on the test windows and map the outputs back to the original scale.
Predictions = model.predict(X_test)
Predictions = scaler_te.inverse_transform(Predictions.reshape(-1, 1))

# Ground truth aligned with the predictions: the first `time_steps` rows are
# skipped. Copy AFTER slicing so the assignments below act on an independent
# frame rather than on a slice of another DataFrame (avoids
# SettingWithCopyWarning).
s_Pred = C1_Merge_test["Aggregate"].to_frame().iloc[time_steps:].copy()
s_Pred[s_Pred.columns] = scaler_te.inverse_transform(s_Pred[s_Pred.columns])
s_Pred.insert(len(s_Pred.columns), "Predictions", Predictions)

In [None]:
# Score the single-step raw-data forecast. Series.ravel() is deprecated in
# recent pandas releases; .to_numpy() yields the same 1-D array.
# NOTE(review): the arguments are passed as (actual, predicted) here, whereas
# the trend evaluation passes (predicted, actual). forecast_accuracy is
# defined outside this chunk - confirm which order it expects.
forecast_accuracy(s_Pred.Aggregate.to_numpy(), s_Pred.Predictions.to_numpy())

## 4.3: 12 steps ahead - Trend

In [None]:
# Build the training frame: the rows of df_Merged whose timestamps belong to
# the rows of df_REFIT_resampled_CL that carry label 1 in CART_CL.
C1_Merge_train = CART_CL.loc[:, "Labels"].to_frame()
df_REFIT_resampled_train = df_REFIT_resampled_CL.copy()
# Item assignment creates the column when it is missing and overwrites it when
# it exists; attribute assignment (`df.Labels = ...`) would only set a plain
# Python attribute if no "Labels" column were present.
df_REFIT_resampled_train["Labels"] = C1_Merge_train["Labels"]
df_REFIT_resampled_train.dropna(inplace=True)
C1_Merge_train = df_REFIT_resampled_train.loc[df_REFIT_resampled_train["Labels"] == 1].copy()

# Reshape to long format and build one "Date Time" string per row.
C1_Merge_train = C1_Merge_train.stack().reset_index()
C1_Merge_train["Date"] = C1_Merge_train["Date"].astype("str")
C1_Merge_train["Time"] = C1_Merge_train["Time"].astype("str")
C1_Merge_train["DT"] = C1_Merge_train["Date"].str.cat(C1_Merge_train["Time"], sep=" ")

cols_NA = [
    "Date",
    "Time",
    "Labels",
    "Day_name",
    "Month_name",
]

C1_Merge_train.drop(cols_NA, axis=1, inplace=True)
C1_Merge_train["DT"] = pd.to_datetime(C1_Merge_train["DT"])
C1_Merge_train = C1_Merge_train.set_index("DT")
C1_Merge_train.dropna(inplace=True)

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_train = df_Merged.reindex(C1_Merge_train.index)
C1_Merge_train = C1_Merge_train.dropna()
C1_Merge_train["Holiday"] = C1_Merge_train["Holiday"].astype('int32')
C1_Merge_train["Season"] = C1_Merge_train["Season"].astype('int32')

In [None]:
# Build the test frame: the rows of df_Merged whose timestamps belong to the
# rows of df_REFIT_resampled_TE that carry label 1 in CART_TE.
df_REFIT_resampled_test = df_REFIT_resampled_TE.copy()
df_REFIT_resampled_test.insert(df_REFIT_resampled_test.shape[1], "Labels", CART_TE.loc[:, "Labels"])
df_REFIT_resampled_test = df_REFIT_resampled_test.dropna()

# Keep label-1 rows, reshape to long format, build one "Date Time" string per row.
label_is_one = df_REFIT_resampled_test["Labels"] == 1
C1_Merge_test = df_REFIT_resampled_test.loc[label_is_one].copy().stack().reset_index()
C1_Merge_test["Date"] = C1_Merge_test["Date"].astype("str")
C1_Merge_test["Time"] = C1_Merge_test["Time"].astype("str")
C1_Merge_test["DT"] = C1_Merge_test["Date"].str.cat(C1_Merge_test["Time"], sep=" ")

cols_NA = ["Date", "Time", "Labels"]

C1_Merge_test = C1_Merge_test.drop(columns=cols_NA)
C1_Merge_test["DT"] = pd.to_datetime(C1_Merge_test["DT"])
C1_Merge_test = C1_Merge_test.set_index("DT").dropna()

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_test = df_Merged.reindex(C1_Merge_test.index).dropna()
C1_Merge_test = C1_Merge_test.astype({"Holiday": 'int32', "Season": 'int32'})

In [None]:
# Cyclical encodings of time of day and time of year.
day = 24 * 60 * 60
year = (365.2425) * day
day_omega = 2 * np.pi / day
year_omega = 2 * np.pi / year

# NOTE(review): datetime.datetime.timestamp interprets naive datetimes as local
# time, so these values may depend on the machine's timezone if the index is
# tz-naive - confirm.
timestamp_train = C1_Merge_train.index.map(datetime.datetime.timestamp)
timestamp_test = C1_Merge_test.index.map(datetime.datetime.timestamp)

day_sin_tr, day_cos_tr = np.sin(timestamp_train * day_omega), np.cos(timestamp_train * day_omega)
year_sin_tr, year_cos_tr = np.sin(timestamp_train * year_omega), np.cos(timestamp_train * year_omega)

day_sin_te, day_cos_te = np.sin(timestamp_test * day_omega), np.cos(timestamp_test * day_omega)
year_sin_te, year_cos_te = np.sin(timestamp_test * year_omega), np.cos(timestamp_test * year_omega)

In [None]:
# Replace the calendar columns with the sine/cosine encodings computed above.
cols_NA = ["Year", "Month", "Holiday", "Day", "Hour", "Minute", "Weekday", "Season"]

cyclical_features = (
    (C1_Merge_train, (("Day_Sin", day_sin_tr), ("Day_Cos", day_cos_tr), ("Year_Sin", year_sin_tr), ("Year_Cos", year_cos_tr))),
    (C1_Merge_test, (("Day_Sin", day_sin_te), ("Day_Cos", day_cos_te), ("Year_Sin", year_sin_te), ("Year_Cos", year_cos_te))),
)

for frame, encodings in cyclical_features:
    frame.drop(cols_NA, axis=1, inplace=True)
    # Each insert goes to position 0, so the final column order starts with
    # Year_Cos, Year_Sin, Day_Cos, Day_Sin.
    for col_name, col_values in encodings:
        frame.insert(0, col_name, col_values)

In [None]:
# Append the STL trend component of "Aggregate" as a new last column "Trend".
# The period is the number of 15-minute slots in a day.
freq = (24 * 60) // 15

for frame in (C1_Merge_train, C1_Merge_test):
    stl_decompose_result = STL(frame.Aggregate, period=freq).fit()
    frame.insert(len(frame.columns), "Trend", stl_decompose_result.trend.values)

In [None]:
# Keep an unsmoothed copy of "Aggregate", then smooth the working column with a
# Savitzky-Golay filter (window of 5 samples, cubic polynomial).
C1_GAP_train = C1_Merge_train[["Aggregate"]].copy()
C1_Merge_train["Aggregate"] = savgol_filter(C1_Merge_train["Aggregate"], window_length=5, polyorder=3)

C1_GAP_test = C1_Merge_test[["Aggregate"]].copy()
C1_Merge_test["Aggregate"] = savgol_filter(C1_Merge_test["Aggregate"], window_length=5, polyorder=3)

In [None]:
# Scale every column of each split with its own MaxAbsScaler.
# NOTE(review): the test frame is fitted on its own statistics instead of
# reusing the training scaler - confirm this is intended.
scaler_tr = MaxAbsScaler()
scaler_te = MaxAbsScaler()

for frame, scaler in ((C1_Merge_train, scaler_tr), (C1_Merge_test, scaler_te)):
    frame[frame.columns] = scaler.fit_transform(frame[frame.columns])
    # Keep only the statistics of the last column ("Trend" is inserted last in
    # this section) so inverse_transform can be applied to a single-column array.
    last_col = len(frame.columns) - 1
    scaler.max_abs_, scaler.scale_ = scaler.max_abs_[last_col], scaler.scale_[last_col]

In [None]:
# Window length and chronological 80/20 train/validation split (no shuffling).
time_steps = 24
Split, n_features = C1_Merge_train.shape

split_at = int(Split * 0.8)
s_Train = C1_Merge_train.iloc[:split_at]
s_Val = C1_Merge_train.iloc[split_at:]
# Unsmoothed test "Aggregate" values without the first `time_steps` rows.
o_Test = C1_GAP_test.iloc[time_steps:]

In [None]:
# Build multi-step samples for each split with the last column as the target
# ("Trend" is inserted last in this section). to_Supervised_ms is defined
# outside this chunk; presumably it maps `lag` past rows to `n_ahead` targets.
# The look-back is taken from time_steps instead of a second hard-coded 24 so
# it cannot drift from the model's input_shape=(time_steps, n_features).
X_train, Y_train = to_Supervised_ms(s_Train.values, lag=time_steps, n_ahead=12, target_index=len(s_Train.columns) - 1)
X_val, Y_val = to_Supervised_ms(s_Val.values, lag=time_steps, n_ahead=12, target_index=len(s_Val.columns) - 1)
X_test, Y_test = to_Supervised_ms(C1_Merge_test.values, lag=time_steps, n_ahead=12, target_index=len(C1_Merge_test.columns) - 1)

In [None]:
# CNN-LSTM regressor: three Conv1D/MaxPooling1D stages, two stacked LSTM
# layers and a dense head with 12 outputs (one per forecast step).
model = Sequential()

# Convolutional feature extractor; the first layer fixes the input shape.
model.add(Conv1D(filters=64, kernel_size=3, activation=LeakyReLU(), padding="same", input_shape=(time_steps, n_features)))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=32, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=16, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))

# Recurrent part: the first LSTM returns the full sequence for the second one.
model.add(LSTM(256, activation=LeakyReLU(), return_sequences=True))
model.add(LSTM(128, activation=LeakyReLU()))

# Regression head.
model.add(Dense(64))
model.add(Dense(12))

model.summary()
model.compile(optimizer="NAdam", loss="logcosh")

In [None]:
# Multiply the learning rate by 0.2 when val_loss has not improved for 3 epochs.
reduce_learning = ReduceLROnPlateau(
    monitor="val_loss",
    mode="auto",
    factor=0.2,
    patience=3,
    cooldown=4,
    min_delta=0.000025,
    min_lr=0,
    verbose=1,
)
# Stop training when val_loss has not improved for 7 epochs.
eary_stopping = EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=7,
    min_delta=0,
    verbose=1,
)

callbacks = [reduce_learning, eary_stopping]

In [None]:
# Train for at most 50 epochs; shuffle=False keeps the samples in their
# original (time) order within each epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=50,
    batch_size=32,
    shuffle=False,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Plot training vs. validation loss per epoch.
loss = history.history["loss"]
val_loss = history.history["val_loss"]
n_epochs = len(loss)
epochs_graph = np.arange(1, n_epochs + 1)

# Ticks every 5 epochs, plus explicit ticks at the first and the last epoch.
x_ticks = np.append(np.insert(np.arange(0, n_epochs + 1, 5), 1, 1), n_epochs)

plt.plot(epochs_graph, loss, "red", label="Training loss")
plt.plot(epochs_graph, val_loss, "blue", label="Validation loss")
plt.xticks(x_ticks)
plt.xlim(1, n_epochs)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
model.save("Models/UCID-Raw-12-Steps-Trend.h5")

In [None]:
# model = load_model("Models/UCID-Raw-12-Steps.h5", custom_objects={"LeakyReLU": tensorflow.keras.layers.LeakyReLU})
# model.summary()

In [None]:
# Predict on the test windows, then flatten predictions and targets to a single
# column and map both back to the original scale.
# NOTE: Y_test is overwritten in place, so re-running this cell applies the
# inverse transform to it a second time.
Predictions = scaler_te.inverse_transform(model.predict(X_test).reshape(-1, 1))
Y_test = scaler_te.inverse_transform(Y_test.reshape(-1, 1))

In [None]:
# Score the inverse-scaled 12-step trend forecasts against the inverse-scaled targets.
# NOTE(review): the arguments are passed as (Predictions, Y_test) here, whereas
# the 12-step raw-data evaluation passes (Y_test, Predictions).
# forecast_accuracy is defined outside this chunk - confirm which order it expects.
forecast_accuracy(Predictions, Y_test)

## 4.4: 12 steps ahead - Raw data

In [None]:
# Build the training frame: the rows of df_Merged whose timestamps belong to
# the rows of df_REFIT_resampled_CL that carry label 1 in CART_CL.
C1_Merge_train = CART_CL.loc[:, "Labels"].to_frame()
df_REFIT_resampled_train = df_REFIT_resampled_CL.copy()
# Item assignment creates the column when it is missing and overwrites it when
# it exists; attribute assignment (`df.Labels = ...`) would only set a plain
# Python attribute if no "Labels" column were present.
df_REFIT_resampled_train["Labels"] = C1_Merge_train["Labels"]
df_REFIT_resampled_train.dropna(inplace=True)
C1_Merge_train = df_REFIT_resampled_train.loc[df_REFIT_resampled_train["Labels"] == 1].copy()

# Reshape to long format and build one "Date Time" string per row.
C1_Merge_train = C1_Merge_train.stack().reset_index()
C1_Merge_train["Date"] = C1_Merge_train["Date"].astype("str")
C1_Merge_train["Time"] = C1_Merge_train["Time"].astype("str")
C1_Merge_train["DT"] = C1_Merge_train["Date"].str.cat(C1_Merge_train["Time"], sep=" ")

cols_NA = [
    "Date",
    "Time",
    "Labels",
    "Day_name",
    "Month_name",
]

C1_Merge_train.drop(cols_NA, axis=1, inplace=True)
C1_Merge_train["DT"] = pd.to_datetime(C1_Merge_train["DT"])
C1_Merge_train = C1_Merge_train.set_index("DT")
C1_Merge_train.dropna(inplace=True)

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_train = df_Merged.reindex(C1_Merge_train.index)
C1_Merge_train = C1_Merge_train.dropna()
C1_Merge_train["Holiday"] = C1_Merge_train["Holiday"].astype('int32')
C1_Merge_train["Season"] = C1_Merge_train["Season"].astype('int32')

In [None]:
# Build the test frame: the rows of df_Merged whose timestamps belong to the
# rows of df_REFIT_resampled_TE that carry label 1 in CART_TE.
df_REFIT_resampled_test = df_REFIT_resampled_TE.copy()
df_REFIT_resampled_test.insert(df_REFIT_resampled_test.shape[1], "Labels", CART_TE.loc[:, "Labels"])
df_REFIT_resampled_test = df_REFIT_resampled_test.dropna()

# Keep label-1 rows, reshape to long format, build one "Date Time" string per row.
label_is_one = df_REFIT_resampled_test["Labels"] == 1
C1_Merge_test = df_REFIT_resampled_test.loc[label_is_one].copy().stack().reset_index()
C1_Merge_test["Date"] = C1_Merge_test["Date"].astype("str")
C1_Merge_test["Time"] = C1_Merge_test["Time"].astype("str")
C1_Merge_test["DT"] = C1_Merge_test["Date"].str.cat(C1_Merge_test["Time"], sep=" ")

cols_NA = ["Date", "Time", "Labels"]

C1_Merge_test = C1_Merge_test.drop(columns=cols_NA)
C1_Merge_test["DT"] = pd.to_datetime(C1_Merge_test["DT"])
C1_Merge_test = C1_Merge_test.set_index("DT").dropna()

# Only the datetime index is kept from here on: it selects rows of df_Merged.
C1_Merge_test = df_Merged.reindex(C1_Merge_test.index).dropna()
C1_Merge_test = C1_Merge_test.astype({"Holiday": 'int32', "Season": 'int32'})

In [None]:
# Cyclical encodings of time of day and time of year.
day = 24 * 60 * 60
year = (365.2425) * day
day_omega = 2 * np.pi / day
year_omega = 2 * np.pi / year

# NOTE(review): datetime.datetime.timestamp interprets naive datetimes as local
# time, so these values may depend on the machine's timezone if the index is
# tz-naive - confirm.
timestamp_train = C1_Merge_train.index.map(datetime.datetime.timestamp)
timestamp_test = C1_Merge_test.index.map(datetime.datetime.timestamp)

day_sin_tr, day_cos_tr = np.sin(timestamp_train * day_omega), np.cos(timestamp_train * day_omega)
year_sin_tr, year_cos_tr = np.sin(timestamp_train * year_omega), np.cos(timestamp_train * year_omega)

day_sin_te, day_cos_te = np.sin(timestamp_test * day_omega), np.cos(timestamp_test * day_omega)
year_sin_te, year_cos_te = np.sin(timestamp_test * year_omega), np.cos(timestamp_test * year_omega)

In [None]:
# Replace the calendar columns with the sine/cosine encodings computed above.
cols_NA = ["Year", "Month", "Holiday", "Day", "Hour", "Minute", "Weekday", "Season"]

cyclical_features = (
    (C1_Merge_train, (("Day_Sin", day_sin_tr), ("Day_Cos", day_cos_tr), ("Year_Sin", year_sin_tr), ("Year_Cos", year_cos_tr))),
    (C1_Merge_test, (("Day_Sin", day_sin_te), ("Day_Cos", day_cos_te), ("Year_Sin", year_sin_te), ("Year_Cos", year_cos_te))),
)

for frame, encodings in cyclical_features:
    frame.drop(cols_NA, axis=1, inplace=True)
    # Each insert goes to position 0, so the final column order starts with
    # Year_Cos, Year_Sin, Day_Cos, Day_Sin.
    for col_name, col_values in encodings:
        frame.insert(0, col_name, col_values)

In [None]:
# Append the STL trend component of "Aggregate" as a new last column "Trend".
# The period is the number of 15-minute slots in a day.
freq = (24 * 60) // 15

for frame in (C1_Merge_train, C1_Merge_test):
    stl_decompose_result = STL(frame.Aggregate, period=freq).fit()
    frame.insert(len(frame.columns), "Trend", stl_decompose_result.trend.values)

In [None]:
# Keep an unsmoothed copy of "Aggregate", then smooth the working column with a
# Savitzky-Golay filter (window of 5 samples, cubic polynomial).
C1_GAP_train = C1_Merge_train[["Aggregate"]].copy()
C1_Merge_train["Aggregate"] = savgol_filter(C1_Merge_train["Aggregate"], window_length=5, polyorder=3)

C1_GAP_test = C1_Merge_test[["Aggregate"]].copy()
C1_Merge_test["Aggregate"] = savgol_filter(C1_Merge_test["Aggregate"], window_length=5, polyorder=3)

In [None]:
# Scale every column of each split with its own MinMaxScaler.
# NOTE(review): the test frame is fitted on its own statistics instead of
# reusing the training scaler - confirm this is intended.
scaler_tr = MinMaxScaler()
scaler_te = MinMaxScaler()

for frame, scaler in ((C1_Merge_train, scaler_tr), (C1_Merge_test, scaler_te)):
    frame[frame.columns] = scaler.fit_transform(frame[frame.columns])
    # Keep only the statistics of the second-to-last column (presumably
    # "Aggregate", the multi-step target) so inverse_transform can be applied
    # to a single-column array.
    target_col = len(frame.columns) - 2
    scaler.min_, scaler.scale_ = scaler.min_[target_col], scaler.scale_[target_col]

In [None]:
# Window length and chronological 80/20 train/validation split (no shuffling).
time_steps = 24
Split, n_features = C1_Merge_train.shape

split_at = int(Split * 0.8)
s_Train = C1_Merge_train.iloc[:split_at]
s_Val = C1_Merge_train.iloc[split_at:]
# Unsmoothed test "Aggregate" values without the first `time_steps` rows.
o_Test = C1_GAP_test.iloc[time_steps:]

In [None]:
# Build multi-step samples for each split with the second-to-last column as the
# target (presumably "Aggregate"; "Trend" is inserted last in this section).
# to_Supervised_ms is defined outside this chunk; presumably it maps `lag` past
# rows to `n_ahead` targets. The look-back is taken from time_steps instead of
# a second hard-coded 24 so it cannot drift from the model's
# input_shape=(time_steps, n_features).
X_train, Y_train = to_Supervised_ms(s_Train.values, lag=time_steps, n_ahead=12, target_index=len(s_Train.columns) - 2)
X_val, Y_val = to_Supervised_ms(s_Val.values, lag=time_steps, n_ahead=12, target_index=len(s_Val.columns) - 2)
X_test, Y_test = to_Supervised_ms(C1_Merge_test.values, lag=time_steps, n_ahead=12, target_index=len(C1_Merge_test.columns) - 2)

In [None]:
# CNN-LSTM regressor: three Conv1D/MaxPooling1D stages, two stacked LSTM
# layers and a dense head with 12 outputs (one per forecast step).
model = Sequential()

# Convolutional feature extractor; the first layer fixes the input shape.
model.add(Conv1D(filters=64, kernel_size=3, activation=LeakyReLU(), padding="same", input_shape=(time_steps, n_features)))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=32, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=16, kernel_size=2, activation=LeakyReLU(), padding="same"))
model.add(MaxPooling1D(2))

# Recurrent part: the first LSTM returns the full sequence for the second one.
model.add(LSTM(256, activation=LeakyReLU(), return_sequences=True))
model.add(LSTM(128, activation=LeakyReLU()))

# Regression head.
model.add(Dense(64))
model.add(Dense(12))

model.summary()
model.compile(optimizer="NAdam", loss="logcosh")

In [None]:
# Multiply the learning rate by 0.2 when val_loss has not improved for 3 epochs.
reduce_learning = ReduceLROnPlateau(
    monitor="val_loss",
    mode="auto",
    factor=0.2,
    patience=3,
    cooldown=4,
    min_delta=0.000025,
    min_lr=0,
    verbose=1,
)
# Stop training when val_loss has not improved for 7 epochs.
eary_stopping = EarlyStopping(
    monitor="val_loss",
    mode="auto",
    patience=7,
    min_delta=0,
    verbose=1,
)

callbacks = [reduce_learning, eary_stopping]

In [None]:
# Train for at most 50 epochs; shuffle=False keeps the samples in their
# original (time) order within each epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=50,
    batch_size=32,
    shuffle=False,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Plot training vs. validation loss per epoch.
loss = history.history["loss"]
val_loss = history.history["val_loss"]
n_epochs = len(loss)
epochs_graph = np.arange(1, n_epochs + 1)

# Ticks every 5 epochs, plus explicit ticks at the first and the last epoch.
x_ticks = np.append(np.insert(np.arange(0, n_epochs + 1, 5), 1, 1), n_epochs)

plt.plot(epochs_graph, loss, "red", label="Training loss")
plt.plot(epochs_graph, val_loss, "blue", label="Validation loss")
plt.xticks(x_ticks)
plt.xlim(1, n_epochs)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
model.save("Models/UCID-Raw-12-Steps-Trend.h5")

In [None]:
# model = load_model("Models/UCID-Raw-12-Steps.h5", custom_objects={"LeakyReLU": tensorflow.keras.layers.LeakyReLU})
# model.summary()

In [None]:
# Predict on the test windows, then flatten predictions and targets to a single
# column and map both back to the original scale.
# NOTE: Y_test is overwritten in place, so re-running this cell applies the
# inverse transform to it a second time.
Predictions = scaler_te.inverse_transform(model.predict(X_test).reshape(-1, 1))
Y_test = scaler_te.inverse_transform(Y_test.reshape(-1, 1))

In [None]:
# Score the inverse-scaled 12-step raw-data forecasts against the inverse-scaled targets.
# NOTE(review): the arguments are passed as (Y_test, Predictions) here, whereas
# the 12-step trend evaluation passes (Predictions, Y_test).
# forecast_accuracy is defined outside this chunk - confirm which order it expects.
forecast_accuracy(Y_test, Predictions)