In [4]:
from typing import Tuple

import itertools

import pickle

#from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
torch.manual_seed(99)

from matplotlib import pyplot as plot

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr

def printRegStatistics(truth, preds):
    print("The RVE is: ", explained_variance_score(truth, preds))
    print("The rmse is: ", mean_squared_error(truth, preds, squared=False))
    corr, pval = pearsonr(truth, preds)
    print("The Correlation Score is is: %6.4f (p-value=%e)\n"%(corr,pval))
    print("The Maximum Error is is: ", max_error(truth, preds))
    print("The Mean Absolute Error is: ", mean_absolute_error(truth, preds))

def split_data(arr: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    total_len,_=arr.shape

    train_p = 0.80
    test_p = 0.10
    ivs_p = 0.10

    train_len = round(total_len*train_p)
    test_len = round(total_len*test_p)
    ivs_len = round(total_len*ivs_p)

    model_len = train_len+test_len

    total_used_len = model_len+ivs_len

    train = arr[:train_len]
    test = arr[train_len:model_len]
    ivs = arr[model_len:total_used_len]

    return train, test, ivs

In [5]:
class LSTM(nn.Module):

    output_size: int

    hidden_layer_size: int
    lstm: nn.LSTM
    linear: nn.Linear
    hidden_cell: tuple[torch.Tensor, torch.Tensor]
    scaler: MinMaxScaler
    device: str

    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1, num_layers=1):
        super().__init__()

        self.output_size = output_size

        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size)


        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

        self.scaler = MinMaxScaler(feature_range=(-1, 1))

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(self.device)

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]
    
    def predict(self, input):
        X = self.scaler.fit_transform(input.reshape(-1, 1))
        X = seq = torch.FloatTensor(X).to(self.device)
        with torch.no_grad():
            self.hidden = (torch.zeros(1, 1, self.hidden_layer_size).to(self.device),
                            torch.zeros(1, 1, self.hidden_layer_size).to(self.device))
            preds = self(X)
            Y = np.array(preds.cpu())
        actual_predictions = self.scaler.inverse_transform(Y.reshape(-1, 1))
        return actual_predictions

    def set_device(self, device = None):
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        elif device == "cpu" or device == "cuda":
            self.device = device
        print(self.device)
        
        self.to(self.device)

    def train(self, train_data, train_window = 50, epochs = 100):

        # Initiate loss function and optimizer

        loss_function = nn.MSELoss().to(self.device)
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        
        # Scale
        
        train_data_normalized = self.scaler.fit_transform(train_data.reshape(-1, 1))
        train_data_normalized = torch.FloatTensor(train_data_normalized).view(-1).to(self.device)

        # Create sequences
        
        train_inout_seq = []
        L = len(train_data_normalized)
        for i in range(L-train_window):
            train_seq = train_data_normalized[i:i+train_window]
            # TODO - offset on the start for direct decoding with several (5) of these models giving (5) concurrent predictions?
            train_label = train_data_normalized[i+train_window:i+train_window+self.output_size]
            train_inout_seq.append((train_seq ,train_label))
        
        # Train the model

        for i in range(epochs):
            _ix = -1
            for seq, labels in train_inout_seq:
                seq = seq.to(self.device)
                labels = labels.to(self.device)

                if len(labels) != self.output_size:
                    continue

                _ix +=1
                
                #if torch.isnan(seq).any().item():
                #    print(f"nan values in seq at {_ix}")
                #    continue

                #if pd.isna(labels.item()):
                #    print(f"nan labels at {_ix}")
                #    continue

                optimizer.zero_grad()
                self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer_size).to(self.device),
                                torch.zeros(1, 1, self.hidden_layer_size).to(self.device))

                y_pred = self(seq)
                #if pd.isna(y_pred.item()):
                #    print(f"nan preds at {_ix}")
                #    continue

                single_loss = loss_function(y_pred, labels)
                #if pd.isna(single_loss.item()):
                #    print(f"nan loss at {_ix}")
                #    raise
                single_loss.backward()
                optimizer.step()

            if i%25 == 1:
                print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

        print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

        # Finalize
        #self.eval()

In [6]:
class CustomRandomForestRegressor(RandomForestRegressor):

    n_preds: int
    n_estimators: int
    
    def __init__(self, n_preds=4,n_estimators=100):
        super().__init__(n_estimators=100)
        self.n_preds = n_preds
        self.n_estimators = n_estimators
    
    def total_predict(self, outside_val_curr: float, outside_val_preds: np.array, inside_val_curr: float):
        inside_preds = []
        assert len(outside_val_preds) == self.n_preds
        for i in range(self.n_preds):
            if i == 0:
                input = np.array([outside_val_curr, outside_val_preds[i][0], inside_val_curr], dtype="object").reshape(1, -1)
            else:
                input = np.array([outside_val_preds[i-1], outside_val_preds[i][0], inside_preds[i-1]], dtype="object").reshape(1, -1)
            pred = self.predict(input)
            inside_preds.append(pred)
        return inside_preds

# Pre-Process data

In [7]:
df = pd.read_csv("../data/datasets/historical.data")
df = df.sort_values(by="date",ascending=True)
df_inside = df.loc[df["label"] == "inside"].drop(["label"], axis=1)
df_outside = df.loc[df["label"] == "outside"].drop(["label"], axis=1)
inside_arr=np.array(df_inside)
outside_arr=np.array(df_outside)

# Model 1 -> Outside Temperature Prediction

## Train the LSTM Model

In [8]:
outside_train, outside_test, outside_ivs = split_data(outside_arr)

In [9]:
per_min = (2*60) + 30
per_30_mins = per_min / 30

print("Number of Wanted Predictions: " + str(per_30_mins))

Number of Wanted Predictions: 5.0


In [10]:
train_data = outside_train[:,1]

In [72]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM().to(device)

cuda


In [73]:
model.train(train_data, epochs=1)

epoch:   0 loss: 0.0000215613


In [74]:
test_data = outside_test[:,1]

In [75]:
print(model.predict(test_data[0:49]))
print(test_data[50])

[[10.737656]]
10.78


In [76]:
print(model.predict(test_data[1:50]))
print(test_data[51])

[[10.812096]]
10.58


In [77]:
print(model.predict(test_data[2:51]))
print(test_data[52])

[[10.824025]]
10.35


## Test the Model

In [79]:

test_window = 50

test_inout_seq = []
L = len(test_data)
for i in range(L-test_window):
    train_seq = test_data[i:i+test_window]
    train_label = test_data[i+test_window:i+test_window+1]
    test_inout_seq.append((train_seq ,train_label))

In [80]:
test_X = [ x for x, _ in test_inout_seq ]
true_Y = [ y[0] for _, y in test_inout_seq ]

In [81]:
pred_Y = []
for X in test_X:
    Y = model.predict(X)[0]
    pred_Y.append(Y)

In [82]:
printRegStatistics(true_Y, pred_Y)

The RVE is:  0.9592687396444167
The rmse is:  1.1444307181832196
The Correlation Score is is: 0.9836 (p-value=0.000000e+00)

The Maximum Error is is:  11.14809281539917
The Mean Absolute Error is:  0.7564291050544436


In [83]:
print(model.predict(np.array([test_data[1]])))
print(test_data[2])

[[12.278472]]
12.66


In [84]:
test_data

array([11.98, 12.3, 12.66, ..., 14.822, 15.6, 16.289], dtype=object)

In [85]:
pred_Y = model.predict(np.array(test_data[0:49]))[0]
pred_Y = list(pred_Y)

test_X_stream = test_data[50:]
_ix = 0
for X in test_X_stream:
    if _ix == len(test_X_stream) - 1:
        break
    Y = model.predict(np.array([X]))[0,0]
    pred_Y.append(Y)
    _ix+=1

pred_Y = np.array(pred_Y)
true_Y = test_data[50:]

In [86]:
printRegStatistics(true_Y, pred_Y)

The RVE is:  0.9747749111592113
The rmse is:  0.8317133807817854
The Correlation Score is is: 0.9874 (p-value=0.000000e+00)

The Maximum Error is is:  10.431982585906983
The Mean Absolute Error is:  0.5514762293463369


## Find the best Hyperparams

In [None]:
results = []

truth = test_data[50:]

hidden_layer_sizes = [50,100,150,200]
nums_layers = [1,2,4]

hyper_params = itertools.product(hidden_layer_sizes, nums_layers)

for params in hyper_params:

    hidden_layer_size, num_layers = params

    model = LSTM(output_size=1, hidden_layer_size=hidden_layer_size, num_layers=num_layers).to(device)
    model.train(train_data, epochs=1)
    #model.eval()

    preds = model.predict(np.array(test_data[0:49]))[0]
    preds = list(preds)

    _ix = 0
    for X in test_data[50:]:
        if _ix == len(test_data[50:]) - 1:
            break
        Y = model.predict(np.array([X]))[0,0]
        preds.append(Y)
        _ix+=1
        

    RVE = explained_variance_score(truth, preds)

    result = {"RVE": RVE, "hidden_layer_size": hidden_layer_size, "num_layers": num_layers}
    print(f"RVE {RVE} -> hidden_layer_size '{hidden_layer_size}' ; num_layers '{num_layers}'")
    results.append(result)

    # hls 200 - nl 1 - RVE 0.9747816351413197

cuda
epoch:   0 loss: 0.0001001226
RVE 0.9747772659729601 -> hidden_layer_size '50' ; num_layers '1'
cuda
epoch:   0 loss: 0.0000055385
RVE 0.9747743420749971 -> hidden_layer_size '50' ; num_layers '2'
cuda
epoch:   0 loss: 0.0000279706
RVE 0.974776405808909 -> hidden_layer_size '50' ; num_layers '4'
cuda
epoch:   0 loss: 0.0001242574
RVE 0.9747814662230647 -> hidden_layer_size '100' ; num_layers '1'
cuda
epoch:   0 loss: 0.0000008138
RVE 0.9747752504410033 -> hidden_layer_size '100' ; num_layers '2'
cuda
epoch:   0 loss: 0.0000191966
RVE 0.9747699142794141 -> hidden_layer_size '100' ; num_layers '4'
cuda
epoch:   0 loss: 0.0000268046
RVE 0.9747730403548921 -> hidden_layer_size '150' ; num_layers '1'
cuda
epoch:   0 loss: 0.0000000163
RVE 0.9747727295893932 -> hidden_layer_size '150' ; num_layers '2'
cuda
epoch:   0 loss: 0.0000093265
RVE 0.9747771335109023 -> hidden_layer_size '150' ; num_layers '4'
cuda
epoch:   0 loss: 0.0000323359
RVE 0.9747816351413197 -> hidden_layer_size '200' ;

## With multiple inputs + outputs

In [None]:
results_multilabel = []

truth = test_data[50:]

hidden_layer_sizes = [50,100,150,200]
nums_layers = [1,2,4]

hyper_params = itertools.product(hidden_layer_sizes, nums_layers)

for params in hyper_params:

    hidden_layer_size, num_layers = params

    model = LSTM(output_size=5, hidden_layer_size=hidden_layer_size, num_layers=num_layers).to(device)
    model.train(train_data, epochs=1)
    #model.eval()

    preds = model.predict(np.array(test_data[0:49]))[0]
    preds = list(preds)

    _ix = 0
    for X in test_data[50:]:
        if _ix == len(test_data[50:]) - 1:
            break
        Y = model.predict(np.array([X]))[0,0]
        preds.append(Y)
        _ix+=1
        

    RVE = explained_variance_score(truth, preds)

    result = {"RVE": RVE, "hidden_layer_size": hidden_layer_size, "num_layers": num_layers}
    print(f"RVE {RVE} -> hidden_layer_size '{hidden_layer_size}' ; num_layers '{num_layers}'")
    results_multilabel.append(result)
    
    # hls 100 - nl 4 - RVE 0.9747813178439919

cuda
epoch:   0 loss: 0.0003430746
RVE 0.9747811136729241 -> hidden_layer_size '50' ; num_layers '1'
cuda
epoch:   0 loss: 0.0002447922
RVE 0.9747796207311098 -> hidden_layer_size '50' ; num_layers '2'
cuda
epoch:   0 loss: 0.0003614936
RVE 0.9747807494036066 -> hidden_layer_size '50' ; num_layers '4'
cuda
epoch:   0 loss: 0.0008355238
RVE 0.9747748306945716 -> hidden_layer_size '100' ; num_layers '1'
cuda
epoch:   0 loss: 0.0004488731
RVE 0.9747785924735314 -> hidden_layer_size '100' ; num_layers '2'
cuda
epoch:   0 loss: 0.0001496843
RVE 0.9747813178439919 -> hidden_layer_size '100' ; num_layers '4'
cuda
epoch:   0 loss: 0.0000350967
RVE 0.9747807698753959 -> hidden_layer_size '150' ; num_layers '1'
cuda
epoch:   0 loss: 0.0001870407
RVE 0.9747812800629767 -> hidden_layer_size '150' ; num_layers '2'
cuda
epoch:   0 loss: 0.0002436671
RVE 0.9747774788790655 -> hidden_layer_size '150' ; num_layers '4'
cuda
epoch:   0 loss: 0.0004602174
RVE 0.974780746608952 -> hidden_layer_size '200' ;

## Save Best

In [25]:
ivs_data = outside_ivs[:,1]
truth = ivs_data[50:]

In [87]:
model_data = np.concatenate((train_data, test_data), axis=0)

model = LSTM(output_size=4, hidden_layer_size=100, num_layers=4).to(device)
#model.train(model_data, epochs=1)
model.train(model_data, epochs=26)
#model.train(model_data, epochs=200)

ivs_data = outside_ivs[:,1]

truth = ivs_data[50:]

preds = model.predict(np.array(ivs_data[0:49]))[0]
preds = list(preds)

_ix = 0
for X in ivs_data[50:]:
    if _ix == len(ivs_data[50:]) - 1:
        break
    Y = model.predict(np.array([X]))[0,0]
    preds.append(Y)
    _ix+=1

printRegStatistics(truth, preds)

cuda
epoch:   1 loss: 0.00122739
epoch:  25 loss: 0.0042992551
The RVE is:  0.9836861223507917
The rmse is:  0.8151775415994958
The Correlation Score is is: 0.9918 (p-value=0.000000e+00)

The Maximum Error is is:  6.203763076782224
The Mean Absolute Error is:  0.569096624291944


In [88]:
print(model.predict(np.array([test_data[0:1000]])))
print(test_data[1000:1000+4])

[[13.717048]
 [14.006069]
 [14.587363]
 [15.422805]]
[15.539 18.6 19.189 19.15]


In [18]:
with open("./outside_model/model_v3.1-pickle","wb") as o:
    pickle.dump(model, o)
model.set_device("cpu")
model.to("cpu")
#for ix in len(model.hidden_cell.to("cpu")):
#    model.hidden_cell
model.hidden_cell = None
with open("./outside_model/model_v3.1-torch","wb") as o:
    torch.save(model, o)

cpu


In [20]:
with open("./outside_model/model_v3.1-torch","rb") as o:
    _model = torch.load(o, encoding='bytes')
#with open("./outside_model/model_v3.1-pickle","rb") as o:
#    _model = pickle.load(o)
print(_model)

LSTM(
  (lstm): LSTM(1, 100)
  (linear): Linear(in_features=100, out_features=4, bias=True)
)


In [26]:
preds = _model.predict(np.array(ivs_data[0:49]))[0]
preds = list(preds)

_ix = 0
for X in ivs_data[50:]:
    if _ix == len(ivs_data[50:]) - 1:
        break
    Y = _model.predict(np.array([X]))[0,0]
    preds.append(Y)
    _ix+=1

printRegStatistics(truth, preds)

ValueError: Found input variables with inconsistent numbers of samples: [4862, 4812]

# Inside Predictions Model
## Data pre-processing

In [58]:
outside_arr
inside_arr

outside_val_curr_l = []
outside_val_valpred_l = []
outside_val_valpred_2 = []
outside_val_valpred_3 = []
outside_val_valpred_4 = []

inside_val_curr_l = []
y = []

assert outside_arr.size == inside_arr.size

labels = ["outside_val_curr_l","outside_val_valpred_l","outside_val_valpred_2","outside_val_valpred_3","outside_val_valpred_4","inside_val_curr_l"]

for i in range(4,len(outside_arr)):
    outside_val_valpred_4.append(outside_arr[i][1])
    outside_val_valpred_3.append(outside_arr[i-1][1])
    outside_val_valpred_2.append(outside_arr[i-2][1])
    outside_val_valpred_l.append(outside_arr[i-3][1])
    outside_val_curr_l.append(outside_arr[i-4][1])

    inside_val_curr_l.append(inside_arr[i-4][1])

    y.append([inside_arr[i-3][1], inside_arr[i-2][1], inside_arr[i-1][1], inside_arr[i][1]])

df_arr = [ [outside_val_curr_l[i],outside_val_valpred_l[i],outside_val_valpred_2[i],outside_val_valpred_3[i],outside_val_valpred_4[i],inside_val_curr_l[i]] for i in range(len(inside_val_curr_l)) ]

inside_X_train, inside_X_test, inside_X_ivs = split_data(np.array(df_arr))
inside_y_train, inside_y_test, inside_y_ivs = split_data(np.array(y))

df_inside_X_train = pd.DataFrame(inside_X_train, columns=labels)
df_inside_X_test = pd.DataFrame(inside_X_test, columns=labels)
df_inside_X_ivs = pd.DataFrame(inside_X_ivs, columns=labels)

df = pd.DataFrame(df_arr, columns=labels)
display(df)

Unnamed: 0,outside_val_curr_l,outside_val_valpred_l,outside_val_valpred_2,outside_val_valpred_3,outside_val_valpred_4,inside_val_curr_l
0,10.430,9.760,8.990,8.200,7.640,22.875
1,9.760,8.990,8.200,7.640,7.200,22.937
2,8.990,8.200,7.640,7.200,6.990,23.062
3,8.200,7.640,7.200,6.990,6.430,23.125
4,7.640,7.200,6.990,6.430,6.220,23.187
...,...,...,...,...,...,...
48614,15.117,15.606,15.406,16.228,16.483,24.062
48615,15.606,15.406,16.228,16.483,16.561,24.125
48616,15.406,16.228,16.483,16.561,16.983,24.437
48617,16.228,16.483,16.561,16.983,17.461,24.500


## Random Forest Regression Model

In [56]:
rf_model = RandomForestRegressor(n_estimators = 100)
rf_model.fit(df_inside_X_train, inside_y_train)

preds = rf_model.predict(df_inside_X_test).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

The RVE is:  0.844591688876241
The rmse is:  0.18992293768609675
The Correlation Score is is: 0.9243 (p-value=0.000000e+00)

The Maximum Error is is:  2.293232933697709
The Mean Absolute Error is:  0.14036057314163217


In [57]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestRegressor(n_estimators = 100))#, max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(df_inside_X_train, inside_y_train)

In [58]:
print(sel.get_support())
selected_feat= df_inside_X_train.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[False False False False False  True]
1
Index(['inside_val_curr_l'], dtype='object')


In [59]:
df_inside_X_train = df_inside_X_train[df_inside_X_train.columns[(sel.get_support())]]
df_inside_X_test = df_inside_X_test[df_inside_X_test.columns[(sel.get_support())]]

print(df_inside_X_train)

       inside_val_curr_l
0                 22.875
1                 22.937
2                 23.062
3                 23.125
4                 23.187
...                  ...
38890             23.312
38891             23.375
38892             23.500
38893             23.375
38894             23.500

[38895 rows x 1 columns]


In [75]:
model = RandomForestRegressor(n_estimators = 100)
model.fit(df_inside_X_train.to_numpy(), inside_y_train)

preds = model.predict(df_inside_X_test.to_numpy()).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

The RVE is:  0.8456982653069713
The rmse is:  0.1861762414605077
The Correlation Score is is: 0.9275 (p-value=0.000000e+00)

The Maximum Error is is:  2.6192463235805796
The Mean Absolute Error is:  0.13801686278670214


In [76]:
p = model.predict(np.array([df_inside_X_test.to_numpy()[0]]))
print(p)
print(inside_y_test[0])

[[23.55185153 23.55528567 23.553122   23.54535476]]
[23.437 23.562 23.625 23.437]


## Save Model

In [77]:
with open("./models/inside_model/model_v1","wb") as o:
    pickle.dump(model, o)

In [78]:
with open("./models/inside_model/model_v1","rb") as o:
    model = pickle.load(o, encoding='bytes')
print(model)
preds = model.predict(df_inside_X_test).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

RandomForestRegressor()
The RVE is:  0.8456982653069713
The rmse is:  0.1861762414605077
The Correlation Score is is: 0.9275 (p-value=0.000000e+00)

The Maximum Error is is:  2.6192463235805796
The Mean Absolute Error is:  0.13801686278670214




## Random Forests V2

In [84]:
outside_arr
inside_arr

outside_val_curr_l = []
outside_val_valpred_l = []

inside_val_curr_l = []

y = []

assert outside_arr.size == inside_arr.size

labels = ["outside_val_curr_l","outside_val_valpred_l","inside_val_curr_l"]

for i in range(1,len(outside_arr)):
    outside_val_valpred_l.append(outside_arr[i][1])
    outside_val_curr_l.append(outside_arr[i-1][1])

    inside_val_curr_l.append(inside_arr[i-1][1])

    y.append(inside_arr[i][1])

df_arr = [ [outside_val_curr_l[i],outside_val_valpred_l[i],inside_val_curr_l[i]] for i in range(len(inside_val_curr_l)) ]

inside_X_train, inside_X_test, inside_X_ivs = split_data(np.array(df_arr))

y = np.array(y)
total_len=y.size

train_p = 0.80
test_p = 0.10
ivs_p = 0.10

train_len = round(total_len*train_p)
test_len = round(total_len*test_p)
ivs_len = round(total_len*ivs_p)

model_len = train_len+test_len

total_used_len = model_len+ivs_len

inside_y_train = y[:train_len]
inside_y_test = y[train_len:model_len]
inside_y_ivs = y[model_len:total_used_len]

df_inside_X_train = pd.DataFrame(inside_X_train, columns=labels)
df_inside_X_test = pd.DataFrame(inside_X_test, columns=labels)
df_inside_X_ivs = pd.DataFrame(inside_X_ivs, columns=labels)

df = pd.DataFrame(df_arr, columns=labels)
display(df)

Unnamed: 0,outside_val_curr_l,outside_val_valpred_l,inside_val_curr_l
0,10.430,9.760,22.875
1,9.760,8.990,22.937
2,8.990,8.200,23.062
3,8.200,7.640,23.125
4,7.640,7.200,23.187
...,...,...,...
48617,16.228,16.483,24.500
48618,16.483,16.561,24.812
48619,16.561,16.983,24.750
48620,16.983,17.461,24.500


In [85]:
model = RandomForestRegressor(n_estimators = 100)
model.fit(df_inside_X_train.to_numpy(), inside_y_train)

preds = model.predict(df_inside_X_test.to_numpy()).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

The RVE is:  0.8947976157964002
The rmse is:  0.15917090582348375
The Correlation Score is is: 0.9476 (p-value=0.000000e+00)

The Maximum Error is is:  1.2784524685074814
The Mean Absolute Error is:  0.12196035073999228


In [86]:
p = model.predict(np.array([df_inside_X_test.to_numpy()[0]]))
print(p)
print(inside_y_test[0])

[23.54857]
23.437


In [87]:
with open("./models/inside_model/model_v2","wb") as o:
    pickle.dump(model, o)

In [88]:
with open("./models/inside_model/model_v2","rb") as o:
    model = pickle.load(o, encoding='bytes')
print(model)
preds = model.predict(df_inside_X_test).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

RandomForestRegressor()
The RVE is:  0.8947976157964002
The rmse is:  0.15917090582348375
The Correlation Score is is: 0.9476 (p-value=0.000000e+00)

The Maximum Error is is:  1.2784524685074814
The Mean Absolute Error is:  0.12196035073999228




## Random Forests V3

In [175]:
inside_model = CustomRandomForestRegressor(n_preds = 4, n_estimators = 100)
inside_model.fit(df_inside_X_train.to_numpy(), inside_y_train)

preds = inside_model.predict(df_inside_X_test.to_numpy()).flatten()
truths = inside_y_test.flatten()

printRegStatistics(preds,truths)

The RVE is:  0.8947175926706176
The rmse is:  0.15920844237402698
The Correlation Score is is: 0.9475 (p-value=0.000000e+00)

The Maximum Error is is:  1.3059624685074809
The Mean Absolute Error is:  0.12173446103777685


In [176]:
with open("./models/inside_model/model_v3","wb") as o:
    pickle.dump(inside_model, o)

In [61]:
with open("./outside_model/model_v3","rb") as o:
    outside_model: LSTM = torch.load(o, encoding='bytes')
print(outside_model)
with open("./inside_model/model_v3","rb") as o:
    inside_model: CustomRandomForestRegressor = pickle.load(o, encoding='bytes')
print(inside_model)


outside_model.set_device()
outside_model.predict(np.array(test_X_stream[0:1000]))


outside_preds_total = []
inside_preds_total = []

assert len(test_X_stream) == len(true_Y)
assert len(df_inside_X_test.to_numpy()) == len(inside_y_test)

for i in range(1000,len(test_X_stream)):
    outside_preds = outside_model.predict(np.array(test_X_stream[i]))
    inside_preds = inside_model.total_predict(test_X_stream[i], outside_preds, inside_arr[i][1])

    outside_preds = [ o[0] for o in outside_preds ]
    inside_preds = [ o[0] for o in inside_preds ]

    outside_preds_total.append(outside_preds)
    inside_preds_total.append(inside_preds)


outside_truth = []
inside_truth = []

for i in range(1005,true_Y.size):
    outside_truth.append([true_Y[i-3], true_Y[i-2], true_Y[i-1], true_Y[i]])
    inside_truth.append([inside_y_test[i-3], inside_y_test[i-2], inside_y_test[i-1], inside_y_test[i]])


#print("Preds")
#print(outside_preds_total)
#print(inside_preds_total)

#print("Truth")
#print(outside_truth)
#print(inside_truth)

#print("Outside")
#printRegStatistics(outside_preds_total,outside_truth)
#print("Inside")
#printRegStatistics(inside_preds_total,inside_truth)

LSTM(
  (lstm): LSTM(1, 100)
  (linear): Linear(in_features=100, out_features=4, bias=True)
)
CustomRandomForestRegressor()
cuda


In [63]:
outside_preds = outside_model.predict(np.array(test_X_stream[0]))
inside_preds = inside_model.total_predict(test_X_stream[0], outside_preds, inside_arr[0][1])

outside_preds = [ o[0] for o in outside_preds ]
inside_preds = [ o[0] for o in inside_preds ]

print(outside_preds)
print(inside_preds)

[10.914384, 10.945582, 10.9903, 11.040925]
[22.894129999999972, 22.913012597732212, 22.85561683832897, 23.013715461919652]


In [64]:
print(inside_arr[0][1])

22.875
