# **PREDICCIONES PARA UN SOLO PARTICIPANTE**

In [32]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import random as random
import plotly.express as px
import tensorflow as tf
from tensorflow.keras import layers

def set_output_precision(decimals):
  """
  format the output of the all the data structures
  with an specific number of decimals
  """
  np.set_printoptions(precision=decimals)
  into='{'+':.{}f'.format(decimals)+'}'
  pd.options.display.float_format = into.format
  torch.set_printoptions(precision=decimals)
  pass

def plot_ts(df,dfx="Minute",dfy="METS",_title="DF minute x Mets"):
  if not isinstance(df,pd.DataFrame):
    df = pd.DataFrame({'METS': df, 'Minute': range(len(df))})

  plt.figure()
  fig = px.line(df, x = dfx, y = dfy , title = _title)
  fig.update_xaxes(
      rangeslider_visible = True,
      rangeselector = dict(
          buttons = list([
              dict(count=1,label="1y",step="year",stepmode="backward"),
              dict(count=2,label="2y",step="year",stepmode="backward"),
              dict(count=3,label="3y",step="year",stepmode="backward"),
              dict(step="all")
          ])
      )

  )
  fig.show()

def plot_predictions_vs_real(predictions, reals):
  df = pd.DataFrame()
  number_of_points = len(predictions)
  df["time"] = range(0,number_of_points)
  df["participant"] = "prediction"
  df["value"] = predictions
  for i in range(0,number_of_points):
    df.loc[number_of_points+i] = [i,"real",reals[i]]

  plt.figure(1)
  fig = px.line(df, x = "time", y = "value" , title = "predictions vs reals" , color = "participant")
  fig.update_xaxes(
        rangeslider_visible = True,
        rangeselector = dict(
            buttons = list([
                dict(count=1,label="1y",step="year",stepmode="backward"),
                dict(count=2,label="2y",step="year",stepmode="backward"),
                dict(count=3,label="3y",step="year",stepmode="backward"),
                dict(step="all")
            ])
        )

    )
  fig.show()

In [33]:
READ_LOCAL_DATA = True
COMPUTED_OPTION = 1
TEST_SIZE = 0.2
VALIDATION_SIZE = 0.15
SAVE_RESULTS = True
LOW_DATA = True
SPLIT_INTO_TWO_DAYS = True
MULTI_STEP_FORECAST = False
np.random.seed(42)

In [34]:
if READ_LOCAL_DATA:
  PATH = "Resources/Individual/"
else:
    #  We start by getting access to the drive
    from google.colab import drive
    drive.mount('/content/drive')
    PATH = "/content/drive/MyDrive/TFG/Resources/Individual/"

if LOW_DATA:
    PATH += "LowData/"

In [35]:
import gzip
documents = ['minuteY','hourY','dayY']
# with open(PATH+"minuteX"+".pkl", 'rb') as file:
#     dataX = np.array(pickle.load(file),np.float32)
file = PATH+"minuteX.pkl.gz"
dataX = np.array(pickle.load(gzip.open(file, 'rb')),np.float32)

file = PATH+documents[COMPUTED_OPTION]+".pkl.gz"
dataY = np.array(pickle.load(gzip.open(file, 'rb')),np.float32)


In [36]:
NUMBER_OF_PARTICIPANTS = dataX.shape[0]
if MULTI_STEP_FORECAST:
    PREDICTED_HORIZON = 1
    if COMPUTED_OPTION == 0:
        dataX_nuevo = dataX
        JUMP = 1
    elif COMPUTED_OPTION == 1:
        JUMP = 60
        dataX_nuevo = np.ones(shape=(NUMBER_OF_PARTICIPANTS,dataX.shape[1],24))
        for k in range(0,NUMBER_OF_PARTICIPANTS):
            for i in range(0,dataX.shape[1]):
                for j in range(0,24):
                    dataX_nuevo[k,i,j] = np.sum(dataX[k,i,60*j:60*(j+1)])
    else:
        JUMP = 1440
        dataX_nuevo = np.ones(shape=(dataX.shape[0],1))
        for k in range(0,NUMBER_OF_PARTICIPANTS):
            for i in range(0,dataX.shape[0]):
                dataX_nuevo[k,i,:] = np.sum(dataX[k,i,:])
else:
    JUMP = 1
    if COMPUTED_OPTION == 0:
        PREDICTED_HORIZON = 120
    elif COMPUTED_OPTION == 1:
        PREDICTED_HORIZON = 2
    else:
        PREDICTED_HORIZON = 1

print(dataX_nuevo.shape)

In [37]:
def change_shape_by_participant(data):
    original_shape = data.shape
    new_shape = (original_shape[0] * original_shape[1], original_shape[2])
    reshaped_array = data.reshape(new_shape)
    return reshaped_array

In [38]:
if MULTI_STEP_FORECAST:
    dataX_nuevo = dataX_nuevo.transpose(1,0,2)
else:
    dataX_nuevo = dataX.transpose(1,0,2)
dataY = dataY.transpose(1,0,2)
print(dataX_nuevo.shape)
print(dataY.shape)

(36355, 25, 1440)
(36355, 25, 1440)


In [39]:
if SPLIT_INTO_TWO_DAYS:
    LAST_TWO_DAYS_INDEX = -1440*2
    X_train, X_test, y_train, y_test = dataX_nuevo[:LAST_TWO_DAYS_INDEX,:,:],dataX_nuevo[LAST_TWO_DAYS_INDEX::JUMP,:,:],\
        dataY[:LAST_TWO_DAYS_INDEX:,:,0:PREDICTED_HORIZON],dataY[LAST_TWO_DAYS_INDEX::JUMP,:,0:PREDICTED_HORIZON]
else:
    X_train, X_test, y_train, y_test = train_test_split(dataX_nuevo, dataY, test_size=TEST_SIZE,shuffle=True,random_state=42)
    y_test = y_test[:,:,0:PREDICTED_HORIZON]
    y_train = y_train[:,:,0:PREDICTED_HORIZON]

if not LOW_DATA:
    del dataY
    del dataX
X_train = X_train.transpose(1,0,2)
X_test = X_test.transpose(1,0,2)
y_train = y_train.transpose(1,0,2)
y_test =  y_test.transpose(1,0,2)

print("Examples for training\n","X:",X_train.shape,"y:",y_train.shape)
print("Examples for test\n","X:",X_test.shape,"y:",y_test.shape)
VALIDATION_INDEX = int(len(X_train)*VALIDATION_SIZE)
print("Examples for validation: \n", VALIDATION_INDEX)

Examples for training
 X: (25, 33475, 1440) y: (25, 33475, 120)
Examples for test
 X: (25, 2880, 1440) y: (25, 2880, 120)
Examples for validation: 
 3


In [40]:
X_train = change_shape_by_participant(X_train)
X_test = change_shape_by_participant(X_test)
y_train = change_shape_by_participant(y_train)
y_test = change_shape_by_participant(y_test)

In [41]:
print("Examples for training\n","X:",X_train.shape,"y:",y_train.shape)
print("Examples for test\n","X:",X_test.shape,"y:",y_test.shape)
VALIDATION_INDEX = int(len(X_train)*VALIDATION_SIZE)
print("Examples for validation: \n", VALIDATION_INDEX)

Examples for training
 X: (836875, 1440) y: (836875, 120)
Examples for test
 X: (72000, 1440) y: (72000, 120)
Examples for validation: 
 125531


In [42]:
tf.random.set_seed(42)
# Setup dataset hyperparameters
HORIZON = y_test.shape[1]
WINDOW_SIZE = X_test.shape[1]

# Let's build an LSTM model with the Functional API
inputs = layers.Input(shape=(WINDOW_SIZE))
x = layers.Lambda(lambda x: tf.expand_dims(x, axis=1))(inputs) # expand input dimension to be compatible with LSTM
# print(x.shape)
# x = layers.LSTM(128, activation="relu", return_sequences=True)(inputs) # this layer will error if the inputs are not the right shape
x = layers.LSTM(128,return_sequences=True, activation="relu")(x) # using the tanh loss function results in a massive error
# print(x.shape)
# Add another optional dense layer (you could add more of these to see if they improve model performance)
# x = layers.Dense(32, activation="relu")(x)
output = layers.Dense(HORIZON)(x)
model_LSTM = tf.keras.Model(inputs=inputs, outputs=output, name="model_5_lstm")

model_LSTM.summary()

Model: "model_5_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1440)]            0         
                                                                 
 lambda_2 (Lambda)           (None, 1, 1440)           0         
                                                                 
 lstm_2 (LSTM)               (None, 1, 128)            803328    
                                                                 
 dense_2 (Dense)             (None, 1, 120)            15480     
                                                                 
Total params: 818,808
Trainable params: 818,808
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Compile model
model_LSTM.compile(loss="mae",
                optimizer=tf.keras.optimizers.Adam(),
             metrics=["mae"])

In [44]:
# Seems when saving the model several warnings are appearing: https://github.com/tensorflow/tensorflow/issues/47554
model_LSTM.fit(X_train[:-VALIDATION_INDEX],
            y_train[:-VALIDATION_INDEX],
            epochs=3,
            verbose=1,
            batch_size=128,
            shuffle=True,
            validation_data=(X_train[-VALIDATION_INDEX:], y_train[-VALIDATION_INDEX:])
               )

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a21141fee0>

In [45]:
def make_preds(model, input_data):
  """
  Uses model to make predictions on input_data.

  Parameters
  ----------
  model: trained model 
  input_data: windowed input data (same kind of data model was trained on)

  Returns model predictions on input_data.
  """
  forecast = model.predict(input_data)
  return tf.squeeze(forecast) # return 1D array of predictions

predictions = make_preds(model_LSTM, X_test)



In [46]:
print(predictions.shape)
print(y_test.shape)
DATA_BY_PARTICIPANT = int(y_test.shape[0]/25)
poblational_predicion = np.ones(shape=(DATA_BY_PARTICIPANT,PREDICTED_HORIZON))
poblational_y_test = np.ones(shape=(DATA_BY_PARTICIPANT,PREDICTED_HORIZON))
poblational_X_test = np.ones(shape=(DATA_BY_PARTICIPANT,WINDOW_SIZE))
for i in range(0,DATA_BY_PARTICIPANT):
    poblational_predicion[i,:] = np.sum(np.array(predictions[i::DATA_BY_PARTICIPANT]),axis=0)
    poblational_y_test[i,:] = np.sum(np.array(y_test[i::DATA_BY_PARTICIPANT]),axis=0)
    poblational_X_test[i,:] = np.sum(np.array(X_test[i::DATA_BY_PARTICIPANT]),axis=0)
print(poblational_predicion.shape)

(72000, 120)
(72000, 120)
(2880, 120)


In [57]:
if not MULTI_STEP_FORECAST:
    if SPLIT_INTO_TWO_DAYS:
        index = 0
        period = poblational_X_test[::120,:]
        period_results = make_preds(model_LSTM,period)
        period_results_to_plot = np.array(period_results).reshape(48)
        y_test_to_plot = y_test[::120,:].reshape(48)
        plot_predictions_vs_real(predictions=period_results_to_plot,reals=y_test_to_plot)
        print('\033[1m' + "Predicted: " + str(np.sum(period_results_to_plot)) + '\033[0m')
        print('\033[1m' + "Reals: " + str(np.sum(y_test_to_plot)) + '\033[0m')
    else:
        print("Resultados poblacionales cada dos horas")
        print('\033[1m' + "MSE: " + str(mean_squared_error(np.sum(poblational_y_test,axis=1),np.sum(poblational_predicion,axis=1))) + '\033[0m')
        print('\033[1m' + "MAE: " + str(mean_absolute_error(np.sum(poblational_y_test,axis=1),np.sum(poblational_predicion,axis=1))) + '\033[0m')
        print("Ejemplos aleatorios")
        # Crear un array de índices
        indices_totales = np.arange(poblational_X_test.shape[0])
        # Seleccionar 5 índices aleatorios
        indices_aleatorios = np.random.choice(indices_totales, size=5, replace=False)
        for i in indices_aleatorios:
            if COMPUTED_OPTION == 1:
                END = 24
                STARTED_MINUTE = 0
                previous = np.ones(shape=(24))
                for j in range(0,24):
                    previous[j] = np.sum(poblational_X_test[i,:][60*j:60*(j+1)])
            else:
                END = 1440
                STARTED_MINUTE = 1000
                previous = poblational_X_test[i,:]

            predictions_to_plot = np.ones(shape=(END + PREDICTED_HORIZON))
            predictions_to_plot[0:END] = previous[:]
            predictions_to_plot[END:] = poblational_predicion[i,:]
            y_test_to_plot = np.ones(shape=(END + PREDICTED_HORIZON))
            y_test_to_plot[0:END] = previous[:]
            y_test_to_plot[END:] = poblational_y_test[i,:]
            plot_predictions_vs_real(predictions_to_plot,y_test_to_plot)


<Figure size 640x480 with 0 Axes>

In [63]:
print(predictions.shape)
print(y_test.shape)
print('\033[1m' + "MSE: " + str(mean_squared_error(y_test,predictions)) + '\033[0m')
print('\033[1m' + "MAE: " + str(mean_absolute_error(y_test,predictions)) + '\033[0m')
print(predictions[0].shape)

if COMPUTED_OPTION < 2:
    y_test_suma = np.sum(y_test,axis=1)
    predictions_suma = np.sum(predictions,axis=1)
    print(y_test_suma.shape)
    print(predictions_suma.shape)
    print('\033[1m' + "MSE: " + str(mean_squared_error(y_test_suma,predictions_suma)) + '\033[0m')
    print('\033[1m' + "MAE: " + str(mean_absolute_error(y_test_suma,predictions_suma)) + '\033[0m')

[1mPredicted: 96452.99646759033[0m
[1mReals: 112191.10620117188[0m


<Figure size 640x480 with 0 Axes>

In [None]:
if COMPUTED_OPTION < 2:
    plot_predictions_vs_real(predictions=predictions[1000],reals=y_test[1000])
else:
    print(predictions[0],y_test[0])
    print()

In [None]:
if SAVE_RESULTS:
    file_path = 'Resources/Resultados/Individual/'
    if LOW_DATA:
        file_path += "LowData/"
    documents = ['minuteY','hourY','dayY']
    file = file_path+documents[COMPUTED_OPTION]+"-predictions"+".pkl.gz"
    pickle.dump(predictions, gzip.open(file, 'wb'))
    file = file_path+documents[COMPUTED_OPTION]+"-test"+".pkl.gz"
    pickle.dump(y_test, gzip.open(file, 'wb'))
    file = file_path+documents[COMPUTED_OPTION]+"-X"+".pkl.gz"
    pickle.dump(X_test, gzip.open(file, 'wb'))