# **PREDICCIONES PARA UN SOLO PARTICIPANTE**

In [1]:
from statistics import mean, median
import pandas as pd
import numpy as np
import pickle
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import random as random
import plotly.express as px
import tensorflow as tf
from tensorflow.keras import layers

def set_output_precision(decimals):
  """
  format the output of the all the data structures
  with an specific number of decimals
  """
  np.set_printoptions(precision=decimals)
  into='{'+':.{}f'.format(decimals)+'}'
  pd.options.display.float_format = into.format
  torch.set_printoptions(precision=decimals)
  pass

def plot_ts(df,dfx="Minute",dfy="METS",_title="DF minute x Mets"):
  if not isinstance(df,pd.DataFrame):
    df = pd.DataFrame({'METS': df, 'Minute': range(len(df))})

  plt.figure()
  fig = px.line(df, x = dfx, y = dfy , title = _title)
  fig.update_xaxes(
      rangeslider_visible = True,
      rangeselector = dict(
          buttons = list([
              dict(count=1,label="1y",step="year",stepmode="backward"),
              dict(count=2,label="2y",step="year",stepmode="backward"),
              dict(count=3,label="3y",step="year",stepmode="backward"),
              dict(step="all")
          ])
      )

  )
  fig.show()

def plot_predictions_vs_real(predictions, reals):
  df = pd.DataFrame()
  number_of_points = len(predictions)
  df["time"] = range(0,number_of_points)
  df["participant"] = "prediction"
  df["value"] = predictions
  for i in range(0,number_of_points):
    df.loc[number_of_points+i] = [i,"real",reals[i]]

  plt.figure(1)
  fig = px.line(df, x = "time", y = "value" , title = "predictions vs reals" , color = "participant")
  fig.update_xaxes(
        rangeslider_visible = True,
        rangeselector = dict(
            buttons = list([
                dict(count=1,label="1y",step="year",stepmode="backward"),
                dict(count=2,label="2y",step="year",stepmode="backward"),
                dict(count=3,label="3y",step="year",stepmode="backward"),
                dict(step="all")
            ])
        )

    )
  fig.show()

In [2]:
INDEXS = [{"train":[{"start":{"day":2,"hour":0,"minute":0},
                     "end":{"day":20,"hour":22,"minute":0}}
                    ],
           "validation":[{"start":{"day":22,"hour":0,"minute":0},
                     "end":{"day":24,"hour":22,"minute":0}}
                    ],
           "test":[{"start":{"day":26,"hour":0,"minute":0},
                     "end":{"day":27,"hour":22,"minute":0}}
                    ]}
          ]

In [3]:
READ_LOCAL_DATA = True
COMPUTED_OPTION = 0
TEST_SIZE = 0.2
VALIDATION_SIZE = 0.15
SAVE_RESULTS = True
LOW_DATA = True
SPLIT_INTO_TWO_DAYS = True
MULTI_STEP_FORECAST = False
SPLIT = 0
np.random.seed(42)

In [4]:
if READ_LOCAL_DATA:
  PATH = "Resources/Individual/"
else:
    #  We start by getting access to the drive
    from google.colab import drive
    drive.mount('/content/drive')
    PATH = "/content/drive/MyDrive/TFG/Resources/Individual/"

if LOW_DATA:
    PATH += "LowData/"

In [5]:
import gzip
documents = ['minuteY','hourY','dayY']
# with open(PATH+"minuteX"+".pkl", 'rb') as file:
#     dataX = np.array(pickle.load(file),np.float32)
file = PATH+"minuteX.pkl.gz"
dataX = np.array(pickle.load(gzip.open(file, 'rb')),np.float32)

file = PATH+documents[COMPUTED_OPTION]+".pkl.gz"
dataY = np.array(pickle.load(gzip.open(file, 'rb')),np.float32)


In [6]:
NUMBER_OF_PARTICIPANTS = dataX.shape[0]
if MULTI_STEP_FORECAST:
    PREDICTED_HORIZON = 1
    if COMPUTED_OPTION == 0:
        dataX_nuevo = dataX
        JUMP = 1
    elif COMPUTED_OPTION == 1:
        JUMP = 60
        dataX_nuevo = np.ones(shape=(NUMBER_OF_PARTICIPANTS,dataX.shape[1],24))
        for k in range(0,NUMBER_OF_PARTICIPANTS):
            for i in range(0,dataX.shape[1]):
                for j in range(0,24):
                    dataX_nuevo[k,i,j] = np.sum(dataX[k,i,60*j:60*(j+1)])
    else:
        JUMP = 1440
        dataX_nuevo = np.ones(shape=(dataX.shape[0],1))
        for k in range(0,NUMBER_OF_PARTICIPANTS):
            for i in range(0,dataX.shape[0]):
                dataX_nuevo[k,i,:] = np.sum(dataX[k,i,:])

In [7]:
def change_shape_by_participant(data):
    original_shape = data.shape
    new_shape = (original_shape[0] * original_shape[1], original_shape[2])
    reshaped_array = data.reshape(new_shape)
    return reshaped_array

In [8]:
if MULTI_STEP_FORECAST:
    dataX_nuevo = dataX_nuevo.transpose(1,0,2)

#We split a test set for testing
# train_test_split(dataX, dataY, test_size=TEST_SIZE)

def calculate_index(time):
  minute_index = time["day"] * 1440 + time["hour"]*60 + time["minute"]
  return minute_index

def get_split(dataX,dataY,index):
  start = calculate_index(index[0]["start"])
  end = calculate_index(index[0]["end"])
  X_split = dataX[:,start:end,:]
  y_split = dataY[:,start:end,:]
  if len(index) > 1:
    for i in range(1,index):
      start = calculate_index(index[i]["start"])
      end = calculate_index(index[i]["end"])
      X_split = np.concatenate(X_split,dataX[:,start:end,:])
      y_split = np.concatenate(y_split,dataY[:,start:end,:])
  return X_split,y_split

def train_test_validation_split(dataX,dataY,indexs):
  X_train,y_train = get_split(dataX,dataY,indexs["train"])
  X_validation,y_validation = get_split(dataX,dataY,indexs["validation"])
  X_test,y_test = get_split(dataX,dataY,indexs["test"])
  return X_train,y_train,X_validation,y_validation,X_test,y_test


X_train,y_train,X_validation,y_validation,X_test,y_test = train_test_validation_split(dataX,dataY,INDEXS[SPLIT])
print("Examples for training\n","X:",X_train.shape,"y:",y_train.shape)
print("Examples for validation\n","X:",X_validation.shape,"y:",y_validation.shape)
print("Examples for test\n","X:",X_test.shape,"y:",y_test.shape)

Examples for training
 X: (25, 27240, 1440) y: (25, 27240, 120)
Examples for validation
 X: (25, 4200, 1440) y: (25, 4200, 120)
Examples for test
 X: (25, 2760, 1440) y: (25, 2760, 120)


In [9]:
X_train,y_train,X_validation,y_validation,X_test,y_test = [ i.transpose(1,0,2) for i in
    [X_train,y_train,X_validation,y_validation,X_test,y_test ]
    ]
print("Examples for training\n","X:",X_train.shape,"y:",y_train.shape)
print("Examples for validation\n","X:",X_validation.shape,"y:",y_validation.shape)
print("Examples for test\n","X:",X_test.shape,"y:",y_test.shape)

Examples for training
 X: (27240, 25, 1440) y: (27240, 25, 120)
Examples for validation
 X: (4200, 25, 1440) y: (4200, 25, 120)
Examples for test
 X: (2760, 25, 1440) y: (2760, 25, 120)


In [10]:

X_train,y_train,X_validation,y_validation,X_test,y_test = [ change_shape_by_participant(i) for i in
    [X_train,y_train,X_validation,y_validation,X_test,y_test ]
    ]
print("Examples for training\n","X:",X_train.shape,"y:",y_train.shape)
print("Examples for validation\n","X:",X_validation.shape,"y:",y_validation.shape)
print("Examples for test\n","X:",X_test.shape,"y:",y_test.shape)

Examples for training
 X: (681000, 1440) y: (681000, 120)
Examples for validation
 X: (105000, 1440) y: (105000, 120)
Examples for test
 X: (69000, 1440) y: (69000, 120)


In [11]:
tf.random.set_seed(42)
# Setup dataset hyperparameters
HORIZON = y_test.shape[1]
WINDOW_SIZE = X_test.shape[1]

# Let's build an LSTM model with the Functional API
inputs = layers.Input(shape=(WINDOW_SIZE))
x = layers.Lambda(lambda x: tf.expand_dims(x, axis=1))(inputs) # expand input dimension to be compatible with LSTM
# print(x.shape)
# x = layers.LSTM(128, activation="relu", return_sequences=True)(inputs) # this layer will error if the inputs are not the right shape
x = layers.LSTM(128,return_sequences=True, activation="relu")(x) # using the tanh loss function results in a massive error
# print(x.shape)
# Add another optional dense layer (you could add more of these to see if they improve model performance)
# x = layers.Dense(32, activation="relu")(x)
output = layers.Dense(HORIZON)(x)
model_LSTM = tf.keras.Model(inputs=inputs, outputs=output, name="model_5_lstm")

model_LSTM.summary()

Model: "model_5_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1440)]            0         
                                                                 
 lambda (Lambda)             (None, 1, 1440)           0         
                                                                 
 lstm (LSTM)                 (None, 1, 128)            803328    
                                                                 
 dense (Dense)               (None, 1, 120)            15480     
                                                                 
Total params: 818,808
Trainable params: 818,808
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Compile model
model_LSTM.compile(loss="mae",
                optimizer=tf.keras.optimizers.Adam(),
             metrics=["mae"])

In [13]:
# Seems when saving the model several warnings are appearing: https://github.com/tensorflow/tensorflow/issues/47554
model_LSTM.fit(X_train,
            y_train,
            epochs=3,
            verbose=1,
            batch_size=128,
            shuffle=True,
            validation_data=(X_validation, y_validation)
               )

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x266efc0fc70>

In [14]:
def make_preds(model, input_data):
  """
  Uses model to make predictions on input_data.

  Parameters
  ----------
  model: trained model 
  input_data: windowed input data (same kind of data model was trained on)

  Returns model predictions on input_data.
  """
  forecast = model.predict(input_data)
  return tf.squeeze(forecast) # return 1D array of predictions

predictions = make_preds(model_LSTM, X_test)



In [18]:
print(predictions.shape)
print(y_test.shape)
DATA_BY_PARTICIPANT = int(y_test.shape[0]/25)
poblational_prediction = np.ones(shape=(DATA_BY_PARTICIPANT,HORIZON))
poblational_y_test = np.ones(shape=(DATA_BY_PARTICIPANT,HORIZON))
poblational_X_test = np.ones(shape=(DATA_BY_PARTICIPANT,WINDOW_SIZE))
for i in range(0,DATA_BY_PARTICIPANT):
    poblational_prediction[i,:] = np.sum(np.array(predictions[i::DATA_BY_PARTICIPANT]),axis=0)
    poblational_y_test[i,:] = np.sum(np.array(y_test[i::DATA_BY_PARTICIPANT]),axis=0)
    poblational_X_test[i,:] = np.sum(np.array(X_test[i::DATA_BY_PARTICIPANT]),axis=0)
print(poblational_prediction.shape)
print(poblational_y_test.shape)

print('\033[1m' + "MSE: " + str(mean_squared_error(poblational_y_test,poblational_prediction)) + '\033[0m')
print('\033[1m' + "MAE: " + str(mean_absolute_error(poblational_y_test,poblational_prediction)) + '\033[0m')

(69000, 120)
(69000, 120)
(2760, 120)
(2760, 120)
[1mMSE: 41.978508314498676[0m
[1mMAE: 5.4566724717789805[0m
38.2359308714452


In [27]:
SPLIT_INTO_TWO_DAYS = True
if not MULTI_STEP_FORECAST:
  print("Resultados poblacionales cada dos horas")
  print('\033[1m' + "MSE: " + str(mean_squared_error(np.sum(poblational_y_test,axis=1),np.sum(poblational_prediction,axis=1))) + '\033[0m')
  print('\033[1m' + "MAE: " + str(mean_absolute_error(np.sum(poblational_y_test,axis=1),np.sum(poblational_prediction,axis=1))) + '\033[0m')

  list_of_MAE = [  mean_squared_error(poblational_prediction[i],poblational_y_test[i]) for i in range(0,len(poblational_y_test)) ]
  list_of_values = sorted(list_of_MAE)
  mean_value = mean(list_of_MAE)
  closest_value = min(list_of_MAE, key=lambda x: abs(x - mean_value))
  # Crear un array de índices
  indices = [list_of_MAE.index(list_of_values[-1]),
             list_of_MAE.index(closest_value) ,
            list_of_MAE.index(list_of_MAE[0])]
  if COMPUTED_OPTION == 0:
    for i in indices:
      plot_predictions_vs_real(poblational_prediction[i],poblational_y_test[i])
  else:
    for i in indices:
        END = 24
        STARTED_MINUTE = 0
        previous = np.ones(shape=(24))
        for j in range(0,24):
            previous[j] = np.sum(poblational_X_test[i,:][60*j:60*(j+1)])
        predictions_to_plot = np.ones(shape=(END + HORIZON))
        predictions_to_plot[0:END] = previous[:]
        predictions_to_plot[END:] = poblational_prediction[i,:]
        y_test_to_plot = np.ones(shape=(END + HORIZON))
        y_test_to_plot[0:END] = previous[:]
        y_test_to_plot[END:] = poblational_y_test[i,:]
        plot_predictions_vs_real(predictions_to_plot,y_test_to_plot)

if SPLIT_INTO_TWO_DAYS:
    index = 0
    period = poblational_X_test[::120,:]
    period_results = make_preds(model_LSTM,period)
    period_results_aux = np.array(period_results)
    period_results_to_plot = np.array(period_results_aux).reshape(HORIZON*23)
    y_test_to_plot = poblational_y_test[::120,:].reshape(HORIZON*23)
    plot_predictions_vs_real(predictions=period_results_to_plot,reals=y_test_to_plot)
    print('\033[1m' + "Predicted: " + str(np.sum(period_results_to_plot)) + '\033[0m')
    print('\033[1m' + "Reals: " + str(np.sum(y_test_to_plot)) + '\033[0m')




Resultados poblacionales cada dos horas
[1mMSE: 509778.57483326335[0m
[1mMAE: 654.2654295437578[0m
2760.0
4588.311704573424
150.16803119581917
[32.58561707 32.66766357 32.54138947 32.71694565 32.60134506 32.651577
 32.54296875 32.70872116 32.66831589 32.59570312 32.59423065 32.57945251
 32.65180206 32.51951981 32.67578125 32.72210693 32.6065979  32.66819382
 32.5971756  32.66856766 32.74102783 32.63061523 32.7170105  32.64454269
 32.5850296  32.66909027 32.61981201 32.59049988 32.66648102 32.67501068
 32.65297699 32.67306519 32.66298676 32.66686249 32.59670258 32.52506638
 32.63100052 32.62992859 32.67923355 32.68297958 32.67300797 32.59430313
 32.70776367 32.67569351 32.70202255 32.60602951 32.67074966 32.6280632
 32.64499283 32.64935684 32.61818314 32.69462585 32.66735458 32.66839218
 32.62084198 32.59714127 32.62778473 32.61611557 32.59283447 32.59291077
 32.64754105 32.70743561 32.60506439 32.62212372 32.62289047 32.62571335
 32.66788101 32.62095642 32.66701126 32.65611649 32.6

41.974513284522985
[32.7795372  32.89960861 32.73506165 32.92575073 32.81285095 32.84736633
 32.72824478 32.90553665 32.87005997 32.7787056  32.78504181 32.78803253
 32.847435   32.72325134 32.89249802 32.91246414 32.80247116 32.8719635
 32.80646515 32.85671616 32.94511414 32.83035278 32.91082382 32.83572006
 32.77560425 32.88193512 32.82875061 32.79269791 32.87374115 32.87603378
 32.86368942 32.89567947 32.85132217 32.87237167 32.77001572 32.73009491
 32.81793213 32.80827332 32.91628265 32.88034821 32.87552643 32.79595947
 32.91116333 32.8808403  32.92106628 32.81320953 32.91477203 32.8156662
 32.85722351 32.86215973 32.80931473 32.90599823 32.87325668 32.8776207
 32.79523849 32.78198624 32.82612991 32.79101181 32.7802124  32.78537369
 32.85498047 32.92443848 32.79891968 32.82031631 32.8132782  32.83158875
 32.87650299 32.8190918  32.8686676  32.86565018 32.88318253 32.81239319
 32.78398895 32.88329697 32.91752243 32.80948257 32.8005867  32.91318893
 32.87789917 32.83710861 32.8460960

89.01856442621829
[32.69507599 32.82112885 32.64828873 32.88540268 32.73620605 32.77863693
 32.64929962 32.85697556 32.79894257 32.71922684 32.71670532 32.68888855
 32.79359436 32.63150024 32.82315063 32.86775208 32.72337723 32.80935669
 32.72385788 32.80502319 32.90407181 32.75906754 32.7953949  32.77507019
 32.71868515 32.8120079  32.75866318 32.70547485 32.81274414 32.80908585
 32.80936813 32.8285141  32.7962532  32.81853867 32.69830322 32.63717651
 32.76296616 32.75663376 32.8564682  32.83354187 32.82884598 32.71522522
 32.86607742 32.8175354  32.86092758 32.73984146 32.83390045 32.75181198
 32.78683472 32.78475189 32.73564148 32.85048294 32.80931091 32.81906891
 32.74039841 32.68660736 32.74852371 32.73398209 32.70466232 32.70604706
 32.79874802 32.86841965 32.72435379 32.75544739 32.74803543 32.76729584
 32.80831909 32.74608994 32.81856537 32.80971527 32.82486725 32.74055862
 32.68347931 32.81122971 32.84734726 32.75168991 32.7327652  32.86455154
 32.8173027  32.76263428 32.79091



[1mPredicted: 25683.918[0m
[1mReals: 111454.35481262207[0m


<Figure size 640x480 with 0 Axes>

In [17]:
if SAVE_RESULTS:
  if READ_LOCAL_DATA:
    file_path = 'Resources/Resultados/Individual/'
  else:
    file_path = "/content/drive/MyDrive/TFG/Resources/Resultados/Individual/"
    if LOW_DATA:
        file_path += "LowData/"

    file_path += "Split"+str(SPLIT)+"/"
    documents = ['minuteY','hourY','dayY']
    file = file_path+documents[COMPUTED_OPTION]+"-predictions"+".pkl.gz"
    pickle.dump(predictions, gzip.open(file, 'wb'))
    file = file_path+documents[COMPUTED_OPTION]+"-test"+".pkl.gz"
    pickle.dump(y_test, gzip.open(file, 'wb'))
    file = file_path+documents[COMPUTED_OPTION]+"-X"+".pkl.gz"
    pickle.dump(X_test, gzip.open(file, 'wb'))