<a href="https://colab.research.google.com/github/sanntana21/TFG/blob/first_model_implementation/preprocesamiento_de_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import gzip
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
# Per-dataset switches read by the cells below: "generar" builds the arrays,
# "cargar" reads them back from disk, "pintar" prints shapes / draws plots.
# DATA_SET_MATRICIAL is declared here but not read in the visible cells.
DATA_SET_AGRUPADO = {"generar": False, "cargar": False, "pintar": False}
DATA_SET_INDIVIDUAL = {"generar": False, "pintar": False, "cargar": False}
DATA_SET_MATRICIAL = {"generar": False, "pintar": False}

initial = True

# Sizes used to shape the generated arrays.
MINUTES_PER_DAY = 1440
HOURS_PER_DAY = 24
DAY = 1

# Default first minute for the sliding windows; it is recomputed from the data
# in the processing section further down.
STARTED_MINUTE = 1440
MINUTES_IN_THE_STUDY = 41760

# LOCAL picks relative "Resources/..." paths instead of the Google Drive mount.
LOCAL = True
# Enables the cells that build the reduced ("LowData") data sets.
GENERAR_DATA_SET_LIMITADO = True

# **FUNCIONES AUXILIARES**

In [3]:
def make_preds(model, input_data):
  """
  Uses model to make predictions on input_data.

  Parameters
  ----------
  model: trained model exposing a ``predict(input_data)`` method
  input_data: windowed input data (same kind of data model was trained on)

  Returns
  -------
  The model predictions as a NumPy array with every length-1 axis removed.
  """
  forecast = model.predict(input_data)
  # TensorFlow is not imported in this notebook's import cell, so the previous
  # ``tf.squeeze`` call failed with NameError. ``np.squeeze`` removes the same
  # length-1 axes using the NumPy import that is already available.
  return np.squeeze(forecast)

def plot_predictions_vs_real(predictions, reals):
    """
    Draw predicted and real values as two lines on one interactive Plotly chart.

    Parameters
    ----------
    predictions: sized collection of predicted values; its length sets the
        number of points plotted for both lines.
    reals: indexable collection of observed values; entries
        ``0 .. len(predictions) - 1`` are read.

    The long-format frame (columns ``hour``, ``participant``, ``value``) is
    printed before the figure is shown. Nothing is returned.
    """
    number_of_points = len(predictions)
    hours = np.arange(number_of_points)

    # Build the whole frame in a single constructor call. The previous version
    # appended the "real" rows one at a time with ``df.loc[...] = ...``, which
    # re-allocates the frame on every row.
    predicted_values = np.asarray(predictions)
    real_values = np.asarray([reals[i] for i in range(number_of_points)])
    df = pd.DataFrame({
        "hour": np.concatenate([hours, hours]),
        "participant": ["prediction"] * number_of_points + ["real"] * number_of_points,
        "value": np.concatenate([predicted_values, real_values]),
    })

    print(df)

    # No ``plt.figure(1)`` here: the chart is a Plotly figure, so the matplotlib
    # figure that used to be opened was never drawn on.
    fig = px.line(df, x = "hour", y = "value" , title = "predicitons vs reals" , color = "participant")
    fig.update_xaxes(
        rangeslider_visible = True,
        rangeselector = dict(
            buttons = list([
                dict(count=1,label="1y",step="year",stepmode="backward"),
                dict(count=2,label="2y",step="year",stepmode="backward"),
                dict(count=3,label="3y",step="year",stepmode="backward"),
                dict(step="all")
            ])
        )
    )
    fig.show()

def set_output_precision(decimals):
  """
  Make NumPy and pandas display floats with a fixed number of decimals.

  decimals: number of digits shown after the decimal point.
  """
  np.set_printoptions(precision=decimals)
  # Template such as "{:.6f}"; its bound ``format`` method is the callable
  # pandas uses to render every float.
  float_template = f"{{:.{decimals}f}}"
  pd.options.display.float_format = float_template.format


set_output_precision(6)


def plot_ts(df,dfx="Minute",dfy="METS",_title="DF minute x Mets"):
  """
  Show a series as an interactive Plotly line chart.

  Parameters
  ----------
  df: either a DataFrame that already has the ``dfx`` and ``dfy`` columns, or
      any sized sequence of values, which is wrapped in a two-column frame
      (values under ``dfy``, their positions ``0..len-1`` under ``dfx``).
  dfx: name of the column used for the x axis.
  dfy: name of the column used for the y axis.
  _title: chart title.

  Nothing is returned; the figure is displayed with ``fig.show()``.
  """
  if not isinstance(df,pd.DataFrame):
    # Name the columns after dfx/dfy so non-default axis names also work with
    # plain sequences (the names used to be fixed to 'Minute'/'METS').
    df = pd.DataFrame({dfy: df, dfx: range(len(df))})

  # No ``plt.figure()`` here: the chart is a Plotly figure, and every call used
  # to leave an empty matplotlib figure open.
  fig = px.line(df, x = dfx, y = dfy , title = _title)
  fig.update_xaxes(
      rangeslider_visible = True,
      rangeselector = dict(
          buttons = list([
              dict(count=1,label="1y",step="year",stepmode="backward"),
              dict(count=2,label="2y",step="year",stepmode="backward"),
              dict(count=3,label="3y",step="year",stepmode="backward"),
              dict(step="all")
          ])
      )

  )
  fig.show()

NameError: name 'np' is not defined

# **Procesamiento de los datos**

In [15]:
# Read the per-minute METS data set into a pandas DataFrame.
if not LOCAL:
    # Running on Colab: mount Google Drive first and read the CSV from there.
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/TFG/Resources/METS_in_minutes.csv"
else:
    path = "Resources/METS_in_minutes.csv"

df = pd.read_csv(path, sep=",", dtype={"METS": "float32"})

# Quick look at the data: spread of METS and the first rows
# (the escape codes switch bold text on and off).
print("Desviación de METS:" , df["METS"].std())
print('\033[1m' + "SET OF VALUES\n" + '\033[0m')
print(df.head())

# Count of missing values in the METS column.
total_nan_values = df["METS"].isna().sum()

print('\033[1m' + "\nValores NULOS: "  + '\033[0m' + str(total_nan_values) )

Desviación de METS: 0.5084145
[1mSET OF VALUES
[0m
  participant            timestamp  minute     METS
0       A3FNz  2021-11-16 00:00:00       0 0.000000
1       A3FNz  2021-11-16 00:01:00       1 0.000000
2       A3FNz  2021-11-16 00:02:00       2 0.000000
3       A3FNz  2021-11-16 00:03:00       3 0.000000
4       A3FNz  2021-11-16 00:04:00       4 0.000000
[1m
Valores NULOS: [0m0


In [16]:
# Rows past minute 1440 whose METS value is below 1 (named "errores" = errors).
errores = df.loc[(df["METS"] < 1) & (df["minute"] > 1440)]
# The windows start at the last such minute. When no row matches, ``max()`` is
# NaN and ``int(NaN)`` raises ValueError, so the configured STARTED_MINUTE is
# kept in that case.
if not errores.empty:
    STARTED_MINUTE = int(errores["minute"].max())
# Number of sliding windows per series; used as the second axis of the
# per-participant arrays.
COMBINATIONS = (MINUTES_IN_THE_STUDY - MINUTES_PER_DAY*2 - (STARTED_MINUTE-1))

In [17]:
#Generate trainable sets for the LSTM

def create_minutes_to_minutes_forecasting_sets(values,started_minute = 0,window_minutes = 1440):
    """
    Build sliding input/target windows of METS values, one pair per minute.

    Parameters
    ----------
    values: DataFrame with a ``minute`` column and a ``METS`` column.
    started_minute: first minute used as the start of an input window.
    window_minutes: length in minutes of both the input window and the target
        window that follows it (1440 by default).

    Returns
    -------
    (X, y): two lists of equal length. ``X[k]`` holds the METS values whose
    minute lies in ``[i, i + window_minutes)`` and ``y[k]`` those in
    ``[i + window_minutes, i + 2 * window_minutes)``, with
    ``i = started_minute + k``. Rows are selected by their ``minute`` value,
    so minutes missing from ``values`` produce shorter windows.
    """
    # Look the columns up once instead of on every iteration.
    minutes = values["minute"]
    mets = values["METS"]

    # The last usable start is the one whose target window ends exactly on the
    # largest minute; ``range`` needs the value one past it.
    stop_minute = minutes.max() - 2 * (window_minutes - 1)

    X = []
    y = []
    for first_minute_in_window in range(started_minute, stop_minute):
        last_minute_in_window = first_minute_in_window + window_minutes
        last_minute_in_prediction = last_minute_in_window + window_minutes
        in_input = (minutes >= first_minute_in_window) & (minutes < last_minute_in_window)
        in_target = (minutes >= last_minute_in_window) & (minutes < last_minute_in_prediction)
        X.append(mets[in_input].tolist())
        y.append(mets[in_target].tolist())
    return X,y


def create_minutes_to_hours_forecasting_sets(y_in_minutes,started_minute = 0):
    """
    Collapse each per-minute target window into per-hour totals.

    y_in_minutes: iterable of windows, each a sliceable sequence of
        per-minute values.
    started_minute: offset inside the window of the first hour that is summed.

    Returns a list with one list per window; each entry is the sum of a
    60-value slice, the slices starting every 60 positions from
    ``started_minute`` up to position 1380.
    """
    hour_starts = range(started_minute, 1440 - 59, 60)
    hourly_targets = []
    for minute_window in y_in_minutes:
        hourly_totals = []
        for start in hour_starts:
            hourly_totals.append(sum(minute_window[start:start + 60]))
        hourly_targets.append(hourly_totals)
    return hourly_targets


def create_minutes_to_day_forecasting_sets(y_in_minutes):
    """
    Collapse each per-minute target window into a single total.

    y_in_minutes: iterable of windows of per-minute values.

    Returns a list with one single-element list per window, holding the sum
    of all values in that window.
    """
    return [[sum(minute_window)] for minute_window in y_in_minutes]


# **SETS GENERATION**

## GENERATE AGGREGATED DATA


In [18]:
# DATA_SET_AGRUPADO["generar"] = False
def generate_aggregated_data(df):
  """
  Build the forecasting sets for the series aggregated over all participants.

  df: DataFrame with at least a ``minute`` column and a ``METS`` column.

  Returns (dataX, dataY_minute, dataY_hour, dataY_day). Each is a list with a
  single element (the aggregated series), holding respectively the input
  windows and the per-minute, per-hour and per-day targets. Windows start at
  the module-level STARTED_MINUTE.
  """
  # Step 1: add up METS across participants for every minute. Only the METS
  # column is aggregated: the windowing helper reads nothing but 'minute' and
  # 'METS', and summing the text columns (participant, timestamp) is
  # meaningless work.
  df_aggregated_by_minute = df.groupby('minute')['METS'].sum()

  # Step 2: reset the index so 'minute' is a regular column again.
  df_aggregated_by_minute = df_aggregated_by_minute.reset_index()

  pX,pY = create_minutes_to_minutes_forecasting_sets(df_aggregated_by_minute,started_minute=STARTED_MINUTE)

  # Wrap each result in a one-element list so the outputs keep a leading axis.
  dataX = [pX]
  dataY_minute = [pY]
  dataY_hour = [create_minutes_to_hours_forecasting_sets(pY)]
  dataY_day = [create_minutes_to_day_forecasting_sets(pY)]
  return dataX,dataY_minute,dataY_hour,dataY_day

def load_aggregated_data(file_path=None):
  """
  Load the four pickled aggregated sets: minuteX, minuteY, hourY and dayY.

  file_path: directory prefix (including the trailing separator) that holds
      the ``.pkl`` files. When None, the local ``Resources/Agregado/`` folder
      is used if LOCAL is set, otherwise the Google Drive folder.

  Returns (dataX, dataY_minute, dataY_hour, dataY_day), in that order.
  """
  if file_path is None:
    # Follow the LOCAL switch like the other load/save cells, instead of
    # always pointing at the Colab drive mount.
    if LOCAL:
      file_path = 'Resources/Agregado/'
    else:
      file_path = '/content/drive/MyDrive/TFG/Resources/Agregado/'

  documents = ['minuteX','minuteY','hourY','dayY']
  data_to_load = []
  # Load each list using pickle.
  # NOTE: unpickling can execute arbitrary code; only load trusted files.
  for document in documents:
    with open(file_path+document+".pkl", 'rb') as file:
        data_to_load.append(pickle.load(file))

  return data_to_load[0],data_to_load[1],data_to_load[2],data_to_load[3]

# Start from empty containers so the arrays below exist even when the
# aggregated data set is neither generated nor loaded.
dataX = []
dataY_minute = []
dataY_hour = []
dataY_day = []


if DATA_SET_AGRUPADO["generar"] == True:
  dataX,dataY_minute,dataY_hour,dataY_day = generate_aggregated_data(df)
  # Follow the LOCAL switch like the other save cells; the path used to be the
  # Colab drive mount unconditionally.
  if LOCAL:
    file_path = 'Resources/Agregado/'
  else:
    file_path = '/content/drive/MyDrive/TFG/Resources/Agregado/'
  documents = ['minuteX','minuteY','hourY','dayY']
  data_to_save = [dataX,dataY_minute,dataY_hour,dataY_day]
  # Save each list using pickle
  for i in range(0,4,1):
    with open(file_path+documents[i]+".pkl", 'wb') as file:
        pickle.dump(data_to_save[i], file)
elif DATA_SET_AGRUPADO["cargar"] == True:
  dataX,dataY_minute,dataY_hour,dataY_day = load_aggregated_data()

# Convert to NumPy arrays so the following cells can use .shape and indexing.
dataX = np.array(dataX)
dataY_minute = np.array(dataY_minute)
dataY_hour = np.array(dataY_hour)
dataY_day = np.array(dataY_day)

In [19]:
if DATA_SET_AGRUPADO["pintar"] == True:
  print(dataX.shape)
  print(dataY_minute.shape)
  print(dataY_hour.shape)
  print(dataY_day.shape)
  # The targets of an input window sit at the same window index in every Y
  # array, so one index is used for all four displays. The previous indices
  # (1440, 14400 and 2880) pointed at three different, later windows and did
  # not match the titles, which describe the prediction for dataX[0][0].
  first_window = 0
  plot_ts(dataX[0][first_window],_title="Serie temporal agregada en minutos del día 1")
  plot_ts(dataY_minute[0][first_window],_title="Predicción de la serie temporal agregada en minutos del día 1, es decir el día 2")
  plot_ts(dataY_hour[0][first_window],_title="Predicción de la serie temporal agregada en horas del día 1, es decir el día 2")
  print('\033[1m' + "Valor de METS en días para la predicción del día 1, es decir dia 2 "+'\033[0m',dataY_day[0][first_window])

## INDIVIDUAL

In [20]:

if DATA_SET_INDIVIDUAL["generar"] == True:
  participants = df["participant"].unique()
  n_participants = len(participants)

  # One zero-filled float32 array per output, indexed
  # [participant, window, value].
  dataX = np.zeros((n_participants,COMBINATIONS,MINUTES_PER_DAY),dtype=np.float32)
  dataY_minute = np.zeros((n_participants,COMBINATIONS,MINUTES_PER_DAY),dtype=np.float32)
  dataY_hour = np.zeros((n_participants,COMBINATIONS,HOURS_PER_DAY),dtype=np.float32)
  dataY_day = np.zeros((n_participants,COMBINATIONS,DAY),dtype=np.float32)

  for slot, participant in enumerate(participants):
    participant_rows = df.loc[df["participant"] == participant]
    input_windows,target_windows = create_minutes_to_minutes_forecasting_sets(participant_rows,started_minute=STARTED_MINUTE)
    dataX[slot] = input_windows
    dataY_minute[slot] = target_windows
    # Drop the temporary lists once copied (presumably to limit memory use).
    del input_windows
    dataY_hour[slot] = create_minutes_to_hours_forecasting_sets(target_windows)
    dataY_day[slot] = create_minutes_to_day_forecasting_sets(target_windows)
    del target_windows


In [21]:
if DATA_SET_INDIVIDUAL["generar"]:
    # Persist the freshly generated per-participant arrays as gzip-compressed
    # pickles, one file per array.
    if LOCAL:
        file_path = 'Resources/Individual/'
    else:
        file_path = '/content/drive/MyDrive/TFG/Resources/Individual/'
    documents = ['minuteX','minuteY','hourY','dayY']
    data_to_save = [dataX,dataY_minute,dataY_hour,dataY_day]
    # Save each array using pickle
    for i in range(0,4,1):
        file = file_path+documents[i]+".pkl.gz"
        # Open inside ``with`` so the gzip stream is flushed and closed
        # deterministically; the handle used to be left open.
        with gzip.open(file, 'wb') as compressed_file:
            pickle.dump(data_to_save[i], compressed_file)

In [8]:
if GENERAR_DATA_SET_LIMITADO:
    if not DATA_SET_INDIVIDUAL["generar"]:
        # The per-participant arrays were not generated in this run, so read
        # them back from the folder the individual save cell writes to
        # (following the LOCAL switch, like that cell does).
        if LOCAL:
            PATH = "Resources/Individual/"
        else:
            PATH = "/content/drive/MyDrive/TFG/Resources/Individual/"
        documents = ['minuteX','minuteY','hourY','dayY']

        loaded_arrays = []
        for document in documents:
            file = PATH+document+".pkl.gz"
            # ``with`` closes the gzip handle after reading; it used to be
            # left open.
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            with gzip.open(file, 'rb') as compressed_file:
                loaded_arrays.append(np.array(pickle.load(compressed_file),np.float32))
        dataX,dataY_minute,dataY_hour,dataY_day = loaded_arrays
        del loaded_arrays

    # Variability of each first-axis entry of X: the standard deviation taken
    # over its other two axes.
    variability = np.std(dataX, axis=(1, 2))

    # Indices of the 25 entries with the highest variability
    # (all of them when there are fewer than 25).
    top_indices = np.argsort(variability)[-25:]

    # Take those same entries from X and from every Y array.
    selected_dataX = dataX[top_indices, :, :]
    selected_dataY_minute = dataY_minute[top_indices, :, :]
    selected_dataY_hour = dataY_hour[top_indices, :, :]
    selected_dataY_day = dataY_day[top_indices, :, :]


In [9]:
    # Save the reduced per-participant arrays as gzip-compressed pickles.
    # NOTE(review): this only runs when both "generar" and "cargar" are set for
    # the individual data set - confirm that requiring both is intended.
    if GENERAR_DATA_SET_LIMITADO and DATA_SET_INDIVIDUAL["generar"] and DATA_SET_INDIVIDUAL["cargar"]:

        if LOCAL:
          file_path = 'Resources/Individual/LowData/'
        else:
          file_path = '/content/drive/MyDrive/TFG/Resources/Individual/LowData/'

        documents = ['minuteX','minuteY','hourY','dayY']
        data_to_save = [selected_dataX,selected_dataY_minute,selected_dataY_hour,selected_dataY_day]
        # Save each array using pickle
        for i in range(0,4,1):
            file = file_path+documents[i]+".pkl.gz"
            # ``with`` flushes and closes the gzip stream deterministically;
            # the handle used to be left open.
            with gzip.open(file, 'wb') as compressed_file:
                pickle.dump(data_to_save[i], compressed_file)

In [17]:
if GENERAR_DATA_SET_LIMITADO:
        # Build the reduced aggregated data set by summing the selected
        # entries over the first axis, then save it as gzip-compressed pickles.
        if LOCAL:
          file_path = 'Resources/Agregado/LowData/'
        else:
          file_path = '/content/drive/MyDrive/TFG/Resources/Agregado/LowData/'

        documents = ['minuteX','minuteY','hourY','dayY']
        data_to_save = [np.sum(selected_dataX,axis=0),np.sum(selected_dataY_minute,axis=0),np.sum(selected_dataY_hour,axis=0),np.sum(selected_dataY_day,axis=0)]
        # Save each array using pickle
        for i in range(0,4,1):
            file = file_path+documents[i]+".pkl.gz"
            # ``with`` flushes and closes the gzip stream deterministically;
            # the handle used to be left open.
            with gzip.open(file, 'wb') as compressed_file:
                pickle.dump(data_to_save[i], compressed_file)