### Author: Rafael de Oliveira Magalhães

# PEMSd3 Dataset - Cleaning Data

# Download Data

All data can be downloaded from the website https://pems.dot.ca.gov/.

First, it is necessary to create an account to access the data.

After logging in, go to 'Data Clearinghouse'. Then access:

- 'Station 5-Minute' -> 'District 3'. On this page, you will find data captured by sensors from 2001 to the present moment. Each day captured by the sensors is recorded in a single .txt file. To simplify the download, it is recommended to use an extension to download multiple files automatically.
- 'Station Metadata' -> 'District 3'. On this page, you can download the metadata files for the sensors, which are used to generate a map of the monitored roadways.

Additionally, a list containing a subset of the sensors is provided in the PEMSd3.csv file.

# Imports

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
#os.environ['CUDA_VISIBLE_DEVICES'] = ''

import tensorflow as tf

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, SeparableConv2D
from keras.regularizers import l2
from keras.optimizers import SGD, RMSprop
from keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
import tensorflow.keras.backend as K
from keras.metrics import Metric
from keras.utils import plot_model
from keras.layers import Add, Concatenate, Input, GlobalAveragePooling2D, Layer
from keras import models, initializers
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

from spektral.datasets import TUDataset
from spektral.layers import GCNConv, GlobalSumPool, ChebConv
from spektral.data import SingleLoader, BatchLoader
from spektral.data import Graph
from spektral.data import Dataset

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox
from scipy.stats import f_oneway, f, kstest, norm, ks_2samp, kendalltau
from scipy.interpolate import interp2d, RegularGridInterpolator, RectBivariateSpline, griddata

# Helper libraries

from bokeh.io import show
from bokeh.plotting import gmap
from bokeh.models import GMapOptions
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
import csv
import pandas as pd
import geopandas as gpd
import osmnx as ox
import networkx as nx
import folium
import pyproj
import math as m
import numpy as np
import random
import scipy as sp
import datetime as dt
import re
import time
import gmaps as gm
from shapely.geometry import Point, LineString
from shapely import wkt
from numba import jit, cuda
from sodapy import Socrata
from datetime import datetime
import matplotlib.pyplot as plt
import warnings as w

# Verify GPU

In [None]:
# Report how many GPU devices TensorFlow can see in this session.
# NOTE(review): the imports cell sets CUDA_VISIBLE_DEVICES to '-1', which is
# expected to hide every CUDA device, so this presumably prints 0 - confirm.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
# List the GPUs TensorFlow can see. The original cell read `gpus` without any
# visible definition, which raises NameError when the notebook runs top to bottom.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Allocate VRAM memory as needed
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print("Memória da VRAM alocada conforme necessário.")
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)

In [None]:
# If at least one GPU is visible, enable memory growth on the first one and
# make it the only GPU visible to TensorFlow.
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

# Auxiliary Functions

In [None]:
def generate_dict(array: list) -> dict:
    """
    Map every value of a sequence to its position in that sequence.

    When a value occurs more than once, the position of its last
    occurrence is kept.
    """
    positions = {}
    for position, item in enumerate(array):
        positions[item] = position
    return positions

In [None]:
def dict_sort(dictt):
    """
    Return a new dict whose entries are ordered by ascending value.
    """
    ordered_pairs = sorted(dictt.items(), key=lambda pair: pair[1])
    return {key: value for key, value in ordered_pairs}

In [None]:
def binary_search(element, array: list):
    """
    Look for element in an ascending-sorted array by bisection.

    Returns:
    The index of a matching entry, or None when no entry matches.
    """
    low, high = 0, len(array) - 1
    while low <= high:
        middle = (low + high) // 2
        candidate = array[middle]
        if element > candidate:
            # The match, if any, lies in the upper half.
            low = middle + 1
            continue
        if element < candidate:
            # The match, if any, lies in the lower half.
            high = middle - 1
            continue
        return middle
    return None

In [None]:
def create_list_datetime(initial_date: dt.datetime, length: int, step_minutes: int = 5) -> list:
    """
    Create a list of evenly spaced datetime objects that represents a time series.

    Parameters:
    - initial_date: First date of the list
    - length: Expected length of the generated list; values <= 0 give an empty list
    - step_minutes: Spacing between consecutive dates, in minutes. Defaults to 5,
      the spacing that was previously hard-coded.

    Returns:
    A list with `length` datetimes starting at initial_date.
    """
    # The step does not change between iterations, so build it once.
    step = dt.timedelta(minutes=step_minutes)
    array_dates = []
    current = initial_date
    while len(array_dates) < length:
        array_dates.append(current)
        current += step
    return array_dates

In [None]:
def list_datetime_timedelta(initial_date: dt.datetime, final_date: dt.datetime, timedelta: int) -> list:
    """
    Create a list of datetime objects from initial_date up to and including
    final_date, advancing by `timedelta` minutes at each step.

    Parameters:
    - initial_date: First date of the list
    - final_date: Upper bound (inclusive) of the list
    - timedelta: Step between consecutive dates, in minutes; must be positive

    Returns:
    The list of generated dates (empty when initial_date is after final_date).

    Raises:
    ValueError: if timedelta is not positive while there is a range to cover,
    because the loop could never reach final_date.
    """
    if initial_date > final_date:
        return []
    if timedelta <= 0:
        raise ValueError("timedelta must be a positive number of minutes")
    step = dt.timedelta(minutes=timedelta)
    array_dates = []
    current = initial_date
    while current <= final_date:
        array_dates.append(current)
        current += step
    return array_dates

# Methods - Neural Network Input Generation

In [None]:
def data_short_time(temporal_series: np.array, list_dates: list, s: int, current_time: dt.datetime) -> tuple:
    """
    Generate short time data in format of numpy array.

    Parameters:
    - temporal_series: a numpy array that represents a temporal series; when it
      is 2-D the time axis is the second one (columns)
    - list_dates: sorted list with the dates of the temporal series
    - s: number of consecutive samples immediately before current_time to select
    - current_time: The current time selected; must be present in list_dates

    Returns:
    A tuple (previous_data, expected_vector): the s samples that precede
    current_time and the sample at current_time.

    Raises:
    ValueError: if current_time is not found in list_dates.

    NOTE: when fewer than s samples precede current_time the slice start is
    negative and numpy counts it from the end of the axis, so the returned
    window is not the intended history.
    """
    dim = temporal_series.ndim
    index_d = binary_search(current_time, list_dates)
    if index_d is None:
        # Fail with a clear message instead of an opaque TypeError on None arithmetic.
        raise ValueError(f"current_time {current_time} is not present in list_dates")
    start = index_d - s
    if dim == 2:
        expected_vector = temporal_series[:, index_d]
        previous_data = temporal_series[:, start:index_d]
    else:
        expected_vector = temporal_series[index_d]
        previous_data = temporal_series[start:index_d]
    return (previous_data, expected_vector)

In [None]:
def data_medium_time(temporal_series: np.array, list_dates: list, m: int, mm: int, current_time: dt.datetime) -> tuple:
    """
    Generate medium time data in format of numpy array.

    Parameters:
    - temporal_series: A numpy array that represents a temporal series; when it
      is 2-D the time axis is the second one (columns)
    - list_dates: sorted list with the dates of the temporal series
    - m: The number of samples in a medium time data
    - mm: stride, in series samples, between two selected samples
    - current_time: The current time selected; must be present in list_dates

    Returns:
    A tuple (previous_data, expected_vector): the samples taken every mm steps
    inside the m * mm samples that precede current_time, and the sample at
    current_time.

    Raises:
    ValueError: if current_time is not found in list_dates.

    NOTE: when fewer than m * mm samples precede current_time the slice start
    is negative and numpy counts it from the end of the axis, so the returned
    window is not the intended history.
    """
    index_d = binary_search(current_time, list_dates)
    if index_d is None:
        # Fail with a clear message instead of an opaque TypeError on None arithmetic.
        raise ValueError(f"current_time {current_time} is not present in list_dates")
    dim = temporal_series.ndim
    # Total span, in series samples, covered by the medium-term window.
    span = m * mm
    start = index_d - span
    if dim == 2:
        expected_vector = temporal_series[:, index_d]
        previous_data = temporal_series[:, start:index_d:mm]
    else:
        expected_vector = temporal_series[index_d]
        previous_data = temporal_series[start:index_d:mm]
    return (previous_data, expected_vector)

In [None]:
def data_long_time(temporal_series: np.array, list_dates: list, l: int, ll: int, current_time: dt.datetime) -> tuple:
    """
    Generate long time data in format of numpy array.

    Parameters:
    - temporal_series: A numpy array that represents a temporal series; when it
      is 2-D the time axis is the second one (columns)
    - list_dates: sorted list with the dates of the temporal series
    - l: The number of samples in a long time data
    - ll: stride, in series samples, between two selected samples
    - current_time: The current time selected; must be present in list_dates

    Returns:
    A tuple (previous_data, expected_vector): the samples taken every ll steps
    inside the l * ll samples that precede current_time, and the sample at
    current_time.

    Raises:
    ValueError: if current_time is not found in list_dates.

    NOTE: when fewer than l * ll samples precede current_time the slice start
    is negative and numpy counts it from the end of the axis, so the returned
    window is not the intended history.
    """
    index_d = binary_search(current_time, list_dates)
    if index_d is None:
        # Fail with a clear message instead of an opaque TypeError on None arithmetic.
        raise ValueError(f"current_time {current_time} is not present in list_dates")
    dim = temporal_series.ndim
    # Total span, in series samples, covered by the long-term window.
    span = l * ll
    start = index_d - span
    if dim == 2:
        expected_vector = temporal_series[:, index_d]
        previous_data = temporal_series[:, start:index_d:ll]
    else:
        expected_vector = temporal_series[index_d]
        previous_data = temporal_series[start:index_d:ll]
    return (previous_data, expected_vector)

In [None]:
def concatenation(short_data: np.array, medium_data: np.array, long_data: np.array) -> np.array:
    """
    Join short, medium and long term data, in that order.

    1-D inputs are joined along axis 0; otherwise the join happens along axis 1.
    """
    join_axis = 0 if short_data.ndim == 1 else 1
    pieces = (short_data, medium_data, long_data)
    return np.concatenate(pieces, axis=join_axis)

In [None]:
def elements_medium_term(current_time: dt.datetime, medium_timedelta: int, sample_timedelta: int) -> int:
    """
    Calculate the number of elements in a medium term.

    The result is the number of sampling instants t = start + k * step
    (k = 0, 1, ...) that fall strictly before current_time, where
    start = current_time - medium_timedelta hours. This equals the ceiling of
    (medium_timedelta hours) / (sample_timedelta minutes).

    Args:
    - current_time: The current time selected. It does not influence the count
      and is kept for interface compatibility.
    - medium_timedelta: Length of the medium term window, in hours
    - sample_timedelta: Spacing between consecutive samples, in minutes

    Returns:
    The number of samples (0 when the window length is not positive).

    Raises:
    ValueError: if sample_timedelta is not positive for a positive window,
    a case in which step-by-step counting would never terminate.
    """
    zero = dt.timedelta(0)
    window = dt.timedelta(hours=medium_timedelta)
    if window <= zero:
        return 0
    step = dt.timedelta(minutes=sample_timedelta)
    if step <= zero:
        raise ValueError("sample_timedelta must be a positive number of minutes")
    # Ceiling division on exact timedelta arithmetic.
    return -((-window) // step)

In [None]:
def elements_long_term(current_time: dt.datetime, long_timedelta: int, sample_timedelta: int) -> int:
    """
    Calculate the number of elements in a long term.

    The result is the number of sampling instants t = start + k * step
    (k = 0, 1, ...) that fall strictly before current_time, where
    start = current_time - long_timedelta hours. This equals the ceiling of
    (long_timedelta hours) / (sample_timedelta minutes).

    Args:
    - current_time: The current time selected. It does not influence the count
      and is kept for interface compatibility.
    - long_timedelta: Length of the long term window, in hours
    - sample_timedelta: Spacing between consecutive samples, in minutes

    Returns:
    The number of samples (0 when the window length is not positive).

    Raises:
    ValueError: if sample_timedelta is not positive for a positive window,
    a case in which step-by-step counting would never terminate.
    """
    zero = dt.timedelta(0)
    window = dt.timedelta(hours=long_timedelta)
    if window <= zero:
        return 0
    step = dt.timedelta(minutes=sample_timedelta)
    if step <= zero:
        raise ValueError("sample_timedelta must be a positive number of minutes")
    # Ceiling division on exact timedelta arithmetic.
    return -((-window) // step)

In [None]:
def create_Xt(temporal_series: np.array, list_dates: list, current_time: dt.datetime, medium_timedelta: int, long_timedelta: int, mm: int, ll: int) -> tuple:
    """
    Build one (X, y) pair for the GNN at a given instant.

    X joins the short, medium and long term windows that precede current_time;
    y is the sample of the series at current_time.

    Args:
    - temporal_series: A numpy array that represents a temporal series
    - list_dates: A list that contains the dates of the temporal series
    - current_time: The current time selected
    - medium_timedelta: Length of the medium term window, in hours
    - long_timedelta: Length of the long term window, in hours
    - mm: stride, in series samples, between selected medium term samples
    - ll: stride, in series samples, between selected long term samples

    Returns:
    A tuple (X, y). For a 1-D series X is reshaped into a column and y gains a
    leading axis.
    """
    base_step_minutes = 5  # the series is sampled every 5 minutes
    short_window = 2       # number of samples used for the short term
    n_medium = elements_medium_term(current_time, medium_timedelta, base_step_minutes * mm)
    n_long = elements_long_term(current_time, long_timedelta, base_step_minutes * ll)

    # Every window helper also returns the target sample; only one copy is needed.
    short_data, target = data_short_time(temporal_series, list_dates, short_window, current_time)
    medium_data, _ = data_medium_time(temporal_series, list_dates, n_medium, mm, current_time)
    long_data, _ = data_long_time(temporal_series, list_dates, n_long, ll, current_time)

    features = concatenation(short_data, medium_data, long_data)
    if features.ndim != 1:
        return (features, target)
    return (features.reshape(-1, 1), np.expand_dims(target, axis=0))

In [None]:
def initial_index(list_dates: list, long_timedelta: int) -> int:
    """
    Determine the first index from which input data can be generated.

    Counts the 5-minute steps from the first date of the series up to and
    including that date plus long_timedelta hours.

    Args:
    - list_dates: A list that contains the dates of the temporal series
    - long_timedelta: Length of the long term window, in hours
    """
    window = dt.timedelta(hours=long_timedelta)
    cursor = list_dates[0]
    limit = cursor + window
    step = dt.timedelta(minutes=5)
    steps = 0
    while cursor <= limit:
        cursor += step
        steps += 1
    return steps

In [None]:
def create_list_Xt(matrix: np.array, list_dates: list, medium_timedelta, long_timedelta, mm: int, ll: int) -> list:
    """
    Build the (X, y) pair of every date from the index given by initial_index
    to the end of list_dates.
    """
    first_usable = initial_index(list_dates, long_timedelta)
    return [
        create_Xt(matrix, list_dates, moment, medium_timedelta, long_timedelta, mm, ll)
        for moment in list_dates[first_usable:]
    ]

# Methods - Separation of Data into Training, Validation and Testing

In [None]:
def random_split(list_Xt: list, prob_training: float, prob_validation: float) -> tuple:
    """
    Randomly distribute the input data over training, validation and testing sets.

    One uniform draw in [0, 1) is made per sample: below prob_training the
    sample goes to training, below prob_validation it goes to validation,
    otherwise to testing.

    Args:
    - list_Xt: List of input data
    - prob_training: Upper threshold of the draw for the training set
    - prob_validation: Upper threshold of the draw for the validation set. It
      is compared against the same draw, so it acts as a cumulative threshold
      (e.g. 0.7 and 0.9 give roughly a 70/20/10 split).

    Returns:
    A tuple (training, validation, test) of lists.
    """
    buckets = ([], [], [])
    for sample in list_Xt:
        draw = np.random.rand()
        if draw < prob_training:
            destination = 0
        elif draw < prob_validation:
            destination = 1
        else:
            destination = 2
        buckets[destination].append(sample)
    return buckets

In [None]:
def sequential_split(list_Xt: list, frac_training: float, frac_validation: float) -> tuple:
    """
    Cut the input data, in order, into training, validation and testing sets.

    Args:
    - list_Xt: List of input data
    - frac_training: Fraction of the input placed in the training set
    - frac_validation: Cumulative fraction at which the validation set ends;
      the validation share is frac_validation - frac_training

    Returns:
    A tuple (training, validation, test) of lists.
    """
    size = len(list_Xt)
    index_training = m.floor(size * frac_training)
    index_validation = index_training + m.floor(size * (frac_validation - frac_training))
    # Clamp the cut points so that slicing matches an index-by-index comparison
    # even for negative or inverted fractions.
    train_end = max(index_training, 0)
    validation_end = max(index_validation, train_end)
    training = list(list_Xt[:train_end])
    validation = list(list_Xt[train_end:validation_end])
    test = list(list_Xt[validation_end:])
    return (training, validation, test)

In [None]:
def split_x_and_y(list_of_tuples: list) -> tuple:
    """
    Shuffle a list of (x, y) tuples and separate it into two numpy arrays.

    The input list is shuffled in place (ten passes). Every x receives a
    leading entry of ones: a first column for 2-D inputs, a first element
    otherwise.

    Returns:
    A tuple (X, Y) with the stacked inputs and the stacked targets.
    """
    for _ in range(10):
        random.shuffle(list_of_tuples)

    inputs, targets = [], []
    for features, target in list_of_tuples:
        if features.ndim == 2:
            n_rows = features.shape[0]
            augmented = np.insert(features, 0, np.ones((n_rows)), axis=1)
        else:
            augmented = np.insert(features, 0, np.ones((1)), axis=0)
        inputs.append(augmented)
        targets.append(target)
    return (np.array(inputs), np.array(targets))

# Neural network implementation

In [None]:
class Dataset_C(Dataset):
    """
    Spektral dataset in which every sample is a graph that shares one
    adjacency matrix and carries its own node features and target.
    """

    def __init__(self, adjacency_matrix: np.array, list_Xt: list, **kwargs):
        # The attributes are stored before the base constructor runs
        # (presumably because it invokes read() - confirm against spektral).
        self.adjacency_matrix = adjacency_matrix
        self.list_Xt = list_Xt
        super().__init__(**kwargs)

    def read(self):
        # One Graph per (features, target) pair, all with the same adjacency.
        return [
            Graph(x=features, a=self.adjacency_matrix, e=None, y=target)
            for features, target in self.list_Xt
        ]

In [None]:
class LearnableMatrixMultiplicationLayer(tf.keras.layers.Layer):
    """
    Layer that right-multiplies its input by a trainable matrix.

    The kernel has shape (input_shape[-1], channels), so the last axis of the
    input is projected onto `channels` features.
    """

    def __init__(self, channels: int, **kwargs):
        # Forward standard Layer options (name, dtype, ...) to the base class.
        super().__init__(**kwargs)
        self.channels = channels

    def build(self, input_shape):
        # Creates the learnable tensor with the correct dimensions.
        # `name` is passed by keyword: a positional "kernel" collides with
        # `shape` in Keras versions where `shape` is the first positional
        # parameter of add_weight.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[input_shape[-1], self.channels],
            trainable=True,
        )

    def call(self, inputs):
        # Multiply the input tensor by the learnable tensor
        return tf.matmul(inputs, self.kernel)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from its config.
        config = super().get_config()
        config.update({"channels": self.channels})
        return config

In [None]:
class FixedMatrixMultiplicationLayer(tf.keras.layers.Layer):
    """
    Layer that right-multiplies the first of its two inputs by a fixed
    (non-trainable) matrix of ones.

    The kernel has shape (input_x[-1], channels), where input_x is the shape
    of the first input. The second input is accepted but not used.
    """

    def __init__(self, channels: int, **kwargs):
        # Forward standard Layer options (name, dtype, ...) to the base class.
        super().__init__(**kwargs)
        self.channels = channels

    def build(self, input_shape):
        # Only the shape of the first input determines the kernel size.
        input_x, _ = input_shape
        # Creates the fixed all-ones tensor with the correct dimensions.
        # `name` is passed by keyword: a positional "kernel" collides with
        # `shape` in Keras versions where `shape` is the first positional
        # parameter of add_weight.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[input_x[-1], self.channels],
            initializer=initializers.Ones(),
            trainable=False,
        )

    def call(self, inputs):
        # Multiply the first input tensor by the fixed tensor
        input1, _ = inputs
        return tf.matmul(input1, self.kernel)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from its config.
        config = super().get_config()
        config.update({"channels": self.channels})
        return config

In [None]:
def rmse(y_true: np.array, y_pred: np.array) -> float:
    """
    Root Mean Squared Error, reduced over the last axis.

    A rank-3 prediction first loses its trailing axis; any remaining
    singleton axes of the prediction are squeezed before the comparison.

    Args:
        y_true ([np.array]): test samples
        y_pred ([np.array]): predicted samples
    Returns:
        [float]: root mean squared error
    """
    if y_pred.ndim == 3:
        y_pred = tf.squeeze(y_pred, axis=-1)
    residual = tf.squeeze(y_pred) - y_true
    mean_squared = K.mean(K.square(residual), axis=-1)
    return K.sqrt(mean_squared)

In [None]:
def nrmse(y_true: np.array, y_pred: np.array) -> float:
    """
    Normalized Root Mean Squared Error, reduced over the last axis.

    The RMSE is divided by the mean absolute value of the true samples.

    Args:
        y_true ([np.array]): test samples
        y_pred ([np.array]): predicted samples
    Returns:
        [float]: normalized root mean squared error
    """
    if y_pred.ndim == 3:
        y_pred = tf.squeeze(y_pred, axis=-1)
    residual = tf.squeeze(y_pred) - y_true
    root_mean_squared = K.sqrt(K.mean(K.square(residual), axis=-1))
    scale = K.mean(K.abs(y_true), axis=-1)
    return root_mean_squared / scale

In [None]:
def rmse2(y_true, y_pred):
    """
    Root Mean Squared Error over all elements.

    Both inputs are converted to float32 tensors before the comparison.
    """
    truth = tf.convert_to_tensor(y_true, dtype=tf.float32)
    prediction = tf.convert_to_tensor(y_pred, dtype=tf.float32)
    squared_residual = K.square(prediction - truth)
    return K.sqrt(K.mean(squared_residual))

In [None]:
def nrmse2(y_true, y_pred):
    """
    Normalized Root Mean Squared Error over all elements.

    Both inputs are converted to float32 tensors; the RMSE is divided by the
    mean of the true samples.
    """
    truth = tf.convert_to_tensor(y_true, dtype=tf.float32)
    prediction = tf.convert_to_tensor(y_pred, dtype=tf.float32)
    root_mean_squared = K.sqrt(K.mean(K.square(prediction - truth)))
    return root_mean_squared / K.mean(truth)

In [None]:
def squared_error(y_true: np.array, y_pred: np.array) -> float:
    """
    Mean of the squared differences between true and predicted samples.
    """
    return tf.reduce_mean(tf.square(y_true - y_pred))

In [None]:
class GNN(Model):
    """
        Class for GNN model

        The model is a stack of len(channels) stages. Stage i runs k_layers[i]
        ChebConv branches in parallel (branch k uses K=k, k = 1..k_layers[i]),
        concatenates their outputs along axis 2, projects the result with a
        learnable matrix and adds a fixed all-ones projection of the stage input.
    """

    def __init__(self, channels: list, k_layers: list, relu_last=False):
        """
            Parameters:
            - channels: sequence with the number of output channels of each stage
            - k_layers: sequence with the number of ChebConv branches of each stage
              (indexed like channels)
            - relu_last: when True, ReLU is also applied to the output of the last stage
        """
        super().__init__()
        self.channels = channels
        self.k_layers = k_layers
        self.num_layers = len(channels)
        self.relu_last = relu_last
        self.init_layers()

    def init_layers(self):
        """
            Create the layers shared by all stages and the per-stage layers
        """
        self.concatenate = Concatenate(axis=2)
        self.add = Add()
        self.relu = tf.keras.layers.Activation('relu')

        for i in range(self.num_layers):
            # The ChebConv branches of stage i are stored in the attribute cheb_stgi_l{i+1}
            setattr(self, f'cheb_stgi_l{i+1}', [])
            kk = self.k_layers[i]
            for k in range(1, kk + 1):
                layer = ChebConv(self.channels[i], K=k, activation='relu', use_bias=True)
                getattr(self, f'cheb_stgi_l{i+1}').append(layer)

        # One learnable and one fixed projection per stage, both with channels[i] outputs
        self.dot_learnable_layers = [LearnableMatrixMultiplicationLayer(self.channels[i]) for i in range(self.num_layers)]
        self.dot_fixed_layers = [FixedMatrixMultiplicationLayer(self.channels[i]) for i in range(self.num_layers)]

    def call(self, inputs):
        """
            Run the stages in sequence and return the output of the last one.

            inputs is a pair; its second element is forwarded unchanged to every
            stage (presumably the graph operator expected by ChebConv - confirm).
        """
        x, y = inputs
        out = None

        for i in range(self.num_layers):
            cheb_stgi_layers = getattr(self, f'cheb_stgi_l{i+1}')
            # Every branch receives the same (features, y) pair
            out_layers = [cheb(inputs) for cheb in cheb_stgi_layers]
            concatenate = self.concatenate(out_layers)
            mult_learnable = self.dot_learnable_layers[i](concatenate)
            mult_fixed = self.dot_fixed_layers[i](inputs)
            add = self.add([mult_learnable, mult_fixed])
            # ReLU is skipped on the last stage unless relu_last is set
            if i < self.num_layers - 1 or self.relu_last:
                relu = self.relu(add)
                out = relu
            else:
                out = add
            # The stage output becomes the features of the next stage
            inputs = (out,y)

        return out

# Clean Data

In [None]:
def clean_txt(archive: str, directory_initial: str, directory_destiny: str, sensors, specific_sensor=None) -> None:
    """
    Clean one raw text file by keeping only the needed columns and sensors.

    Reads directory_initial/archive (comma-separated, no header) and writes
    directory_destiny/<archive with its last three characters replaced by
    "csv"> with the header 'Timestamp,Station,Flow,Speed'. For each kept line,
    fields 0, 1, 9 and 11 of the input are written.

    Parameters:
    - archive: name of the text file inside directory_initial
    - directory_initial: directory that contains the raw file
    - directory_destiny: directory that receives the csv file
    - sensors: container of station ids to keep (used when specific_sensor is None)
    - specific_sensor: when given, only lines of this station id are kept and
      `sensors` is ignored

    Lines with fewer than 12 fields or with a non-integer station id are skipped.
    """
    new_name = directory_destiny + "/" + archive[:-3] + "csv"
    og_name = directory_initial + "/" + archive
    # Open the source first so a missing input does not leave an empty output behind.
    with open(og_name, "r") as arc, open(new_name, "w+") as new_arc:
        new_arc.write('Timestamp,Station,Flow,Speed\n')
        for line in arc:
            # Drop the line terminator so it cannot leak into the last field.
            content = line.rstrip("\n").split(",")
            if len(content) < 12:
                continue
            try:
                sensor = int(content[1])
            except ValueError:
                continue
            if specific_sensor is not None:
                wanted = sensor == specific_sensor
            else:
                wanted = sensor in sensors
            if wanted:
                new_arc.write(",".join((content[0], content[1], content[9], content[11])) + "\n")

In [None]:
def through_directory(directory_initial: str, directory_csv_files: str, sensors, specific_sensor=None) -> None:
    """
    Run clean_txt on every '.txt' file of directory_initial, writing the
    resulting csv files into directory_csv_files.

    The number of files cleaned so far is printed once per directory entry.
    """
    cleaned = 0
    for entry in os.listdir(directory_initial):
        print(cleaned)
        if not entry.endswith('.txt'):
            continue
        clean_txt(entry, directory_initial, directory_csv_files, sensors, specific_sensor)
        cleaned += 1

In [None]:
# Clean the raw PeMS text files into csv files.
# NOTE: generate_dict_sensors is defined in a later cell ("Methods - Select
# Sensors"); that cell must be executed before this one.
dict_sensors = generate_dict_sensors("PEMSd3.csv")
# Despite its name, list_sensors is a set of station ids.
list_sensors = set(dict_sensors.keys())
directory_initial = "/home/rafael_o_magalhaes/Documentos/Python Projects/PEMS2"
# directory_txt_files is assigned here but not used in this cell.
directory_txt_files = "/home/rafael_o_magalhaes/Documentos/Python Projects/pemsd7_2023 - clean"
directory_csv_files = "/home/rafael_o_magalhaes/Documentos/Python Projects/PEMS2 - csv2"
# With specific_sensor set, clean_txt keeps only station 317842 and ignores list_sensors.
through_directory(directory_initial, directory_csv_files, list_sensors, specific_sensor=317842)

# Process Data

### Methods - Select Sensors

In [None]:
def generate_dict_sensors(archive: str) -> dict:
    """
    Read a csv file that describes the sensors and build a dictionary that
    maps each distinct value of its 'from' column to an index.

    Rows with missing values are discarded before the stations are collected.
    """
    stations = pd.read_csv(archive).dropna()['from'].unique()
    return generate_dict(stations)

In [None]:
def exclude_sensors(dict_sensors: dict, exclude_lines: list) -> dict:
    """
    Build a new sensors dictionary without the sensors whose position (in the
    key order of dict_sensors) appears in exclude_lines.

    The remaining sensors are re-indexed from zero.
    """
    kept = [
        sensor
        for position, sensor in enumerate(dict_sensors.keys())
        if position not in exclude_lines
    ]
    return generate_dict(kept)

### Methods - Generate Temporal Series

In [None]:
def insert_flow_matrix(dict_dates: dict, dict_sensors: dict, directory: str) -> np.array:
    """
    Insert flow data in a (sensors x dates) temporal series matrix.

    Every '.csv' file of `directory` is read (columns 'Timestamp', 'Station'
    and 'Flow' are used) and each flow value is stored at the row of its
    station and the column of its timestamp. Cells without data stay NaN.

    Parameters:
    - dict_dates: maps each datetime of the series to its column index
    - dict_sensors: maps each station id to its row index
    - directory: directory that contains the csv files

    Returns:
    The filled matrix of shape (len(dict_sensors), len(dict_dates)).

    Raises:
    KeyError: if a timestamp of a known station is not present in dict_dates.

    Rows of stations that are not in dict_sensors are skipped.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    matrix = np.full((len(dict_sensors), len(dict_dates)), np.nan)
    for name_archive in os.listdir(directory):
        if not name_archive.endswith('.csv'):
            continue
        print(name_archive)
        dataset = pd.read_csv(directory + "/" + name_archive)
        # Walk the three needed columns together instead of materialising each row.
        records = zip(dataset['Timestamp'], dataset['Station'], dataset['Flow'])
        for stamp, sensor, flow in records:
            row = dict_sensors.get(sensor)
            if row is None:
                continue
            date = dt.datetime.strptime(stamp, timestamp_format)
            matrix[row, dict_dates[date]] = flow
    return matrix

In [None]:
def insert_flow_list(dict_dates: dict, directory: str) -> np.array:
    """
    Insert flow data in a one-dimensional temporal series.

    Every '.csv' file of `directory` is read (columns 'Timestamp' and 'Flow'
    are used) and each flow value is stored at the position of its timestamp.
    Positions without data stay NaN.

    Parameters:
    - dict_dates: maps each datetime of the series to its position
    - directory: directory that contains the csv files

    Returns:
    The filled array of length len(dict_dates).

    Raises:
    KeyError: if a timestamp read from a file is not present in dict_dates.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    series = np.full((len(dict_dates)), np.nan)
    for name_archive in os.listdir(directory):
        if not name_archive.endswith('.csv'):
            continue
        print(name_archive)
        dataset = pd.read_csv(directory + "/" + name_archive)
        # Walk the two needed columns together instead of materialising each row.
        for stamp, flow in zip(dataset['Timestamp'], dataset['Flow']):
            date = dt.datetime.strptime(stamp, timestamp_format)
            series[dict_dates[date]] = flow
    return series

In [None]:
def dict_count_nan(directory: str) -> dict:
    """
    Count, per station, how many rows of the csv files in `directory` have a
    NaN value in the 'Flow' column.

    The index of the file being processed is printed as progress.

    Returns:
    A dictionary that maps each station with at least one NaN flow to its count.
    """
    nan_counts = {}
    processed = 0
    for name_archive in os.listdir(directory):
        if not name_archive.endswith('.csv'):
            continue
        print(processed)
        processed += 1
        dataset = pd.read_csv(f"{directory}/{name_archive}")
        for position in range(len(dataset)):
            record = dataset.iloc[position]
            flow = record['Flow']
            sensor = record['Station']
            if np.isnan(flow):
                nan_counts[sensor] = nan_counts.get(sensor, 0) + 1
    return nan_counts

In [None]:
# Count the NaN flow values per station over the csv files of this directory.
directory_csv_files = "/home/rafael_o_magalhaes/Documentos/Python Projects/PEMS2 - csv"
dict_nan_sensors = dict_count_nan(directory_csv_files) 

In [None]:
# Build the flow series from the csv files of this directory.
directory_csv_files = "/home/rafael_o_magalhaes/Documentos/Python Projects/PEMS2 - csv2"
# Time axis: one entry every 5 minutes from 2001-01-01 00:00 to 2023-12-31 23:55.
initial_date = dt.datetime(2001, 1, 1, 0, 0, 0)
final_date = dt.datetime(2023, 12, 31, 23, 55, 0)
list_datetime = list_datetime_timedelta(initial_date, final_date, 5)
# Map each datetime to its position in the series.
dict_dates = generate_dict(list_datetime)
list_flow = insert_flow_list(dict_dates, directory_csv_files)

In [None]:
# Persist the flow series in the current working directory.
np.save("list_pemsd3.npy", list_flow) 

### Methods - Interpolation

In [None]:
def avaliate_nan_values(matrix: np.array) -> tuple:
    """
    Evaluate the NaN values of a 2-D matrix, line by line.

    Lines made only of NaN are reported with inf in both columns of the
    per-line report and are left out of the global totals.

    Returns:
    A tuple (count_nan, total, percentage, nan_lines) where
    - count_nan: number of NaN values in the lines that have some data
    - total: number of cells in the lines that have some data
    - percentage: count_nan / total * 100, or inf when no line has data
      (previously this case raised ZeroDivisionError)
    - nan_lines: DataFrame with columns 'NaN Values' and 'Percentage', one row
      per line of the matrix
    """
    lin, col = matrix.shape
    nan_lines = pd.DataFrame(columns=['NaN Values', 'Percentage'])
    count_nan = 0
    total = lin * col
    # Number of NaN cells of every line, computed in one pass.
    missing_per_line = np.isnan(matrix).sum(axis=1)
    for i in range(lin):
        missing = int(missing_per_line[i])
        if missing == col:
            # Line without any known value.
            nan_lines.loc[i] = [m.inf, m.inf]
            total -= col
            continue
        count_nan += missing
        nan_lines.loc[i] = [missing, missing / col * 100]
    percentage = count_nan / total * 100 if total else m.inf
    return (count_nan, total, percentage, nan_lines)

In [None]:
def avaliate_nan_values_list(matrix: np.array) -> tuple:
    """
    Evaluate the NaN values of a one-dimensional series.

    Returns:
    A DataFrame with a single row (index 0) and the columns 'NaN Values' and
    'Percentage'. Both hold inf when the series has no known value.
    """
    length = len(matrix)
    positions = np.arange(length)
    missing_mask = np.isnan(matrix)
    report = pd.DataFrame(columns=['NaN Values', 'Percentage'])
    if len(positions[~missing_mask]) == 0:
        report.loc[0] = [m.inf, m.inf]
        return report
    n_missing = len(positions[missing_mask])
    report.loc[0] = [n_missing, n_missing / length * 100]
    return report

In [None]:
def interpolate_list(matrix: np.array) -> np.array:
    """
    Fill the NaN values of a one-dimensional array by linear interpolation.

    Each NaN is replaced by the value interpolated (np.interp) between its
    nearest known neighbours; NaN before the first or after the last known
    value receive that boundary value. The array is modified in place.

    Returns:
    The same array object, with its NaN values filled.

    Raises:
    ValueError: raised by np.interp when the array has no known value.
    """
    # Compute the NaN mask once and reuse it for both sides of the interpolation.
    missing = np.isnan(matrix)
    positions = np.arange(len(matrix))
    # Use the interp function to calculate estimated values for NaN
    estimated_values = np.interp(positions[missing], positions[~missing], matrix[~missing])
    # Replace NaN values with estimated values
    matrix[missing] = estimated_values
    return matrix

In [None]:
def interpolate_matrix(matrix: np.array) -> tuple:
    """
    Fill the NaN values of every line of a 2-D matrix by linear interpolation
    along the line, and report the lines that cannot be filled.

    The matrix is modified in place. Lines made only of NaN are left untouched.

    Returns:
    A tuple (matrix, exclude_lines) where exclude_lines lists the indexes of
    the lines that contain no known value.
    """
    n_lines, _ = matrix.shape
    empty_lines = []
    for index in range(n_lines):
        row = matrix[index]
        gaps = np.isnan(row)
        if gaps.all():
            # Nothing to interpolate from.
            empty_lines.append(index)
            continue
        if not gaps.any():
            continue
        gap_positions = np.flatnonzero(gaps)
        known_positions = np.flatnonzero(~gaps)
        # Estimate the missing values from the known ones and write them back.
        row[gap_positions] = np.interp(gap_positions, known_positions, row[known_positions])
        matrix[index] = row
    return (matrix, empty_lines)

In [None]:
def fill_nan_polynomial(arr):
    """
    Fill the NaN values of a 1-D array with a least-squares polynomial fitted
    to its known values.

    The polynomial degree is 2, or lower when fewer than three values are
    known. The array is modified in place and returned.
    """
    missing = np.isnan(arr)
    known_positions = np.flatnonzero(~missing)
    degree = min(2, len(known_positions) - 1)
    coefficients = np.polyfit(known_positions, arr[~missing], deg=degree)
    polynomial = np.poly1d(coefficients)
    arr[missing] = polynomial(np.flatnonzero(missing))
    return arr

In [None]:
def polinomial_interpolation(matrix: np.array) -> tuple:
    """
    Apply fill_nan_polynomial to every line (axis 1) of the matrix and return
    the resulting array.
    """
    filled = np.apply_along_axis(fill_nan_polynomial, axis=1, arr=matrix)
    return filled

In [None]:
def lagrange_interpolation(x, y, xi):
    """
    Evaluate at xi the Lagrange polynomial that passes through the points
    (x[i], y[i]).
    """
    n_points = len(x)
    result = 0
    for i in range(n_points):
        # Basis polynomial of node i evaluated at xi, kept as a fraction.
        top = bottom = 1
        for j in range(n_points):
            if j == i:
                continue
            top *= xi - x[j]
            bottom *= x[i] - x[j]
        result += y[i] * (top / bottom)
    return result

In [None]:
def lagrange_interpolate_row(row):
    """
    Fill the NaN values of a 1-D array with the Lagrange polynomial defined by
    its known values. The array is modified in place and returned.
    """
    missing = np.isnan(row)
    known_positions = np.flatnonzero(~missing)
    known_values = row[~missing]
    estimates = [
        lagrange_interpolation(known_positions, known_values, position)
        for position in np.flatnonzero(missing)
    ]
    row[missing] = estimates
    return row

In [None]:
def lagrange_interpolate_matrix(matrix):
    """
    Apply lagrange_interpolate_row to every line (axis 1) of the matrix and
    return the resulting array.
    """
    filled = np.apply_along_axis(lagrange_interpolate_row, axis=1, arr=matrix)
    return filled

In [None]:
def divided_differences(x, y):
    """
    Compute the Newton divided-difference coefficients of the points
    (x[i], y[i]).

    Returns:
    The first row of the divided-difference table, i.e. the coefficients
    f[x0], f[x0, x1], ..., f[x0, ..., xn-1].
    """
    n = len(x)
    nodes = np.asarray(x)
    table = np.zeros((n, n))
    table[:, 0] = y

    for order in range(1, n):
        rows = n - order
        # Column `order` only depends on column `order - 1`, so it is filled at once.
        numerator = table[1:rows + 1, order - 1] - table[:rows, order - 1]
        denominator = nodes[order:] - nodes[:rows]
        table[:rows, order] = numerator / denominator

    return table[0, :]

def newton_interpolation(x, y, xi):
    """
    Evaluate at xi the Newton form of the polynomial that passes through the
    points (x[i], y[i]).

    x must support slicing and element-wise subtraction (e.g. a numpy array).
    """
    coefficients = divided_differences(x, y)
    value = coefficients[0]
    for order in range(1, len(x)):
        # Term of this order: coefficient times the product of (xi - x_k), k < order.
        value += coefficients[order] * np.prod(xi - x[:order])
    return value

def newton_interpolate_row(row):
    """
    Fill the NaN values of a 1-D array with the Newton polynomial defined by
    its known values. The array is modified in place and returned.
    """
    missing = np.isnan(row)
    known_positions = np.flatnonzero(~missing)
    known_values = row[~missing]
    estimates = [
        newton_interpolation(known_positions, known_values, position)
        for position in np.flatnonzero(missing)
    ]
    row[missing] = estimates
    return row

def newton_interpolate_matrix(matrix):
    """
        Apply newton_interpolate_row to every row (axis 1) of the matrix
    """
    filled = np.apply_along_axis(newton_interpolate_row, 1, matrix)
    return filled


In [None]:
def spline_interpolate_matrix(matrix):
    """
        Fill the NaN cells of a 2-D matrix, in place, with griddata's 'cubic' method

        The non-NaN cells are the sample points, addressed as (column, row).
        Returns the same (mutated) matrix.
    """
    missing = np.isnan(matrix)
    rows, cols = np.indices(matrix.shape)
    known_coordinates = (cols[~missing], rows[~missing])
    known_values = matrix[~missing]

    # np.nonzero lists the NaN cells in the same row-major order used by the
    # boolean-mask assignment below
    missing_rows, missing_cols = np.nonzero(missing)
    estimates = griddata(known_coordinates, known_values, (missing_cols, missing_rows), method='cubic')

    matrix[missing] = estimates
    return matrix

In [None]:
def cubic_interpolate_matrix(matrix):
    """
        Fill the NaN cells of a 2-D matrix, in place, by cubic interpolation

        The non-NaN cells are treated as scattered sample points at their
        (row, column) positions and the NaN cells are estimated from them.
        Cells that griddata cannot estimate (outside the convex hull of the
        known cells) stay NaN.

        Returns the same (mutated) matrix; a matrix without NaN is returned
        untouched.
    """
    # Local import keeps the cell self-contained
    from scipy.interpolate import griddata

    nan_indices = np.isnan(matrix)
    if not nan_indices.any():
        return matrix

    # RegularGridInterpolator needs one strictly ascending 1-D axis per
    # dimension plus a full grid of values, so it cannot be built from the
    # flattened coordinates of the known cells. Scattered-data interpolation
    # handles the missing cells directly.
    known_points = np.argwhere(~nan_indices)
    unknown_points = np.argwhere(nan_indices)
    interpolated_values = griddata(known_points, matrix[~nan_indices], unknown_points, method='cubic')

    matrix[nan_indices] = interpolated_values

    return matrix


### Methods - Transition Matrix

In [None]:
def generate_transition_matrix(dict_sensors,exclude_lines):
    """
        Build the distance-weighted adjacency matrix from "PEMSd3.csv"

        Args:
        - dict_sensors: Maps a sensor id to its row/column index
        - exclude_lines: Unused; kept so existing calls keep working

        Returns a (len(dict_sensors), len(dict_sensors)) array where entry
        [i][j] holds the 'distance' of the edge 'from' sensor i 'to' sensor j.
        Rows with missing values and edges whose endpoints are not in
        dict_sensors are skipped.

        Raises:
        - KeyError: if the CSV lacks the 'from', 'to' or 'distance' column
    """
    size = len(dict_sensors)
    matrix = np.zeros((size, size))
    graph_csv = pd.read_csv("PEMSd3.csv").dropna()
    for origin, target, distance in zip(graph_csv['from'], graph_csv['to'], graph_csv['distance']):
        try:
            ii = dict_sensors[int(origin)]
            j = dict_sensors[int(target)]
        except (KeyError, ValueError, TypeError):
            # Endpoint outside dict_sensors (or an id that is not a number)
            continue
        matrix[ii][j] = distance
    return matrix

In [None]:
def max_avg(dict_sensors: dict, directory_initial: str) -> tuple:
    """
        Compute the maximum speed and the number of speed samples of each sensor

        Reads every '.csv' file of directory_initial, ignoring rows without
        'Speed' and stations that are not in dict_sensors.

        Args:
        - dict_sensors: Maps a station id to its index in the returned arrays
        - directory_initial: Directory that holds the csv files

        Returns (list_max, list_count). Sensors without samples keep -inf as
        maximum and 0 as count.
    """
    n_sensors = len(dict_sensors)
    list_max = np.full(n_sensors, -np.inf)
    list_count = np.zeros(n_sensors)

    for name_archive in os.listdir(directory_initial):
        if not name_archive.endswith('.csv'):
            continue
        print(name_archive)
        dataset = pd.read_csv(os.path.join(directory_initial, name_archive))
        dataset = dataset.dropna(subset=['Speed'])
        # Iterating the two columns avoids building one Series per row
        for sensor, speed in zip(dataset['Station'], dataset['Speed']):
            try:
                index = dict_sensors[sensor]
            except KeyError:
                # Station is not part of the selected sensors
                continue
            list_count[index] += 1
            if speed > list_max[index]:
                list_max[index] = speed
    return (list_max, list_count)

In [None]:
def avg_speed(dict_sensors: dict, list_count: list, directory_initial: str) -> np.array:
    """
        Generate a numpy array of average speed for each node

        Every speed sample is divided by the sample count of its sensor and
        accumulated, so the result is the mean when list_count holds the
        number of samples found in the same files.

        Args:
        - dict_sensors: Maps a station id to its index in the returned array
        - list_count: Number of speed samples of each sensor
        - directory_initial: Directory that holds the csv files
    """
    list_avg = np.zeros(len(dict_sensors), dtype=float)
    for name_archive in os.listdir(directory_initial):
        if not name_archive.endswith('.csv'):
            continue
        print(name_archive)
        dataset = pd.read_csv(os.path.join(directory_initial, name_archive))
        dataset = dataset.dropna(subset=['Speed'])
        # Iterating the two columns avoids building one Series per row
        for sensor, speed in zip(dataset['Station'], dataset['Speed']):
            try:
                index = dict_sensors[sensor]
            except KeyError:
                # Station is not part of the selected sensors
                continue
            list_avg[index] += speed / list_count[index]
    return list_avg

In [None]:
def interpolate_values(listt: np.array, value) -> np.array:
    """
      Replace, in place, the entries equal to 'value' by the mean of the others

      Returns the same (mutated) array.
    """
    total = 0.0
    kept = 0
    to_replace = []
    for position, elem in enumerate(listt):
        if elem == value:
            to_replace.append(position)
            continue
        total += elem
        kept += 1
    for position in to_replace:
        listt[position] = total/kept
    return listt

In [None]:
def transition_matrix_definitive(matrix_og: np.array, exclude_lines: list) -> np.array:
    """
        Exclude sensors (lines and columns) from the transition/adjacency matrix

        Before the removal, every row that points to an excluded sensor
        inherits the outgoing edges of that sensor wherever the row has no
        edge of its own. matrix_og is modified in place by that step; the
        returned matrix is a new, smaller array.
    """
    n_rows, _ = matrix_og.shape
    for i in range(n_rows):
        for removed in exclude_lines:
            if matrix_og[i][removed] == 0.0:
                continue
            target_row = matrix_og[i]
            source_row = matrix_og[removed]
            # Copy only into the empty cells of the row
            fill = (source_row != 0.0) & (target_row == 0.0)
            target_row[fill] = source_row[fill]
    without_rows = np.delete(matrix_og, exclude_lines, axis=0)
    return np.delete(without_rows, exclude_lines, axis=1)

In [None]:
def definitive_transition_matrix(matrix: np.array, list_max: np.array, list_avg: np.array) -> np.array:
    """
        Fill the transition matrix (in place)

        For a row with off-diagonal edges the diagonal becomes
        (max - avg) / avg and the remainder (1 - diagonal) is split evenly
        among those edges. A row without off-diagonal edges gets 1 on the
        diagonal and its index is printed.

        Args:
        - matrix: The transition matrix
        - list_max: The list of max speed for each node
        - list_avg: The list of average speed for each node
    """
    n_rows, n_cols = matrix.shape
    print(n_rows, n_cols)
    print(len(list_max), len(list_avg))
    for i in range(n_rows):
        linked = [j for j in range(n_cols) if matrix[i][j] != 0.0 and i != j]

        if not linked:
            matrix[i][i] = 1
            print(i)
            continue

        matrix[i][i] = (list_max[i] - list_avg[i])/list_avg[i]

        share = (1 - matrix[i][i])/len(linked)
        for j in linked:
            matrix[i][j] = share
    return matrix

# Methods - Data Temporal Sparsity

In [None]:
def remove_data(matrix: np.array, interval: int) -> np.array:
    """
        Removes columns from a temporal series

        Keeps the columns 0, interval, 2*interval, ... and drops the others.
        A new array is returned; the input is not modified.

        Raises:
        - ValueError: if interval is smaller than 1 (a step of 0 would never
          advance through the columns)
    """
    if interval < 1:
        raise ValueError(f"interval must be >= 1, got {interval}")
    _, col = matrix.shape
    # Index array (not a slice) so that the result is a copy, not a view
    return matrix[:, np.arange(0, col, interval)]

# Methods - Removing Data for Interpolation

In [None]:
def create_nan_columns(matrix: np.array, interval: int) -> np.array:
    """
        Creates NaN columns into a temporal series (in place)

        Only the columns whose index is a multiple of (interval + 1) keep
        their values. Returns the same (mutated) matrix.
    """
    n_rows, n_cols = matrix.shape
    nan_column = np.full((n_rows), np.nan)
    period = interval + 1
    for column in range(n_cols):
        if column % period == 0:
            continue
        matrix[:, column] = nan_column
    return matrix

In [None]:
def remove_random_data(matrix: np.array, probability: float, length) -> np.array:
    """
        Replaces values to NaN (in place)

        Each cell of the first 'length' columns is replaced independently
        with the given probability; one random.random() draw is made per
        cell, row by row. Returns the same (mutated) matrix.
    """
    n_rows, _ = matrix.shape
    for i in range(n_rows):
        row = matrix[i]
        for j in range(length):
            if random.random() < probability:
                row[j] = np.nan
    return matrix

# Methods - Sensor Sparsity

In [None]:
def random_index(length: int, probability: float) -> np.array:
    """
        Creates an integer array that marks the sensors to exclude

        Each position is 0 (exclude) with the given probability and 1 (keep)
        otherwise; one random.random() draw is made per position.
    """
    flags = [0 if random.random() < probability else 1 for _ in range(length)]
    return np.array(flags, dtype=int)

In [None]:
def remove_sensors(matrix: np.array, list_index: np.array) -> np.array:
    """
        Remove sensors from a adjacency matrix

        list_index holds 1 for the sensors to keep and 0 for the ones to
        remove. Before a sensor is dropped, each of its predecessors is linked
        (value 1) to each of its successors; that step modifies matrix in
        place. The returned matrix contains only the kept rows and columns.
    """
    for sensor, flag in enumerate(list_index):
        if flag != 0:
            continue
        successors = np.flatnonzero(matrix[sensor] != 0.0)
        predecessors = np.flatnonzero(matrix[:, sensor] != 0.0)
        # Bridge the removed sensor
        matrix[np.ix_(predecessors, successors)] = 1

    kept = list_index == 1
    return matrix[:, kept][kept, :]

In [None]:
def update_data_matrix(matrix: np.array, list_index: np.array) -> np.array:
    """
        Remove sensors from the temporal series

        Deletes the rows whose flag in list_index is 0.
    """
    removed_rows = [row for row, flag in enumerate(list_index) if flag == 0]
    return np.delete(matrix, removed_rows, axis=0)

In [None]:
def remove_sensors_list(listt: np.array, list_index: np.array) -> np.array:
    """
        Remove sensors from a list

        Keeps the entries of listt whose flag in list_index equals 1.
    """
    return listt[list_index == 1]

# Methods - Sensor Sparsity - Stretch 

In [None]:
def verify_zeros(transition_matrix, list_index):
    """
        Count the non-zero entries of the matrix row of each index in list_index
    """
    return [np.count_nonzero(transition_matrix[node]) for node in list_index]

In [None]:
def neighbors(transition_matrix, list_index):
    """
        List, for each node of list_index, the columns with a non-zero entry
        in its matrix row, leaving out the node that follows it in list_index
    """
    last = len(list_index) - 1
    list_neighbors = []
    for position, node in enumerate(list_index):
        row = transition_matrix[node]
        # The last node has no successor to leave out
        successor = list_index[position + 1] if position != last else None
        found = []
        for j in range(len(row)):
            if row[j] == 0.0:
                continue
            if successor is None or successor != j:
                found.append(j)
        list_neighbors.append(found)
    return list_neighbors

In [None]:
def origins(transition_matrix, list_index):
    """
        List, for each node of list_index, the rows with a non-zero entry in
        its matrix column, leaving out the node that precedes it in list_index
    """
    list_origins = []
    for position, node in enumerate(list_index):
        column = transition_matrix[:,node]
        # The first node has no predecessor to leave out
        predecessor = list_index[position - 1] if position > 0 else None
        found = []
        for j in range(len(column)):
            if column[j] == 0.0:
                continue
            if predecessor is None or predecessor != j:
                found.append(j)
        list_origins.append(found)
    return list_origins

In [None]:
def neighbors_origins(transition_matrix, list_index):
    """
        Collect, for each node of list_index, the rows with a non-zero entry
        in its column (origins) and the columns with a non-zero entry in its
        row (neighbors)

        Returns (list_origins, list_neighbors).
    """
    list_origins = []
    list_neighbors = []
    for node in list_index:
        outgoing = transition_matrix[node]
        incoming = transition_matrix[:,node]
        positions = range(len(outgoing))
        list_origins.append([j for j in positions if incoming[j] != 0.0])
        list_neighbors.append([j for j in positions if outgoing[j] != 0.0])
    return (list_origins, list_neighbors)

In [None]:
def expanse_boolean_array(size, list_index, list_bool, same_position):
    """
        Build a boolean mask of length 'size' that starts all True

        For every position whose list_bool entry is falsy, the mask is set to
        False at list_index[position] and at every index listed in
        same_position[position].
    """
    mask = np.ones(size, dtype=bool)
    for position in range(len(list_index)):
        if list_bool[position]:
            continue
        mask[list_index[position]] = False
    for position in range(len(list_bool)):
        if list_bool[position]:
            continue
        for index in same_position[position]:
            mask[index] = False
    return mask

In [None]:
def exclude_sensors(np_array, list_bool):
    """
        Delete the rows and the columns whose entry in list_bool is truthy
    """
    rows_removed = np.delete(np_array, np.where(list_bool), axis=0)
    both_removed = np.delete(rows_removed, np.where(list_bool), axis=1)
    return both_removed

In [None]:
def neighbors_list(path, list_neighbors, list_origins):
    """
        Compare, for every inner position i of the path, list_neighbors[i-1]
        with list_origins[i+1]

        Returns (same_position, dict_input, dict_output, list_nodes):
        - same_position: per position, the entries shared by both lists
          (empty for the first and the last position)
        - dict_input: path[i] -> entries only in list_neighbors[i-1], filled
          when that list is the longer one
        - dict_output: path[i] -> entries only in list_origins[i+1], filled in
          the remaining cases where the lists differ
        - list_nodes: 0, the positions where the lists differ, and the last
          position
    """
    last = len(path) - 1
    same_position = [[]]
    dict_input = {}
    dict_output = {}
    list_nodes = [0]
    for i in range(1, last):
        outgoing = list_neighbors[i-1]
        incoming = list_origins[i+1]
        if outgoing == incoming:
            same_position.append(outgoing)
            continue
        list_nodes.append(i)
        outgoing_set = set(outgoing)
        incoming_set = set(incoming)
        same_position.append(list(outgoing_set & incoming_set))
        if len(outgoing) > len(incoming):
            dict_input[path[i]] = list(outgoing_set - incoming_set)
        else:
            dict_output[path[i]] = list(incoming_set - outgoing_set)
    list_nodes.append(last)
    same_position.append([])
    return (same_position, dict_input, dict_output, list_nodes)


In [None]:
def delete_sensors_path(list_bool, interval: int, start: int, end: int) -> np.array:
    """
        Set to True, in place, the positions start, start + interval, ...
        that are smaller than end

        Returns the same (mutated) list_bool.

        Raises:
        - ValueError: if interval is smaller than 1 (a step of 0 would never
          reach end)
    """
    if interval < 1:
        raise ValueError(f"interval must be >= 1, got {interval}")
    for i in range(start, end, interval):
        list_bool[i] = True
    return list_bool

In [None]:
def divide_path(path, interval, list_index):
    """
        Build a boolean array over the positions of the path

        delete_sensors_path is applied between every pair of consecutive
        entries of list_index, and the last position is always set to True.
    """
    size = len(path)
    list_bool = np.zeros(size, dtype=bool)
    for position in range(1, len(list_index)):
        list_bool = delete_sensors_path(list_bool, interval, list_index[position - 1], list_index[position])
    list_bool[size - 1] = True
    return list_bool

In [None]:
def delete_subpath(path: np.array) -> np.array:
    """
        Return an all-False boolean array with one entry per path element
    """
    return np.zeros(len(path), dtype=bool)

In [None]:
def index_bool(list_bool):
    """
        Return the positions of the truthy entries of list_bool
    """
    return [position for position in range(len(list_bool)) if list_bool[position]]

In [None]:
def same_position_array(origins):
    """
        Shift the list one position to the left and pad the end with an empty list
    """
    shifted = origins[1:]
    shifted = shifted + [[]]
    return shifted

In [None]:
def sum_distances(transition_matrix, path, boolean_array):
    """
        Sum the edge values between consecutive marked positions of a path

        Args:
        - transition_matrix: Matrix indexed as [node][next node]; presumably
          the edge distances (confirm against the caller)
        - path: Sequence of node indexes
        - boolean_array: Truthy at the path positions that close a segment

        Returns a dict that maps (segment start position, segment end
        position) to the sum of the edge values inside that segment. Position
        0 starts the first segment.
    """
    dict_distance = {}
    segment_distance = 0.0
    initial_index = 0
    for i in range(1, len(boolean_array)):
        preview = path[i-1]
        current = path[i]
        segment_distance += transition_matrix[preview][current]
        if boolean_array[i]:
            dict_distance[(initial_index, i)] = segment_distance
            initial_index = i
            # Start the next segment from zero; otherwise later entries would
            # hold the total from the start of the path instead of the value
            # of the segment named by their key.
            segment_distance = 0.0
    return dict_distance

In [None]:
 def exclude(transition_matrix, list_index, list_bool, same_position, dict_distance):
    """
        Link consecutive kept positions of a path and drop the other nodes

        For every pair of consecutive truthy positions of list_bool, the node
        of the first position (plus its same_position entries) is linked to
        the node of the second position (plus its same_position entries) with
        the value stored in dict_distance for that pair. The matrix is then
        reduced to the rows and columns selected by expanse_boolean_array.
    """
    kept_positions = index_bool(list_bool)
    for here, there in zip(kept_positions, kept_positions[1:]):
        sources = same_position[here] + [list_index[here]]
        targets = same_position[there] + [list_index[there]]
        for source in sources:
            for target in targets:
                transition_matrix[source][target] = dict_distance[(here, there)]
    keep_mask = expanse_boolean_array(len(transition_matrix), list_index, list_bool, same_position)
    return transition_matrix[:,keep_mask][keep_mask,:]


# Noise Methods

In [None]:
def sum_noise_matrix(temporal_series, noise_matrix):
    """
        Add a noise matrix to the temporal series and replace negative
        results by 0.0

        Returns a new matrix; the inputs are not modified.
    """
    result_matrix = temporal_series + noise_matrix
    lines, columns = temporal_series.shape
    # View on the cells addressed by the shape of the temporal series
    window = result_matrix[:lines, :columns]
    window[window < 0.0] = 0.0
    return result_matrix

## Generate Transition Matrix

### Load sensors data

In [None]:
# Sensors without data
# These row/column indexes are removed from the adjacency matrix by
# transition_matrix_definitive.
exclude_lines = [4, 78, 85, 198, 260, 316, 330, 331, 332, 333, 334, 338, 339]

In [None]:
# Sensor lookup built from PEMSd3.csv (generate_dict_sensors is defined outside
# this section); generate_transition_matrix uses it as sensor id -> matrix index.
dict_sensors = generate_dict_sensors("PEMSd3.csv")

In [None]:
# Distance-weighted adjacency matrix of all sensors. The second argument is
# not read by generate_transition_matrix, so the empty list has no effect.
transition_matrix = generate_transition_matrix(dict_sensors, [])

In [None]:
# Drop the sensors without data; rows pointing to them inherit their outgoing edges
transition_matrix = transition_matrix_definitive(transition_matrix, exclude_lines)

# Code for sensors sparsity testing 

In [None]:
class Grafo:
    """
        Directed graph stored as an adjacency matrix (non-zero entry = edge)
    """

    def __init__(self, matriz):
        self.matriz = matriz

    def dfs_util(self, u, destino, visitado, caminho):
        """
            Recursive depth-first step from node u towards destino

            Marks u as visited and appends it to caminho. Returns True (and
            prints the path) once destino is reached; otherwise u is removed
            from caminho again and False is returned.
        """
        visitado[u] = True
        caminho.append(u)

        if u == destino:
            print("Caminho encontrado:", caminho)
            return True

        for v, weight in enumerate(self.matriz[u]):
            if weight == 0.0 or visitado[v]:
                continue
            if self.dfs_util(v, destino, visitado, caminho):
                return True

        caminho.pop()
        return False

    def dfs(self, origem, destino):
        """
            Return the first path found from origem to destino as a list of
            nodes, or an empty list when there is none
        """
        caminho = []
        visitado = [False] * len(self.matriz)
        self.dfs_util(origem, destino, visitado, caminho)
        return caminho

In [None]:
def generate_boolean_array(size):
    """
        Return an all-True boolean array of the given size
    """
    return np.ones(size, dtype=bool)

In [None]:
def add_true_index(array_index, path):
    """
        Set array_index to False, in place, at every node of the path

        Returns the same (mutated) array_index.
    """
    for node in path:
        array_index[node] = False
    return array_index

In [None]:
def exclude_sensors_matrix(transition_matrix, array_index):
    """
        Keep only the rows and columns of the matrix selected by array_index
    """
    selected_columns = transition_matrix[:,array_index]
    selected = selected_columns[array_index,:]
    return selected

### Code for sensors sparsity testing - random

In [None]:
# Load existent data
# NOTE(review): `probability` is not used in this cell; presumably it records
# the rate the stored mask was generated with — confirm.
probability = 0.9
name = "Teste Esparcidade Geográfica - GNN/sensors90.npz"
data_m = np.load(name)
# 'arr_0' is the key np.savez gives to the first positional array
array_index = data_m['arr_0']
# In the mask, 1 keeps the sensor and 0 removes it
transition_matrix = remove_sensors(transition_matrix, array_index)

In [None]:
# Create new random array
lin, col = transition_matrix.shape
probability = 0.2
# Each sensor is flagged for removal (0) with the given probability
array_index = random_index(col, probability)
transition_matrix = remove_sensors(transition_matrix, array_index)

In [None]:
# Store the keep/remove mask so the same sensor subset can be loaded again
array_index_np = np.array(array_index)
np.savez("Teste Esparcidade Geográfica - Regressão/sensors10.npz",array_index_np)

### Load speed data

In [None]:
# Per-sensor statistics; the 'Max Speed' and 'Count' columns are read from it later
dataf = pd.read_csv('max_speed2_pemsd3.csv')

In [None]:
# Per-sensor average speed as a numpy array
df_avg = pd.read_csv('avg_speed2_pemsd3.csv')
list_avg = df_avg['Avg Speed'].values

In [None]:
# Per-sensor maximum speed and number of samples as numpy arrays
list_max = dataf['Max Speed'].values
list_count = dataf['Count'].values

In [None]:
# Replace the placeholder entries (-inf for the maximum, 0 for the average;
# presumably sensors without samples) by the mean of the remaining sensors
list_max = interpolate_values(list_max, -m.inf)
list_avg = interpolate_values(list_avg, 0)

### Code for sensors sparsity tests

In [None]:
# Keep only the sensors flagged with 1 in array_index
list_max = remove_sensors_list(list_max, array_index)
list_avg = remove_sensors_list(list_avg, array_index)

### Generate transition matrix

In [None]:
# Fills transition_matrix in place (diagonal from max/avg speed, remainder
# split among the edges); `matrix` is the same array object
matrix = definitive_transition_matrix(transition_matrix, list_max, list_avg)

In [None]:
# CSR copy of the filled matrix; it is passed to Dataset_C later on
matrix_sparse = sp.sparse.csr_matrix(matrix)
print(matrix_sparse)

## Generation of Neural Network input

In [None]:
# NOTE(review): numpy_array is reassigned from 'matrix2_pemsd3.csv' in a later
# cell — confirm which source is the intended one.
numpy_array = np.load("list_pemsd3.npy") 

In [None]:
# First and last timestamp of the captured period
initial_date = dt.datetime(2023, 1, 1, 0, 0, 0)
final_date = dt.datetime(2023, 8, 28, 23, 55, 0)

In [None]:
# Timestamps between the two dates (the 5 is presumably the step in minutes — confirm)
list_datetime = list_datetime_timedelta(initial_date,final_date, 5)
# Only the first eighth of the timestamps is kept
list_datetime = list_datetime[:(len(list_datetime)//8)]

In [None]:
# NOTE(review): pd_nan is not defined in this part of the file — it must be
# created before this cell runs.
absolut_nan = pd_nan.iloc[0]['NaN Values']
total = len(numpy_array)
percent = pd_nan.iloc[0]['Percentage']

In [None]:
# Replaces numpy_array by the contents of the csv file
df_matrix = pd.read_csv('matrix2_pemsd3.csv')
numpy_array = df_matrix.values

In [None]:
# Add the stored noise; sum_noise_matrix replaces negative results by 0.0
noise_matrix = np.load("PEMSd3/data/Noise/Noise 5/noise_matrix.npy")
numpy_array = sum_noise_matrix(numpy_array, noise_matrix)

### Code for sensors sparsity testing

In [None]:
# Drop the rows of the sensors flagged with 0 in array_index
numpy_array = update_data_matrix(numpy_array, array_index)

### Evaluate interpolated data

In [None]:
# NaN statistics of the whole matrix (avaliate_nan_values is defined outside
# this section)
lin, col = numpy_array.shape
absolut_nan,total,percent, df_lines = avaliate_nan_values(numpy_array)
print("Total de NaN values:",absolut_nan)
print("Total de entradas da matriz:",total)
print("Percentual NaN/Total:",percent,"%")

### Code for interpolation test

In [None]:
# Probability passed to remove_random_data for turning a value into NaN
interpolate_rate = 0.0

In [None]:
# Load existent matrix
name = "Teste Interpolação - Regressão/matrix_0.npz"
data_m = np.load(name)
# 'array' is the keyword the matrix is saved under by np.savez_compressed
numpy_array = data_m['array']

In [None]:
# Create new matrix
# Only the first len(list_datetime) columns are candidates for removal
numpy_array = remove_random_data(numpy_array, interpolate_rate, len(list_datetime))
np.savez_compressed('Teste Interpolação - Regressão/matrix_0.npz', array=numpy_array)

In [None]:
# NaN statistics restricted to the first eighth of the columns
lin, col = numpy_array.shape
absolut_nan2,total2,percent2, df_lines = avaliate_nan_values(numpy_array[:,:col//8])
print("Total de NaN values:",absolut_nan2)
print("Total de entradas da matriz:",total2)
print("Percentual NaN/Total:",percent2,"%")

In [None]:
# NOTE(review): same call as in the "Code for sensors sparsity testing" cell;
# running both on the same matrix deletes rows twice — presumably only one of
# the two cells is meant to run.
numpy_array = update_data_matrix(numpy_array, array_index)
print(numpy_array.shape)

## Data Metrics

In [None]:
# Description of the data set used in this run
data_id_d = 0
name_d = 'PEMSd3'
initial_date_d = str(initial_date)
final_date_d = str(final_date)
samples_interval = 5
# Passed to remove_data as the column step (1 keeps every column)
plus_interval = 1
short_samples = 2
# medium_*/long_* are forwarded to create_list_Xt (units not visible here)
medium_time = 24
medium_samples = 6
long_time = 168
long_samples = 12
sensors = 1
#sensors = len(df_lines)
#interpolate_data = percent
interpolate_data = 0
observations = ""
name_archive_data = "data_metrics.csv"

## Input Generation

### Interpolate data

In [None]:
# Alternative 1: interpolate_list (defined outside this section)
matrix_flow = interpolate_list(numpy_array)

In [None]:
#matrix_flow, exclude_lines2 = interpolate_matrix(numpy_array)
# spline_interpolate_matrix fills numpy_array in place and returns it, so the
# slice below is taken from the interpolated data.
matrix_flow = spline_interpolate_matrix(numpy_array)
# Keep the first eighth of the columns (col comes from an earlier cell)
matrix_flow = numpy_array[:,:col//8]

In [None]:
# Alternative 3: use the matrix as it is (no copy is made)
matrix_flow = numpy_array

### Code for temporal sparsity testing

In [None]:
# Keep one column out of every plus_interval
matrix_flow = remove_data(matrix_flow,plus_interval)

In [None]:
# One timestamp per remaining column
list_datetime = list_datetime[:matrix_flow.shape[1]]
print(list_datetime[0],list_datetime[-1])

### Generate input data list

In [None]:
# Model samples built by create_list_Xt (defined outside this section)
list_Xt = create_list_Xt(matrix_flow,list_datetime, medium_time, long_time, medium_samples, long_samples)
print(len(list_Xt))

## Model Parameters

In [None]:
# Description of the model used in this run
data_id_m = 0
model_id_m = 0
# The two values are passed to sequential_split
split_array = [0.4,0.5]
# final_relu, conv_array and k_array are passed to the GNN constructor
final_relu = False
conv_array = [1]
k_array = [4]
# Learning rate of the Adam optimizer
lr = 0.01
name_archive_model = "model_metrics.csv"

## Separation of Data into Training, Validation and Testing

In [None]:
# Three partitions of list_Xt (sequential_split is defined outside this section)
training, validation, test = sequential_split(list_Xt,split_array[0],split_array[1])

In [None]:
# Training data set (Dataset_C is defined outside this section)
dataset = Dataset_C(matrix_sparse,training)
print(dataset)

In [None]:
# Validation data set, built with the same sparse matrix
dataset_validation = Dataset_C(matrix_sparse,validation)
print(dataset_validation)

In [None]:
# Test data set, built with the same sparse matrix
dataset_test = Dataset_C(matrix_sparse,test)
print(dataset_test)

## Training Parameters

In [None]:
# Used by the BatchLoader objects and by model.fit
batch_size_training = 1
epochs_training = 50

In [None]:
# Multiply the learning rate by 0.1 after 2 epochs without improvement of the
# training loss, never going below 1e-5
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# TensorBoard logs are written to ./logs
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1)

In [None]:
# Create and compile model
# GNN, squared_error, rmse and nrmse are defined outside this section
model = GNN(conv_array, k_array, final_relu)
model.compile(optimizer=optimizer,loss=squared_error, metrics=[rmse,nrmse,"mae","mape"])

In [None]:
# Create Loaders
# Only the training loader shuffles its samples
loader = BatchLoader(dataset, batch_size=batch_size_training,shuffle=True)
loader_validation = BatchLoader(dataset_validation, batch_size=batch_size_training)
loader_test = BatchLoader(dataset_test, batch_size=batch_size_training)

In [None]:
# Training the model
# NOTE(review): init_time/end_time are overwritten by the validation and test
# cells, so the training duration must be read before those run.
init_time = time.time()
metrics_fit = model.fit(loader.load(), use_multiprocessing=True, workers=-1, verbose=0, steps_per_epoch=loader.steps_per_epoch, epochs=epochs_training, validation_data=loader_validation.load(), validation_steps=loader_validation.steps_per_epoch, callbacks=[reduce_lr, tensorboard_callback])
end_time = time.time()

In [None]:
# Validate the model
# init_time/end_time now bracket the validation run
init_time = time.time()
metrics_validation = model.evaluate(loader_validation.load(), steps=loader_validation.steps_per_epoch)
end_time = time.time()

In [None]:
# Testing the model
# init_time/end_time now bracket the test run
init_time = time.time()
metrics_testing = model.evaluate(loader_test.load(), steps=loader_test.steps_per_epoch)
end_time = time.time()

## Data Separation for Linear Regression Model Training

In [None]:
def data_split_training(list_Xt,split_array):
    """
        Return (x, y) of the training partition of list_Xt

        split_array[0] and split_array[1] are forwarded to sequential_split.
    """
    partitions = sequential_split(list_Xt, split_array[0], split_array[1])
    training, _validation, _test = partitions
    x_training, y_training = split_x_and_y(training)
    return (x_training, y_training)

In [None]:
def data_split_validation(list_Xt,split_array):
    """
        Return (x, y) of the validation partition of list_Xt

        split_array[0] and split_array[1] are forwarded to sequential_split.
    """
    partitions = sequential_split(list_Xt, split_array[0], split_array[1])
    _training, validation, _test = partitions
    x_validation, y_validation = split_x_and_y(validation)
    return (x_validation, y_validation)

In [None]:
def data_split_test(list_Xt,split_array):
    """
        Return (x, y) of the test partition of list_Xt

        split_array[0] and split_array[1] are forwarded to sequential_split.
    """
    partitions = sequential_split(list_Xt, split_array[0], split_array[1])
    _training, _validation, test = partitions
    x_test, y_test = split_x_and_y(test)
    return (x_test, y_test)

In [None]:
def data_training(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Build the samples with create_list_Xt and return (x, y) of the
        training partition
    """
    samples = create_list_Xt(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples)
    return data_split_training(samples, split_array)

In [None]:
def data_validation(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Build the samples with create_list_Xt and return (x, y) of the
        validation partition
    """
    samples = create_list_Xt(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples)
    return data_split_validation(samples, split_array)

In [None]:
def data_test(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Build the samples with create_list_Xt and return (x, y) of the test
        partition
    """
    samples = create_list_Xt(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples)
    return data_split_test(samples, split_array)

In [None]:
def dimension(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Return the shape of the first validation input sample
    """
    validation_pair = data_validation(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
    validation_inputs, _validation_targets = validation_pair
    first_sample = validation_inputs[0]
    return first_sample.shape

In [None]:
# Each call rebuilds the sample list with create_list_Xt before splitting it
x_training, y_training = data_training(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
x_validation, y_validation = data_validation(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
x_test, y_test = data_test(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

### Training Parameters

In [None]:
# NOTE(review): train_model and train_index_model define their own local
# batch_size_training/epochs_training, so these globals do not reach them.
batch_size_training = 300
epochs_training = 200

## Multiple Linear Regression and Multiple CNN

In [None]:
def linear_regression(lin, col):
    """
        Build and compile a linear regression model (a single Dense unit)

        Args:
        - lin: Unused; kept so existing calls keep working
        - col: Number of input features

        The model is compiled with the 'adam' optimizer, the 'mse' loss and
        the metrics rmse, nrmse, mae and mape.
    """
    model = tf.keras.Sequential([
        # shape must be a tuple; "(col)" without the comma is just the int col
        tf.keras.layers.Input(shape=(col,)),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse',metrics=[rmse,nrmse,"mae","mape"])
    return model

In [None]:
def fully_connected(lin, col):
    """
        Build and compile a network with one hidden layer

        Layers: Dense(128, relu) -> Dropout(0.2) -> Dense(1).

        Args:
        - lin: Unused; kept so existing calls keep working
        - col: Number of input features

        The model is compiled with the 'adam' optimizer, the 'mse' loss and
        the metrics rmse, nrmse, mae and mape.
    """
    model = tf.keras.models.Sequential([
        # shape must be a tuple; "(col)" without the comma is just the int col
        tf.keras.layers.Input(shape=(col,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse',metrics=[rmse,nrmse,"mae","mape"])
    return model

In [None]:
def list_regressions(lines, columns):
    """
        Create 'lines' independent linear regression models, each with
        'columns' input features
    """
    return [linear_regression(1, columns) for _ in range(lines)]

In [None]:
def list_fully_connected(lines, columns):
    """
        Create 'lines' independent fully connected models, each with
        'columns' input features
    """
    return [fully_connected(1, columns) for _ in range(lines)]

In [None]:
def load_weights_model(listt, path):
    """
        Load the weights of every model from '<path>/model_<position>.h5'
    """
    for position, model in enumerate(listt):
        model.load_weights(path + f"/model_{position}.h5")

In [None]:
def train_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array, path, start=145):
    """
        Train one model per row of the temporal series and save its weights

        Model i is fitted on the training partition built from
        temporal_series[i] and saved to '<path>/model_<i>.h5'.

        Args:
        - listt: List of compiled models
        - temporal_series: Matrix with one row per model
        - list_datetime ... split_array: Forwarded to data_training
        - path: Directory of the weight files
        - start: Index of the first model to train. The default (145) keeps
          the former hard-coded value; pass 0 to train every model.
    """
    batch_size_training = 1
    epochs_training = 50
    for i in range(start, len(listt)):
        print(i)
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)
        x_training, y_training = data_training(temporal_series[i], list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
        listt[i].fit(x_training, y_training, epochs=epochs_training, batch_size=batch_size_training, verbose=0, callbacks=[reduce_lr])
        listt[i].save_weights(path + f"/model_{i}.h5")
        # Release the samples before the next model builds its own
        del x_training
        del y_training
        del reduce_lr

In [None]:
def train_index_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array, index, path):
    """
        Train the model at position 'index' on the training partition built
        from temporal_series[index] and save its weights to
        '<path>/model_<index>.h5'
    """
    fit_options = {"epochs": 50, "batch_size": 1, "verbose": 1}
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)
    x_training, y_training = data_training(temporal_series[index], list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
    model = listt[index]
    model.fit(x_training, y_training, callbacks=[reduce_lr], **fit_options)
    listt[index].save_weights(path + f"/model_{index}.h5")

In [None]:
def evaluate_train_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Evaluate every model on the training partition built from its own row
        of the temporal series
    """
    for position, model in enumerate(listt):
        print(position)
        x_training, y_training = data_training(temporal_series[position], list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
        model.evaluate(x_training, y_training)
        # Release the samples before the next model builds its own
        del x_training, y_training

In [None]:
def predict_training_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Predict the training partition with one model per row of the series

        The expected values come from the training partition of the whole
        series; column i of the predictions comes from model i applied to the
        partition built from temporal_series[i].

        Returns (expected, predicted), both with a trailing axis of size 1.
    """
    window = (list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
    y_training_og = data_training(temporal_series, *window)[1]
    print(y_training_og, y_training_og.shape)
    n_samples, n_models = len(y_training_og), len(listt)
    prev_values = np.zeros((n_samples, n_models),dtype=float)
    print(n_samples, n_models)
    for i, model in enumerate(listt):
        x_training = data_training(temporal_series[i], *window)[0]
        prev_values[:,i] = model.predict(x_training).flatten()
        del x_training
    new_expected = np.reshape(y_training_og, (*y_training_og.shape, 1))
    new_prev = np.reshape(prev_values, (*prev_values.shape, 1))
    return (new_expected, new_prev)

In [None]:
def predict_validation_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Predict the validation partition with one model per row of the series

        The expected values come from the validation partition of the whole
        series; column i of the predictions comes from model i applied to the
        partition built from temporal_series[i].

        Returns (expected, predicted), both with a trailing axis of size 1.
    """
    window = (list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
    y_validation_og = data_validation(temporal_series, *window)[1]
    print(y_validation_og, y_validation_og.shape)
    n_samples, n_models = len(y_validation_og), len(listt)
    prev_values = np.zeros((n_samples, n_models),dtype=float)
    print(n_samples, n_models)
    for i, model in enumerate(listt):
        x_validation = data_validation(temporal_series[i], *window)[0]
        prev_values[:,i] = model.predict(x_validation).flatten()
        del x_validation
    new_expected = np.reshape(y_validation_og, (*y_validation_og.shape, 1))
    new_prev = np.reshape(prev_values, (*prev_values.shape, 1))
    return (new_expected, new_prev)

In [None]:
def predict_test_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Predict the test partition with one model per row of the series

        The expected values come from the test partition of the whole series;
        column i of the predictions comes from model i applied to the
        partition built from temporal_series[i].

        Returns (expected, predicted), both with a trailing axis of size 1.
    """
    window = (list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
    y_test_og = data_test(temporal_series, *window)[1]
    print(y_test_og, y_test_og.shape)
    n_samples, n_models = len(y_test_og), len(listt)
    prev_values = np.zeros((n_samples, n_models),dtype=float)
    print(n_samples, n_models)
    for i, model in enumerate(listt):
        x_test = data_test(temporal_series[i], *window)[0]
        prev_values[:,i] = model.predict(x_test).flatten()
        del x_test
    new_expected = np.reshape(y_test_og, (*y_test_og.shape, 1))
    new_prev = np.reshape(prev_values, (*prev_values.shape, 1))
    return (new_expected, new_prev)

In [None]:
def metrics_training_model(listt, temporal_series, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array):
    """Collect targets and predictions of every model, one row per model.

    The targets come from a single ``data_training`` call on the whole
    ``temporal_series``. Model ``i`` is fed the inputs that ``data_training``
    builds for ``temporal_series[i]``; its flattened predictions become row
    ``i`` of the prediction matrix and the matching target series becomes
    row ``i`` of the expected matrix.

    Returns:
        Tuple ``(expected, predicted)`` of float arrays shaped
        ``(len(listt), n_samples, 1)``, where ``n_samples`` is the length of
        the targets returned by ``data_training``.
    """
    window_args = (list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

    # Only the targets of the full-series call are needed.
    # Assumes they are laid out as (n_samples, n_series) -- TODO confirm
    # against data_training.
    y_training_og = np.asarray(data_training(temporal_series, *window_args)[1], dtype=float)

    n_models = len(listt)
    n_samples = len(y_training_og)
    expected_values = np.zeros((n_models, n_samples), dtype=float)
    prev_values = np.zeros((n_models, n_samples), dtype=float)

    for i, model in enumerate(listt):
        x_training = data_training(temporal_series[i], *window_args)[0]
        prev_values[i] = model.predict(x_training).flatten()
        # Row i receives the complete target series i. The earlier
        # ``expected_values[:, i] = y_training_og[i]`` copied one sample per
        # iteration, so every column past len(listt) stayed at zero.
        expected_values[i] = y_training_og[:, i]
        # Free the inputs before the next series is built.
        del x_training

    new_expected = np.reshape(expected_values, (*expected_values.shape, 1))
    new_prev = np.reshape(prev_values, (*prev_values.shape, 1))
    return (new_expected, new_prev)

### Multiple Linear Regression

In [None]:
# Obtain the two input dimensions from dimension() and build the list of
# regression models with them (list_regressions is defined elsewhere in the
# notebook). Note that lin/col are notebook globals and are reassigned by
# later cells.
lin, col = dimension(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
listt_regression = list_regressions(lin, col)

In [None]:
# Train the regression models of this section. This cell previously trained
# listt_cnn with the 'CNN' tag, which is not defined until the "Multiple CNN"
# section; the 'Regression' tag matches the one given to load_weights_model
# for these models.
train_model(listt_regression, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array, 'Regression')

In [None]:
# Load weights into the regression models; 'Regression' is the tag handed to
# load_weights_model (presumably it selects the saved files -- verify there).
load_weights_model(listt_regression, 'Regression')

In [None]:
# Expected values and predictions of the regression models from
# predict_training_model. `expected`/`prev` are shared globals: every predict
# cell rebinds them, and the Metrics cells read whichever pair was set last.
expected, prev = predict_training_model(listt_regression, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

In [None]:
# Expected values and predictions of the regression models from
# predict_validation_model; rebinds the shared `expected`/`prev` globals.
expected, prev = predict_validation_model(listt_regression, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

In [None]:
# Expected values and predictions of the regression models from
# predict_test_model; rebinds the shared `expected`/`prev` globals.
expected, prev = predict_test_model(listt_regression, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

### Multiple CNN

In [None]:
# Obtain the two input dimensions from dimension() and build the model list
# for this section with list_fully_connected (defined elsewhere in the
# notebook). lin/col are notebook globals and are reassigned by later cells.
lin, col = dimension(matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)
listt_cnn = list_fully_connected(lin, col)

In [None]:
# Train the models of this section. This cell previously trained
# listt_regression (and passed no tag); the 'CNN' tag matches the one given
# to load_weights_model for these models.
train_model(listt_cnn, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array, 'CNN')

In [None]:
# Load weights into the models of this section; 'CNN' is the tag handed to
# load_weights_model (presumably it selects the saved files -- verify there).
load_weights_model(listt_cnn, 'CNN')

In [None]:
# Expected values and predictions of the listt_cnn models from
# predict_training_model. `expected`/`prev` are shared globals: every predict
# cell rebinds them, and the Metrics cells read whichever pair was set last.
expected, prev = predict_training_model(listt_cnn, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

In [None]:
# Expected values and predictions of the listt_cnn models from
# predict_validation_model; rebinds the shared `expected`/`prev` globals.
expected, prev = predict_validation_model(listt_cnn, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

In [None]:
# Expected values and predictions of the listt_cnn models from
# predict_test_model; rebinds the shared `expected`/`prev` globals.
expected, prev = predict_test_model(listt_cnn, matrix_flow, list_datetime, medium_time, long_time, medium_samples, long_samples, split_array)

### Metrics

In [None]:
# Remove the trailing length-1 axis from both arrays (np.squeeze raises if
# that axis is not of length 1). The scikit-learn metrics below are fed these
# squeezed versions.
expected2 = np.squeeze(expected, axis=-1)
prev2 = np.squeeze(prev, axis=-1)

In [None]:
# Transposed versions of the squeezed arrays, used by the last metrics cell.
expected3 = expected2.T
prev3 = prev2.T

In [None]:
# Metrics stored under *_training names. They are computed from whatever
# `expected`/`prev` currently hold, so this cell is presumably meant to run
# right after a predict_training_model cell -- verify the run order.
# rmse2/nrmse2 (defined elsewhere) get the unsqueezed arrays, while the
# scikit-learn metrics get the squeezed ones.
rmse_training = rmse2(expected, prev)
nrmse_training = nrmse2(expected, prev)
mae_training = mean_absolute_error(expected2, prev2)
mape_training = mean_absolute_percentage_error(expected2, prev2)
print("RMSE:", rmse_training)
print("NRMSE:", nrmse_training)
print("MAE:", mae_training)
print("MAPE:", mape_training)

In [None]:
# Metrics stored under *_validation names. They are computed from whatever
# `expected`/`prev` currently hold, so this cell is presumably meant to run
# right after a predict_validation_model cell -- verify the run order.
# rmse2/nrmse2 (defined elsewhere) get the unsqueezed arrays, while the
# scikit-learn metrics get the squeezed ones.
rmse_validation = rmse2(expected, prev)
nrmse_validation = nrmse2(expected, prev)
mae_validation = mean_absolute_error(expected2, prev2)
mape_validation = mean_absolute_percentage_error(expected2, prev2)
print("RMSE:", rmse_validation)
print("NRMSE:", nrmse_validation)
print("MAE:", mae_validation)
print("MAPE:", mape_validation)

In [None]:
# Metrics stored under *_testing names. They are computed from whatever
# `expected`/`prev` currently hold, so this cell is presumably meant to run
# right after a predict_test_model cell -- verify the run order.
# rmse2/nrmse2 (defined elsewhere) get the unsqueezed arrays, while the
# scikit-learn metrics get the squeezed ones.
rmse_testing = rmse2(expected, prev)
nrmse_testing = nrmse2(expected, prev)
mae_testing = mean_absolute_error(expected2, prev2)
mape_testing = mean_absolute_percentage_error(expected2, prev2)
# Print the values computed above; the prints used to read rmse_test,
# nrmse_test, mae_test and mape_test, which this cell never assigns.
print("RMSE:", rmse_testing)
print("NRMSE:", nrmse_testing)
print("MAE:", mae_testing)
print("MAPE:", mape_testing)

In [None]:
# Same four metrics, but here all of them (including rmse2/nrmse2) are fed
# the squeezed arrays. Results go to the generic *_value names, which the
# next cell overwrites.
rmse_value = rmse2(expected2, prev2)
nrmse_value = nrmse2(expected2, prev2)
mae_value = mean_absolute_error(expected2, prev2)
mape_value = mean_absolute_percentage_error(expected2, prev2)
print("RMSE:", rmse_value)
print("NRMSE:", nrmse_value)
print("MAE:", mae_value)
print("MAPE:", mape_value)

In [None]:
# Same four metrics on the transposed arrays; overwrites the *_value names
# set by the previous cell.
rmse_value = rmse2(expected3, prev3)
nrmse_value = nrmse2(expected3, prev3)
mae_value = mean_absolute_error(expected3, prev3)
mape_value = mean_absolute_percentage_error(expected3, prev3)
print("RMSE:", rmse_value)
print("NRMSE:", nrmse_value)
print("MAE:", mae_value)
print("MAPE:", mape_value)

## CNN Fully Connected Implementation

In [None]:
# Per-sample input dimensions, read from the first training sample (the
# unpacking requires each sample to be two-dimensional).
lin, col = x_training[0].shape

In [None]:
# Dense network: 128 ReLU units, 20% dropout, then a single linear output.
# NOTE(review): there is no Flatten layer, and Keras Dense acts on the last
# axis only, so with a (lin, col) input the output is (batch, lin, 1) rather
# than one value per sample -- confirm this is intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(lin,col)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
])

In [None]:
# Multiply the learning rate by 0.1 once the training loss ('loss', not
# 'val_loss') has stopped improving for 2 epochs; never go below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)

In [None]:
# Adam optimizer with MSE loss; also track the notebook's own rmse/nrmse
# metric functions plus the built-in MAE and MAPE.
model.compile(optimizer='adam', loss='mse',metrics=[rmse,nrmse,"mae","mape"])

In [None]:
# TensorBoard logging into ./logs, with weight histograms every epoch.
# The Linear Regression section uses the same directory.
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train silently (verbose=0), validating on the validation split each epoch.
# init_time/end_time bracket the call so the elapsed wall-clock time is
# end_time - init_time; the difference is not computed in this cell, and both
# names are overwritten by the following cells.
init_time = time.time()
metrics_fit = model.fit(x_training, y_training, epochs=epochs_training, batch_size=batch_size_training, validation_data=(x_validation, y_validation), verbose=0, callbacks=[reduce_lr,tensorboard_callback])
end_time = time.time()

In [None]:
# Evaluate on the validation split; evaluate() returns the loss followed by
# the compiled metrics. init_time/end_time bracket the call for timing.
init_time = time.time()
metrics_validation = model.evaluate(x_validation,y_validation)
end_time = time.time()

In [None]:
# Evaluate on the test split; evaluate() returns the loss followed by the
# compiled metrics. init_time/end_time bracket the call for timing.
init_time = time.time()
metrics_testing = model.evaluate(x_test,y_test)
end_time = time.time()

## Linear Regression Implementation

In [None]:
# Per-sample input dimensions, read from the first training sample (the
# unpacking requires each sample to be two-dimensional).
lin, col = x_training[0].shape

In [None]:
# Linear model: a single Dense unit with no activation.
# NOTE(review): Keras Dense acts on the last axis only, so with a (lin, col)
# input the output is (batch, lin, 1) rather than one value per sample --
# confirm this is intended. Rebinds the global `model` used by earlier cells.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(lin, col)),
    tf.keras.layers.Dense(1)
])

In [None]:
# Multiply the learning rate by 0.1 once the training loss ('loss', not
# 'val_loss') has stopped improving for 2 epochs; never go below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)

In [None]:
# Adam optimizer with MSE loss; also track the notebook's own rmse/nrmse
# metric functions plus the built-in MAE and MAPE.
model.compile(optimizer='adam', loss='mse',metrics=[rmse,nrmse,"mae","mape"])

In [None]:
# TensorBoard logging into ./logs, with weight histograms every epoch.
# The fully connected section uses the same directory, so runs of both
# models end up side by side there.
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train the model silently (verbose=0), validating on the validation split
# each epoch. init_time/end_time bracket the call so the elapsed wall-clock
# time is end_time - init_time; the difference is not computed in this cell.
init_time = time.time()
metrics_fit = model.fit(x_training, y_training, epochs=epochs_training, batch_size=batch_size_training, validation_data=(x_validation, y_validation), verbose=0, callbacks=[reduce_lr,tensorboard_callback])
end_time = time.time()

In [None]:
# Evaluate the model on the validation split; evaluate() returns the loss
# followed by the compiled metrics. init_time/end_time bracket the call for
# timing.
init_time = time.time()
metrics_validation = model.evaluate(x_validation,y_validation)
end_time = time.time()

In [None]:
# Evaluate the model on the test split; evaluate() returns the loss followed
# by the compiled metrics. init_time/end_time bracket the call for timing.
init_time = time.time()
metrics_testing = model.evaluate(x_test, y_test)
end_time = time.time()