### Author: Rafael de Oliveira Magalhães

# BehaviorXplore 

# Imports

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape, Conv2D, MaxPooling2D, SeparableConv2D, Lambda
from keras.regularizers import l2
from keras.optimizers import SGD, RMSprop
from keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import TensorBoard
import tensorflow.keras.backend as K
from keras.metrics import Metric
from keras.utils import plot_model
from keras.layers import Add, Concatenate, Input, GlobalAveragePooling2D, Layer
from keras import models, initializers
from keras.models import Model
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, brier_score_loss
from sklearn.datasets import make_classification

from spektral.datasets import TUDataset
from spektral.layers import GCNConv, GlobalSumPool, ChebConv
from spektral.data import SingleLoader, BatchLoader
from spektral.data import Graph
from spektral.data import Dataset

import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import shapiro
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.graphics.tsaplots import plot_acf
from scipy.stats import f_oneway
from scipy.stats import f

from IPython.display import Image

# Helper libraries

from bokeh.io import show
from bokeh.plotting import gmap
from bokeh.models import GMapOptions
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from collections import deque
from mpl_toolkits.mplot3d import Axes3D
import csv
import fnmatch
import pandas as pd
import geopandas as gpd
import osmnx as ox
import networkx as nx
import plotly.graph_objects as go
import math as m
import numpy as np
import random
import scipy as sp
import datetime as dt
import re
import time
import gmaps as gm
import seaborn as sns
from statistics import mean
from shapely.geometry import Point, LineString
from shapely import wkt
from tqdm import tqdm
from numba import jit, cuda
from sodapy import Socrata
from datetime import datetime
import matplotlib.pyplot as plt
import warnings as w

# Auxiliary Functions

In [None]:
def generate_dict(array: list) -> dict:
    """
        Map every element of a sequence to its position in that sequence.

        If an element occurs more than once, the position of its last
        occurrence is the one that is kept.
    """
    positions = {}
    for position, element in enumerate(array):
        positions[element] = position
    return positions

# Methods - Neural Network Input Generation

In [None]:
def data_short_time(temporal_series: np.array, index: int, s: int) -> tuple:
    """
        Extract the short-term history that immediately precedes one sample.

        Parameters:
        - temporal_series: 2-D numpy array; the second axis is indexed by sample
        - index: column of the sample to be predicted
        - s: number of consecutive columns, ending right before `index`,
             that form the short-term history

        Returns:
        A tuple (history, target): `history` holds columns index - s .. index - 1
        of every row and `target` holds column `index` of every row.
    """
    target = temporal_series[:, index]
    window_start = index - s
    history = temporal_series[:, window_start:index]
    return (history, target)

In [None]:
def data_medium_time(temporal_series: np.array, index: int, m: int, mm: int) -> tuple:
    """
        Extract the medium-term history that precedes one sample.

        Parameters:
        - temporal_series: 2-D numpy array; the second axis is indexed by sample
        - index: column of the sample to be predicted
        - m: number of columns to keep in the medium-term history
        - mm: stride, in columns, between two kept columns

        Returns:
        A tuple (history, target): `history` holds every `mm`-th column of the
        m * mm columns that end right before `index`, and `target` holds
        column `index` of every row.
    """
    target = temporal_series[:, index]
    # The window spans m * mm raw columns so that striding by mm yields m columns.
    span = m * mm
    history = temporal_series[:, index - span:index:mm]
    return (history, target)

In [None]:
def data_long_time(temporal_series: np.array, index: int, l: int, ll: int) -> tuple:
    """
        Extract the long-term history that precedes one sample.

        Parameters:
        - temporal_series: 2-D numpy array; the second axis is indexed by sample
        - index: column of the sample to be predicted
        - l: number of columns to keep in the long-term history
        - ll: stride, in columns, between two kept columns

        Returns:
        A tuple (history, target): `history` holds every `ll`-th column of the
        l * ll columns that end right before `index`, and `target` holds
        column `index` of every row.
    """
    target = temporal_series[:, index]
    # The window spans l * ll raw columns so that striding by ll yields l columns.
    span = l * ll
    history = temporal_series[:, index - span:index:ll]
    return (history, target)

In [None]:
def concatenation(short_data: np.array, medium_data: np.array, long_data: np.array) -> np.array:
    """
        Join the short, medium and long term histories side by side.

        The three arrays are concatenated along axis 1, in that order.
    """
    pieces = [short_data, medium_data, long_data]
    return np.concatenate(pieces, axis=1)

In [None]:
def elements_medium_term(medium_timedelta: int, sample_timedelta: int) -> int:
    """
        Calculate the number of elements in a medium term

        The result is the number of steps of `sample_timedelta` minutes needed
        to cover `medium_timedelta` hours, i.e. ceil(span / step).

        Args:
        - medium_timedelta: The timedelta, in hours, between the initial and the
        final sample of the medium time data
        - sample_timedelta: The timedelta, in minutes, between consecutive samples
        of the medium time data

        Returns:
        The number of elements; 0 when `medium_timedelta` is not positive.

        Raises:
        ValueError: if `sample_timedelta` is not positive (stepping through the
        window with such a step would never finish).
    """
    step = dt.timedelta(minutes=sample_timedelta)
    if step <= dt.timedelta(0):
        raise ValueError("sample_timedelta must be a positive number of minutes")
    span = dt.timedelta(hours=medium_timedelta)
    if span <= dt.timedelta(0):
        return 0
    # Ceiling division: floor-dividing the negated span rounds away from zero.
    return -((-span) // step)

In [None]:
def elements_long_term(long_timedelta: int, sample_timedelta: int) -> int:
    """
        Calculate the number of elements in a long term

        The result is the number of steps of `sample_timedelta` minutes needed
        to cover `long_timedelta` hours, i.e. ceil(span / step).

        Args:
        - long_timedelta: The timedelta, in hours, between the initial and the
        final sample of the long time data
        - sample_timedelta: The timedelta, in minutes, between consecutive samples
        of the long time data

        Returns:
        The number of elements; 0 when `long_timedelta` is not positive.

        Raises:
        ValueError: if `sample_timedelta` is not positive (stepping through the
        window with such a step would never finish).
    """
    step = dt.timedelta(minutes=sample_timedelta)
    if step <= dt.timedelta(0):
        raise ValueError("sample_timedelta must be a positive number of minutes")
    span = dt.timedelta(hours=long_timedelta)
    if span <= dt.timedelta(0):
        return 0
    # Ceiling division: floor-dividing the negated span rounds away from zero.
    return -((-span) // step)

In [None]:
def create_Xt(temporal_series: np.array, index: int, sample_timedelta: int, medium_timedelta: int, long_timedelta: int, mm: int, ll: int, s: int = 2) -> tuple:
    """
        Create a tuple of X and y input of the GNN

        Args:
        - temporal_series: A numpy array that represents a temporal series
        - index: The sample index
        - sample_timedelta: The timedelta, in minutes, between consecutive samples
        - medium_timedelta: The timedelta, in hours, between the initial and the
        final sample of the medium time data
        - long_timedelta: The timedelta, in hours, between the initial and the
        final sample of the long time data
        - mm: stride, in samples, between the samples kept for the medium time data
        - ll: stride, in samples, between the samples kept for the long time data
        - s: number of consecutive samples kept for the short time data
             (defaults to 2, the value that used to be hard-coded)

        Returns:
        A tuple (X, y): X is the short, medium and long time data concatenated
        along axis 1 and y is the column of `temporal_series` at `index`.
    """
    # Number of strided samples that cover the medium / long spans.
    medium_count = elements_medium_term(medium_timedelta, sample_timedelta * mm)
    long_count = elements_long_term(long_timedelta, sample_timedelta * ll)

    # The three helpers return the same expected column, so it is kept once.
    short_data, expected_data = data_short_time(temporal_series, index, s)
    medium_data, _ = data_medium_time(temporal_series, index, medium_count, mm)
    long_data, _ = data_long_time(temporal_series, index, long_count, ll)
    return (concatenation(short_data, medium_data, long_data), expected_data)

In [None]:
def initial_index(long_timedelta: int, sample_timedelta: int) -> int:
    """
        Determine the initial index to generate the data

        The result is the number of sample instants 0, step, 2 * step, ... that
        fit in the closed interval [0, long_timedelta], i.e.
        floor(span / step) + 1.

        Args:
        - long_timedelta: The timedelta, in hours, between the initial and the
        final sample of the long time data
        - sample_timedelta: The timedelta, in minutes, between consecutive samples

        Returns:
        The initial index; 0 when `long_timedelta` is negative.

        Raises:
        ValueError: if `sample_timedelta` is not positive (stepping through the
        window with such a step would never finish).
    """
    step = dt.timedelta(minutes=sample_timedelta)
    if step <= dt.timedelta(0):
        raise ValueError("sample_timedelta must be a positive number of minutes")
    span = dt.timedelta(hours=long_timedelta)
    if span < dt.timedelta(0):
        return 0
    # Both ends of the span are counted, hence the + 1.
    return span // step + 1

In [None]:
def create_list_Xt(matrix: np.array, sample_timedelta: int, medium_timedelta, long_timedelta, mm: int, ll: int) -> list:
    """
        Build the (X, y) tuples for every usable column of `matrix`.

        The first column used is the one returned by `initial_index`; every
        column from there up to the last one yields one tuple from `create_Xt`.
    """
    first_index = initial_index(long_timedelta, sample_timedelta)
    total_columns = matrix.shape[1]
    return [
        create_Xt(matrix, column, sample_timedelta, medium_timedelta, long_timedelta, mm, ll)
        for column in range(first_index, total_columns)
    ]

# Methods - Separation of Data into Training, Validation and Testing Sets

In [None]:
def random_split(list_Xt: list, prob_training: float, prob_validation: float) -> tuple:
    """
      Randomly distribute the input data over training, validation and testing sets.

      One uniform draw in [0, 1) is made per element: a draw below
      `prob_training` sends the element to training, otherwise a draw below
      `prob_validation` sends it to validation, otherwise it goes to testing.
      `prob_validation` therefore acts as a cumulative threshold.

      Args:
      - list_Xt: List of input data
      - prob_training: Upper bound of the draws that select the training set
      - prob_validation: Upper bound of the draws that select the validation set

      Returns:
      A tuple (training, validation, test) of lists.
    """
    training, validation, test = [], [], []
    for element in list_Xt:
        draw = np.random.rand()
        if draw < prob_training:
            bucket = training
        elif draw < prob_validation:
            bucket = validation
        else:
            bucket = test
        bucket.append(element)
    return (training, validation, test)

In [None]:
def sequential_split(list_Xt: list, frac_training: float, frac_validation: float) -> tuple:
    """
      Split the input data, in order, in training, validation and testing sets.

      Both fractions are cumulative positions in the list: the training set is
      the first `frac_training` of the elements, the validation set runs from
      there up to `frac_validation` of the elements, and the testing set is
      whatever remains.

      Args:
      - list_Xt: List of input data
      - frac_training: Fraction of the input where the training set ends
      - frac_validation: Fraction of the input where the validation set ends
        (an end that falls before the training end gives an empty validation set)

      Returns:
      A tuple (training, validation, test) of lists.
    """
    total = len(list_Xt)
    # The validation boundary is computed from frac_validation directly:
    # subtracting the fractions first (e.g. 0.5 - 0.4 = 0.0999...) loses a
    # sample once the product is floored.
    train_end = min(max(m.floor(total * frac_training), 0), total)
    valid_end = min(max(m.floor(total * frac_validation), train_end), total)

    training = list(list_Xt[:train_end])
    validation = list(list_Xt[train_end:valid_end])
    test = list(list_Xt[valid_end:])
    return (training, validation, test)

In [None]:
def split_x_and_y(list_of_tuples: list) -> tuple:
    """
        Shuffle a list of (x, y) tuples and split it into two arrays

        The caller's list is left untouched: a copy is shuffled. A single
        shuffle is performed, since repeating it does not make the order any
        more random.

        Args:
        - list_of_tuples: List of (x, y) pairs

        Returns:
        A tuple (X, Y) of numpy arrays holding the x and the y values in the
        same, shuffled, order.
    """
    shuffled = list(list_of_tuples)
    random.shuffle(shuffled)

    sett_x = [x for x, _ in shuffled]
    sett_y = [y for _, y in shuffled]
    return (np.array(sett_x), np.array(sett_y))

# GNN Implementation

In [None]:
class Dataset_C(Dataset):
    """
        Dataset that wraps every (X, y) tuple into a Graph sharing one adjacency matrix
    """

    def __init__(self, adjacency_matrix: np.array, list_Xt: list, **kwargs):
        # Both attributes are stored before delegating to the parent
        # constructor, so they are already available to read().
        self.list_Xt = list_Xt
        self.adjacency_matrix = adjacency_matrix
        super().__init__(**kwargs)

    def read(self):
        # One Graph per tuple: node features X, the shared adjacency matrix,
        # no edge features and the expected values y as label.
        return [
            Graph(x=features, a=self.adjacency_matrix, e=None, y=target)
            for features, target in self.list_Xt
        ]

In [None]:
class LearnableMatrixMultiplicationLayer(tf.keras.layers.Layer):
    """
        Layer that right-multiplies its input by a trainable matrix

        The matrix has shape (last input dimension, channels) and no bias or
        activation is applied.

        Args:
        - channels: Size of the last dimension of the output
        - **kwargs: Standard Keras layer arguments (name, dtype, ...)
    """

    def __init__(self, channels: int, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels

    def build(self, input_shape):
        # The weight name is passed by keyword: the first positional parameter
        # of add_weight is `name` in Keras 2 but `shape` in Keras 3.
        self.kernel = self.add_weight(name="kernel", shape=[input_shape[-1], self.channels], trainable=True)

    def call(self, inputs):
        # Multiply the input tensor by the learnable tensor
        return tf.matmul(inputs, self.kernel)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from its config.
        config = super().get_config()
        config.update({"channels": self.channels})
        return config

In [None]:
class FixedMatrixMultiplicationLayer(tf.keras.layers.Layer):
    """
        Layer that right-multiplies the first of its two inputs by a fixed matrix of ones

        The matrix has shape (last dimension of the first input, channels), is
        initialised with ones and is not trainable. The second input is
        accepted but not used.

        Args:
        - channels: Size of the last dimension of the output
        - **kwargs: Standard Keras layer arguments (name, dtype, ...)
    """
    def __init__(self, channels: int, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels

    def build(self, input_shape):
        # Only the shape of the first input determines the kernel dimensions.
        input_x, _ = input_shape
        # The weight name is passed by keyword: the first positional parameter
        # of add_weight is `name` in Keras 2 but `shape` in Keras 3.
        self.kernel = self.add_weight(name="kernel", shape=[input_x[-1], self.channels], initializer=initializers.Ones(), trainable=False)

    def call(self, inputs):
        # Multiply the first input tensor by the fixed tensor of ones
        input1, _ = inputs
        return tf.matmul(input1, self.kernel)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from its config.
        config = super().get_config()
        config.update({"channels": self.channels})
        return config

In [None]:
class FixedMatrixMultiplicationLayer2(tf.keras.layers.Layer):
    """
        Layer that right-multiplies its single input by a fixed matrix of ones

        The matrix has shape (last input dimension, channels), is initialised
        with ones and is not trainable.

        Args:
        - channels: Size of the last dimension of the output
        - **kwargs: Standard Keras layer arguments (name, dtype, ...)
    """
    def __init__(self, channels: int, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels

    def build(self, input_shape):
        # The weight name is passed by keyword: the first positional parameter
        # of add_weight is `name` in Keras 2 but `shape` in Keras 3.
        self.kernel = self.add_weight(name="kernel", shape=[input_shape[-1], self.channels], initializer=initializers.Ones(), trainable=False)

    def call(self, inputs):
        # Multiply the input tensor by the fixed tensor of ones
        return tf.matmul(inputs, self.kernel)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created from its config.
        config = super().get_config()
        config.update({"channels": self.channels})
        return config

In [None]:
def rmse(y_true: np.array, y_pred: np.array) -> float:
    """
    Root Mean Squared Error, averaged over the last axis

    Size-1 dimensions are removed from the predictions before they are
    compared with the expected values.

    Args:
        y_true ([np.array]): test samples
        y_pred ([np.array]): predicted samples
    Returns:
        [float]: root mean squared error
    """
    squeezed_pred = tf.squeeze(y_pred)
    squared_diff = K.square(squeezed_pred - y_true)
    mean_squared = K.mean(squared_diff, axis=-1)
    return K.sqrt(mean_squared)

In [None]:
def nrmse(y_true: np.array, y_pred: np.array) -> float:
    """
    Normalized Root Mean Squared Error, computed over the last axis

    The root mean squared error is divided by the mean absolute value of the
    expected samples. Size-1 dimensions are removed from the predictions
    before they are compared with the expected values.

    Args:
        y_true ([np.array]): test samples
        y_pred ([np.array]): predicted samples
    Returns:
        [float]: normalized root mean squared error
    """
    squeezed_pred = tf.squeeze(y_pred)
    root_mean_squared = K.sqrt(K.mean(K.square(squeezed_pred - y_true), axis=-1))
    scale = K.mean(K.abs(y_true), axis=-1)
    return root_mean_squared / scale

In [None]:
def rmse2(y_true, y_pred):
    """
    Root Mean Squared Error over all elements

    Both arguments are converted to float32 tensors and the error is averaged
    over every element, giving a single scalar.
    """
    expected = tf.convert_to_tensor(y_true, dtype=tf.float32)
    predicted = tf.convert_to_tensor(y_pred, dtype=tf.float32)
    mean_squared = K.mean(K.square(predicted - expected))
    return K.sqrt(mean_squared)

In [None]:
def nrmse2(y_true, y_pred):
    """
    Normalized Root Mean Squared Error over all elements

    Both arguments are converted to float32 tensors; the root mean squared
    error over every element is divided by the (signed) mean of the expected
    values.
    """
    expected = tf.convert_to_tensor(y_true, dtype=tf.float32)
    predicted = tf.convert_to_tensor(y_pred, dtype=tf.float32)

    root_mean_squared = K.sqrt(K.mean(K.square(predicted - expected)))
    scale = K.mean(expected)
    return root_mean_squared / scale

In [None]:
def squared_error(y_true: np.array, y_pred: np.array) -> float:
    """
        Mean of the squared differences between expected and predicted values

        The mean is taken over every element, giving a single scalar loss.
    """
    return tf.reduce_mean(tf.square(y_true - y_pred))

In [None]:
class GNN(Model):
    """
        Graph neural network made of stacked Chebyshev-convolution blocks

        Each block runs several ChebConv layers (with K = 1 .. k_layers[i]) on
        the same input, concatenates their outputs along axis 2, multiplies the
        result by a learnable matrix and adds the block input multiplied by a
        fixed matrix of ones. A ReLU follows every block except the last one,
        which only gets it when `relu_last` is true.

        Note: despite their `int` annotations, `channels` and `k_layers` are
        indexed per block (and `len(channels)` gives the number of blocks), so
        both must be sequences with one entry per block.
    """

    def __init__(self, channels: int, k_layers: int, relu_last=False):
        super().__init__()
        self.channels = channels
        self.k_layers = k_layers
        self.num_layers = len(channels)
        self.relu_last = relu_last
        self.init_layers()

    def init_layers(self):
        # Parameter-free layers shared by every block.
        self.concatenate = Concatenate(axis=2)
        self.add = Add()
        self.relu = tf.keras.layers.Activation('relu')

        # Block i (1-based attribute names) owns one ChebConv per K in 1 .. k_layers[i].
        for block in range(self.num_layers):
            highest_order = self.k_layers[block]
            branches = [
                ChebConv(self.channels[block], K=order, activation='relu', use_bias=True)
                for order in range(1, highest_order + 1)
            ]
            setattr(self, f'cheb_stgi_l{block+1}', branches)

        self.dot_learnable_layers = [LearnableMatrixMultiplicationLayer(width) for width in self.channels]
        self.dot_fixed_layers = [FixedMatrixMultiplicationLayer(width) for width in self.channels]

    def call(self, inputs):
        # The second input is passed unchanged to every block
        # (presumably the graph adjacency / Laplacian - confirm against the loader).
        _, structure = inputs
        block_input = inputs
        result = None

        for block in range(self.num_layers):
            branches = getattr(self, f'cheb_stgi_l{block+1}')
            branch_outputs = [branch(block_input) for branch in branches]
            merged = self.concatenate(branch_outputs)
            learned = self.dot_learnable_layers[block](merged)
            shortcut = self.dot_fixed_layers[block](block_input)
            result = self.add([learned, shortcut])
            is_last_block = block >= self.num_layers - 1
            if not is_last_block or self.relu_last:
                result = self.relu(result)
            block_input = (result, structure)

        return result

# Load Data

In [None]:
# Base folder that holds the input files; an empty string means the current
# working directory.
directory = "" # fill out
# os.path.join inserts the separator itself, so `directory` may be given with
# or without a trailing slash.
file_temporal_series = os.path.join(directory, "normalized.npy")
file_transition_matrix = os.path.join(directory, "Transition Matrix", "transition_matrix.npy")

**Load Temporal Series**

In [None]:
# Read the temporal series from its .npy file.
temporal_series = np.load(file_temporal_series)
# Work in floating point regardless of the dtype stored on disk.
temporal_series = temporal_series.astype(float)
# The unpacking requires a 2-D array; `lines` is reused below as the sensor count.
lines, columns = temporal_series.shape

**Load Transition Matrix**

In [None]:
# The transition matrix is stored as a .npy file, so it is read with np.load
# (no `load_matrix` helper is defined in this notebook).
transition_matrix = np.load(file_transition_matrix)
# Compressed-sparse-row copy; this is the matrix handed to Dataset_C as adjacency.
matrix_sparse = sp.sparse.csr_matrix(transition_matrix)

# Input Data Generation

In [None]:
# Time between consecutive samples; passed on as `sample_timedelta` (minutes).
samples_interval = 5
# NOTE(review): not passed to create_list_Xt in the call below - confirm whether it is still needed.
short_samples = 2
# Span of the medium-term window; passed on as `medium_timedelta` (hours).
medium_time = 24
# Stride, in samples, of the medium-term window; passed on as `mm`.
medium_samples = 6
# Span of the long-term window; passed on as `long_timedelta` (hours).
long_time = 168
# Stride, in samples, of the long-term window; passed on as `ll`.
long_samples = 12
# Number of rows of the temporal series.
sensors = lines

In [None]:
# Build one (X, y) tuple per usable column of the temporal series.
list_Xt = create_list_Xt(temporal_series, samples_interval, medium_time, long_time, medium_samples, long_samples)
# Number of generated samples.
print(len(list_Xt))

# Model Parameters

In [None]:
# Positions, as fractions of the sample list, handed to sequential_split as
# frac_training and frac_validation.
split_array = [0.4,0.5]
# Passed to GNN as `relu_last`: whether a ReLU follows the last block.
final_relu = False
# Passed to GNN as `channels`: one entry per block.
conv_array = [1]
# Passed to GNN as `k_layers`: one entry per block.
k_array = [4]
# Initial learning rate of the Adam optimizer used for the GNN.
lr = 0.01

# Data Separation into Training, Validation and Testing Sets

In [None]:
def data_split_training(list_Xt, split_array, matrix_sparse):
    """
        Build the training Dataset_C from the sequential split of the samples.

        `split_array` holds the two fractions given to sequential_split and
        `matrix_sparse` is used as adjacency matrix of every graph.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    training_part = sequential_split(list_Xt, frac_training, frac_validation)[0]
    return Dataset_C(matrix_sparse, training_part)

In [None]:
def data_split_validation(list_Xt,split_array,matrix_sparse):
    """
        Build the validation Dataset_C from the sequential split of the samples.

        `split_array` holds the two fractions given to sequential_split and
        `matrix_sparse` is used as adjacency matrix of every graph.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    validation_part = sequential_split(list_Xt, frac_training, frac_validation)[1]
    return Dataset_C(matrix_sparse, validation_part)

In [None]:
def data_split_test(list_Xt,split_array,matrix_sparse):
    """
        Build the testing Dataset_C from the sequential split of the samples.

        `split_array` holds the two fractions given to sequential_split and
        `matrix_sparse` is used as adjacency matrix of every graph.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    test_part = sequential_split(list_Xt, frac_training, frac_validation)[2]
    return Dataset_C(matrix_sparse, test_part)

In [None]:
# Training graphs for the GNN.
dataset = data_split_training(list_Xt,split_array,matrix_sparse)
print(dataset)

In [None]:
# Validation graphs for the GNN.
dataset_validation = data_split_validation(list_Xt,split_array,matrix_sparse)
print(dataset_validation)

In [None]:
# Testing graphs for the GNN.
dataset_test = data_split_test(list_Xt,split_array,matrix_sparse)
print(dataset_test)

# GNN Training

## Hyperparameters

In [None]:
# Batch size used by the three BatchLoaders of the GNN.
batch_size_training = 1
# Number of epochs passed to model.fit.
epochs_training = 30

In [None]:
# Multiply the learning rate by 0.1 when the training loss has not improved
# for 2 epochs, without going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)
# Adam optimizer starting at the learning rate `lr`.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# TensorBoard logs are written to ./logs; histogram_freq=1 asks for histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1)

## Training

In [None]:
# GNN with one block per entry of conv_array / k_array.
model = GNN(conv_array,k_array,final_relu)
# Train on the mean squared error; also report RMSE, normalized RMSE, MAE and MAPE.
model.compile(optimizer=optimizer,loss=squared_error,metrics=[rmse,nrmse,"mae","mape"])

In [None]:
# Only the training loader shuffles its graphs.
loader = BatchLoader(dataset, batch_size=batch_size_training,shuffle=True)
loader_validation = BatchLoader(dataset_validation, batch_size=batch_size_training)
loader_test = BatchLoader(dataset_test, batch_size=batch_size_training)

In [None]:
# Wall-clock timestamps taken around the training run (the elapsed time is
# end_time - init_time; both names are overwritten by the following cells).
init_time = time.time()
# Silent training (verbose=0) with validation after every epoch.
# NOTE(review): `use_multiprocessing` / `workers` are not accepted by Model.fit
# in Keras 3 - confirm the Keras version this notebook targets.
metrics_fit = model.fit(loader.load(), use_multiprocessing=True, workers=-1, verbose=0, steps_per_epoch=loader.steps_per_epoch, epochs=epochs_training, validation_data=loader_validation.load(), validation_steps=loader_validation.steps_per_epoch, callbacks=[reduce_lr, tensorboard_callback])
end_time = time.time()

## Evaluation

In [None]:
# Timestamps around the evaluation of the GNN on the validation graphs.
init_time = time.time()
# One pass over the validation loader.
metrics_validation = model.evaluate(loader_validation.load(), steps=loader_validation.steps_per_epoch)
end_time = time.time()

In [None]:
# Timestamps around the evaluation of the GNN on the testing graphs.
init_time = time.time()
# One pass over the testing loader.
metrics_testing = model.evaluate(loader_test.load(), steps=loader_test.steps_per_epoch)
end_time = time.time()

# Data Separation for Linear Regression and FCN

In [None]:
def data_split_training(list_Xt,split_array):
    """
        Return the training part of the sequential split as (x, y) arrays.

        `split_array` holds the two fractions given to sequential_split; the
        selected part is converted with split_x_and_y.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    training_part = sequential_split(list_Xt, frac_training, frac_validation)[0]
    x_part, y_part = split_x_and_y(training_part)
    return (x_part, y_part)

In [None]:
def data_split_validation(list_Xt,split_array):
    """
        Return the validation part of the sequential split as (x, y) arrays.

        `split_array` holds the two fractions given to sequential_split; the
        selected part is converted with split_x_and_y.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    validation_part = sequential_split(list_Xt, frac_training, frac_validation)[1]
    x_part, y_part = split_x_and_y(validation_part)
    return (x_part, y_part)

In [None]:
def data_split_test(list_Xt,split_array):
    """
        Return the testing part of the sequential split as (x, y) arrays.

        `split_array` holds the two fractions given to sequential_split; the
        selected part is converted with split_x_and_y.
    """
    frac_training, frac_validation = split_array[0], split_array[1]
    test_part = sequential_split(list_Xt, frac_training, frac_validation)[2]
    x_part, y_part = split_x_and_y(test_part)
    return (x_part, y_part)

In [None]:
def data_training(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Generate the samples from `matrix_flow` and return their training part.

        The samples come from create_list_Xt and the result is whatever
        data_split_training returns for them.
    """
    samples = create_list_Xt(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples)
    training_data = data_split_training(samples, split_array)
    return training_data

In [None]:
def data_validation(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Generate the samples from `matrix_flow` and return their validation part.

        The samples come from create_list_Xt and the result is whatever
        data_split_validation returns for them.
    """
    samples = create_list_Xt(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples)
    validation_data = data_split_validation(samples, split_array)
    return validation_data

In [None]:
def data_test(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples, split_array):
    """
        Generate the samples from `matrix_flow` and return their testing part.

        The samples come from create_list_Xt and the result is whatever
        data_split_test returns for them.
    """
    samples = create_list_Xt(matrix_flow, samples_interval, medium_time, long_time, medium_samples, long_samples)
    test_data = data_split_test(samples, split_array)
    return test_data

In [None]:
# Array versions of the three sets, used by the linear regression and the FCN.
# These calls use the two-argument helpers defined just above, which replace
# the three-argument ones used for the GNN.
x_training, y_training = data_split_training(list_Xt,split_array)
x_validation, y_validation = data_split_validation(list_Xt,split_array)
x_test, y_test = data_split_test(list_Xt,split_array)

# Linear Regression Training

## Hyperparameters

In [None]:
# Batch size and number of epochs for the linear regression.
batch_size_training = 1
epochs_training = 30
# Fresh learning-rate schedule: multiply by 0.1 after 2 epochs without
# improvement of the training loss, without going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)

## Linear Regression Implementation

In [None]:
# Shape of one input sample (requires 2-D samples); used as the model input shape.
lin, col = x_training[0].shape

In [None]:
# Linear regression: a single Dense unit without activation, applied to the
# last axis of each (lin, col) sample.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(lin, col)),  
    tf.keras.layers.Dense(1)                
])

In [None]:
# Adam with its default settings; same loss and metrics as the GNN.
model.compile(optimizer='adam', loss=squared_error,metrics=[rmse,nrmse,"mae","mape"])

## Training

In [None]:
# Timestamps around the training of the linear regression.
init_time = time.time()
# Silent training (verbose=0); the TensorBoard callback created for the GNN is
# reused, so the logs go to the same ./logs directory.
metrics_fit = model.fit(x_training, y_training, epochs=epochs_training, batch_size=batch_size_training, validation_data=(x_validation, y_validation), verbose=0, callbacks=[reduce_lr,tensorboard_callback])
end_time = time.time()

## Evaluation

In [None]:
# Timestamps around the evaluation of the linear regression on the validation set.
init_time = time.time()
metrics_validation = model.evaluate(x_validation,y_validation)
end_time = time.time()

In [None]:
# Timestamps around the evaluation of the linear regression on the testing set.
init_time = time.time()
metrics_testing = model.evaluate(x_test,y_test)
end_time = time.time()

# FCN Training

## Hyperparameters

In [None]:
# Batch size and number of epochs for the fully connected network.
batch_size_training = 1
epochs_training = 30
# Fresh learning-rate schedule: multiply by 0.1 after 2 epochs without
# improvement of the training loss, without going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, min_lr=0.00001)

## FCN Implementation

In [None]:
# Shape of one input sample (requires 2-D samples); used as the model input shape.
lin, col = x_training[0].shape

In [None]:
# Fully connected network: a 128-unit ReLU hidden layer, 20% dropout and a
# single linear output unit, all applied to the last axis of each sample.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(lin,col)),              
    tf.keras.layers.Dense(128, activation='relu'),  
    tf.keras.layers.Dropout(0.2),                  
    tf.keras.layers.Dense(1)                     
])

In [None]:
# Adam with its default settings; the built-in 'mse' loss is used here instead
# of the custom squared_error function.
model.compile(optimizer='adam', loss='mse',metrics=[rmse,nrmse,"mae","mape"])

## Training

In [None]:
# Timestamps around the training of the fully connected network.
init_time = time.time()
# Silent training (verbose=0); the TensorBoard callback created for the GNN is
# reused, so the logs go to the same ./logs directory.
metrics_fit = model.fit(x_training, y_training, epochs=epochs_training, batch_size=batch_size_training, validation_data=(x_validation, y_validation), verbose=0, callbacks=[reduce_lr,tensorboard_callback])
end_time = time.time()

## Evaluation

In [None]:
# Timestamps around the evaluation of the fully connected network on the validation set.
init_time = time.time()
metrics_validation = model.evaluate(x_validation,y_validation)
end_time = time.time()

In [None]:
# Timestamps around the evaluation of the fully connected network on the testing set.
init_time = time.time()
metrics_testing = model.evaluate(x_test,y_test)
end_time = time.time()