In [1]:
# Model Architecture Params
input_seq_length = 8
output_seq_length = 12
seq_length = input_seq_length + output_seq_length
num_features = 2
hidden_dim = 128
embedding_dim = 64
grad_clip = 10.0

# Discretization
neighborhood_size = 32
# grid_ratio = 0.001
grid_size = 0.5
# spatial_pooling_size = 32
# pooling_window_size = (8, 8)

# Training Params
num_epochs = 5
batch_size = 5
learning_rate = 0.005
decay_rate = 0.95
dropout_keep_prob = 0.8
dropout = 0.5

In [2]:
from dataloader import DataLoader

file_path = "../..//data/eth/hotel/pixel_pos_interpolatae.csv"
dataset_name = "eth_hotel"
file_path_processed = "../..//data/preprocessed/"

dataloader = DataLoader(file_path, dataset_name, file_path_processed, batch_size=batch_size)

Load preprocessed data


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, ReLU, LSTMCell, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop

import time

# Check TensorFlow version
tf_version = tf.__version__
print("TensorFlow version:", tf_version)

# Check if there is a GPU available
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

TensorFlow version: 2.8.0
GPU is not available


In [4]:
import itertools

def get_occupancy_map(dense_representation, ped_id_to_index_map, num_peds_per_seq, ped_per_frame, is_animate=False):
    """_summary_

    Args:
        dense_representation (numpy.ndarray): _description_
        ped_id_to_index_map (dict): _description_
        num_peds_per_seq (_type_): _description_
        ped_per_frame (_type_): _description_

    Returns:
        occupancy_grid (lsit(tf.Variable)): occupancy_grid[i] with shape (num_peds_per_seq, num_peds_per_seq, neighborhood_size**2)
    """
    grid_size_half = grid_size / 2.0

    if is_animate:
        plot_animation(dense_representation, ped_id_to_index_map, [1,3])

    occupancy_grid = []

    for frame_id in range(seq_length):
        occupancy_grid_frame = np.zeros((num_peds_per_seq, num_peds_per_seq, neighborhood_size, neighborhood_size))
        
        ped_indices = [ped_id_to_index_map[ped_id] for ped_id in ped_per_frame[frame_id]]

        for ped_i, ped_j in itertools.permutations(ped_indices, 2):
            ped_i_y, ped_i_x = dense_representation[frame_id, ped_i]
            ped_j_y, ped_j_x = dense_representation[frame_id, ped_j]
            ped_i_neighbor_y_low, ped_i_neighbor_x_low = ped_i_y - grid_size_half, ped_i_x - grid_size_half
            ped_i_neighbor_y_high, ped_i_neighbor_x_high = ped_i_y + grid_size_half - 1, ped_i_x + grid_size_half - 1

            if ped_j_y >= ped_i_neighbor_y_low and ped_j_y <= ped_i_neighbor_y_high and ped_j_x >= ped_i_neighbor_x_low and ped_j_x <= ped_i_neighbor_x_high:
                # ped_j in ped_i neighborhood
                cell_y = int(np.floor((ped_j_y - ped_i_neighbor_y_low) / grid_size * neighborhood_size))
                cell_x = int(np.floor((ped_j_x - ped_i_neighbor_x_low) / grid_size * neighborhood_size))
                occupancy_grid_frame[ped_i, ped_j, cell_y, cell_x] = 1

        occupancy_grid.append(tf.Variable(tf.reshape(occupancy_grid_frame, (num_peds_per_seq, num_peds_per_seq, -1))))

    return occupancy_grid

In [5]:
class SocialLSTM(Model):
    def __init__(self, embedding_dim=64, hidden_dim=128, pool_size=(8,8), neighbors=32, num_features=2, out_features=5, dropout=0.5):
        super(SocialLSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.input_embedding_layer = Dense(embedding_dim)
        self.social_embedding_layer = Dense(embedding_dim)  # TODO
        self.lstm = LSTMCell(hidden_dim)
        self.output_embedding_layer = Dense(out_features)
        self.dropout_layer = Dropout(dropout)

        self.relu = ReLU()


    def call(self, input, lstm_hidden, lstm_cell, occupancy_grid):
        # occupancy_grid: (seq_length, num_peds_per_seq, num_peds_per_seq, neighborhood_size*neighborhood_size)
        out_split = tf.split(tf.zeros(input.shape), num_or_size_splits=input.shape[0], axis=0)
                    
        for frame_id in range(input.shape[0]):
            # Input embedding
            input_t = input[frame_id]
            input_embed_out = self.input_embedding_layer(input_t)
            input_embed_out = self.relu(input_embed_out)
            # input_embed_out = self.dropout_layer(input_embed_out)

            # Social embedding
            occupancy_grid_frame = occupancy_grid[frame_id]
            occupancy_grid_frame_float32 = tf.cast(occupancy_grid_frame, dtype=tf.float32)  # cast from float64 to float32

            result = tf.einsum('ijk,jx->ikx', occupancy_grid_frame_float32, lstm_hidden)    # (num_peds_per_seq, num_peds_per_seq, neighborhood_size**2), (num_peds_per_seq, hidden_dim) -> (num_peds_per_seq, neighborhood_size**2, hidden_dim)
            result = tf.reshape(result, (result.shape[0], -1))

            social_embed_out = self.social_embedding_layer(result)
            social_embed_out = self.relu(social_embed_out)
            # social_embed_out = self.dropout_layer(social_embed_out)

            # Concat input embedding and social embedding outputs
            embed_out = tf.concat([input_embed_out, social_embed_out], axis=1)

            # LSTM
            lstm_out, new_state = self.lstm(embed_out, (lstm_hidden, lstm_cell))
            lstm_hidden, lstm_cell = new_state
            
            # Output embedding
            out_t = self.output_embedding_layer(lstm_out)

            out_t = tf.expand_dims(out_t, axis=0)

            out_split[frame_id] = out_t
        
        out = tf.concat(out_split, axis=0)
        return out

In [6]:
def mse_loss(y_true, y_pred, ped_id_to_index_map):
    """
    Calculate the Mean Squared Error (MSE) loss between predicted and true positions.

    Args:
        y_true (list): True pedestrian positions of shape (num_sequences, np.array(num_pedestrians, 3)).
        y_pred (tf.Tensor): Predicted pedestrian positions of shape (num_sequences, num_pedestrians, 2).
        ped_id_to_index_map (dict): Dictionary mapping pedestrian IDs to their indices.

    Returns:
        tf.Tensor: Mean Squared Error loss.
    """
    num_seq, num_ped = len(y_true), len(ped_id_to_index_map)
    y_true_dense_representation = np.zeros((num_seq, num_ped, 2))
    for sequence_idx in range(len(y_true)):
        indices = [ped_id_to_index_map[x] for x in y_true[sequence_idx][:, 0] if x in ped_id_to_index_map.keys()]
        if not indices:
            continue
        y_true_dense_representation[sequence_idx, indices, :] = y_true[sequence_idx][:, 1:3]
    
    return -tf.math.reduce_mean(y_true_dense_representation-y_pred)

def gaussian_loss(y_true, y_out, ped_id_to_index_map, input_seq_length = 8, output_seq_length = 12):
    # y_out: shape (seq_length, num_ped, 5)
    num_seq, num_ped = len(y_true), len(ped_id_to_index_map)
    y_true_dense_representation = np.zeros((num_seq, num_ped, 2))
    for sequence_idx in range(len(y_true)):
        indices = [ped_id_to_index_map[x] for x in y_true[sequence_idx][:, 0] if x in ped_id_to_index_map.keys()]
        if not indices:
            continue
        y_true_dense_representation[sequence_idx, indices, :] = y_true[sequence_idx][:, 1:3]
    
    mu_y, mu_x, sigma_y, sigma_x, rho = y_out[:,:,0], y_out[:,:,1], y_out[:,:,2], y_out[:,:,3], y_out[:,:,4]
    
    y_offset = y_true_dense_representation[:,:,0] - mu_y
    x_offset = y_true_dense_representation[:,:,1] - mu_x

    z = (x_offset/sigma_x)**2 + (y_offset/sigma_y)**2 - 2.0*rho*x_offset*y_offset/(sigma_x*sigma_y)
    constant = - 1.0 / (2.0 * (1 - rho**2))

    epsilon = 1e-20
    result = tf.math.exp(constant*z) / (2*tf.constant(np.pi)*sigma_x*sigma_y*tf.math.sqrt(1-rho**2))
    result = -tf.math.log(tf.clip_by_value(result, clip_value_min=epsilon, clip_value_max=tf.float32.max))

    # Sum over all pedestrians
    # result = constant * z - 0.5 * tf.math.log(2 * tf.constant(np.pi) * sigma_x**2 * sigma_y**2)
    # result = tf.reduce_sum(result, axis=1) / num_ped

    loss = tf.reduce_sum(result[input_seq_length:input_seq_length+output_seq_length]) / output_seq_length

    return loss


In [11]:
from utils import mse_loss
import time

device = '/CPU:0'
if tf.config.list_physical_devices('GPU'):
    device = '/GPU:0'
    print("Using GPU...")

dataloader.reset_frame_ptr()
with tf.device(device):

    # Declare model and optimizer
    model = SocialLSTM()
    lr = tf.Variable(learning_rate, trainable=False)
    optimizer = RMSprop(lr, clipvalue=grad_clip)

    # For each epoch
    for epoch in range(num_epochs):
        epoch_loss = 0.0      

        # For each batch  
        for batch in dataloader.generate_batches():
            start_time = time.time()
            inputs, targets, batch_ped_indices = batch
            if not inputs or not targets:
                # Traverse to the end
                break

            # ped_per_frame = dataloader.ped_indices
            batch_loss = 0.0

            # For each sequence
            for sequence_idx in range(batch_size):
                input, target, ped_per_frame = inputs[sequence_idx], targets[sequence_idx], batch_ped_indices[sequence_idx]

                dense_representation, ped_id_to_index_map = dataloader.convert_to_dense_representation(input) 
                dense_representation_tf = tf.Variable(tf.convert_to_tensor(dense_representation))
                num_peds_per_frame = len(ped_id_to_index_map)

                # Create occupancy grid
                occupancy_grid = get_occupancy_map(dense_representation, ped_id_to_index_map, num_peds_per_frame, ped_per_frame)
                
                # Initialize LSTM params
                lstm_hidden = tf.Variable(tf.random.truncated_normal((num_peds_per_frame, hidden_dim), stddev=0.1))
                lstm_cell = tf.Variable(tf.random.truncated_normal((num_peds_per_frame, hidden_dim), stddev=0.1))


                with tf.GradientTape() as tape:
                    # Forward pass
                    out = model(dense_representation_tf, lstm_hidden, lstm_cell, occupancy_grid)
                    loss = gaussian_loss(target, out, ped_id_to_index_map)
                    loss += 0.0005 * sum(tf.reduce_sum(tf.square(vars)) for vars in model.trainable_variables)    
                batch_loss += loss


                # Compute gradients
                grads = tape.gradient(loss, model.trainable_variables)
                # print(grads)

                # Clip gradients by norm
                grads, _ = tf.clip_by_global_norm(grads, grad_clip)

                
                # Update parameters
                trainable_variables = model.trainable_variables
                optimizer.apply_gradients(zip(grads, trainable_variables))
                print("one step loses: ", loss)
            print("total batch loss: ", batch_loss)
            batch_loss /= batch_size
        epoch_loss += batch_loss
        batch_time = time.time() - start_time
        print("Batch time: ", batch_time)
            
    print('(epoch {}/{}), train_loss = {:.3f}'.format(
                epoch,
                num_epochs,
                epoch_loss))


one step loses:  tf.Tensor(391.14987, shape=(), dtype=float32)
one step loses:  tf.Tensor(102.831406, shape=(), dtype=float32)
one step loses:  tf.Tensor(23.649584, shape=(), dtype=float32)
one step loses:  tf.Tensor(96.41806, shape=(), dtype=float32)
one step loses:  tf.Tensor(1.3696456, shape=(), dtype=float32)
total batch loss:  tf.Tensor(615.4186, shape=(), dtype=float32)
one step loses:  tf.Tensor(176.33502, shape=(), dtype=float32)
one step loses:  tf.Tensor(199.60292, shape=(), dtype=float32)
one step loses:  tf.Tensor(10.414954, shape=(), dtype=float32)
one step loses:  tf.Tensor(13.583582, shape=(), dtype=float32)
one step loses:  tf.Tensor(291.2459, shape=(), dtype=float32)
total batch loss:  tf.Tensor(691.1824, shape=(), dtype=float32)
one step loses:  tf.Tensor(-1.6579301, shape=(), dtype=float32)
one step loses:  tf.Tensor(95.40254, shape=(), dtype=float32)
one step loses:  tf.Tensor(205.03874, shape=(), dtype=float32)
one step loses:  tf.Tensor(45.029385, shape=(), dtype=

KeyboardInterrupt: 

In [15]:
from utils import mse_loss

device = '/CPU:0'
if tf.config.list_physical_devices('GPU'):
    device = '/GPU:0'
    print("Using GPU...")

dataloader.reset_frame_ptr()
with tf.device(device):

    # Declare model and optimizer
    model = SocialLSTM()
    lr = tf.Variable(learning_rate, trainable=False)
    optimizer = RMSprop(lr, clipvalue=grad_clip)

    # For each epoch
    for epoch in range(num_epochs):
        epoch_loss = 0.0      

        # For each batch  
        for batch in dataloader.generate_batches():
            inputs, targets, batch_ped_indices = batch
            if not inputs or not targets:
                # Traverse to the end
                break
            
            with tf.GradientTape() as tape:
                
                # ped_per_frame = dataloader.ped_indices
                batch_loss = 0.0

                # For each sequence
                for sequence_idx in range(batch_size):
                    input, target, ped_per_frame = inputs[sequence_idx], targets[sequence_idx], batch_ped_indices[sequence_idx]

                    dense_representation, ped_id_to_index_map = dataloader.convert_to_dense_representation(input) 
                    dense_representation_tf = tf.constant(tf.convert_to_tensor(dense_representation))
                    num_peds_per_frame = len(ped_id_to_index_map)

                    # Create occupancy grid
                    occupancy_grid = get_occupancy_map(dense_representation, ped_id_to_index_map, num_peds_per_frame, ped_per_frame)
                    
                    # Initialize LSTM params
                    lstm_hidden = tf.Variable(tf.random.truncated_normal((num_peds_per_frame, hidden_dim), stddev=0.1))
                    lstm_cell = tf.Variable(tf.random.truncated_normal((num_peds_per_frame, hidden_dim), stddev=0.1))



                    
                    # Forward pass
                    out = model(dense_representation_tf, lstm_hidden, lstm_cell, occupancy_grid)
                    loss = gaussian_loss(target, out, ped_id_to_index_map)
                    batch_loss += loss
                
            
                batch_loss += 0.0005 * sum(tf.reduce_sum(tf.square(vars)) for vars in model.trainable_variables)    
                
            # Compute gradients
            grads = tape.gradient(batch_loss, model.trainable_variables)
            # print(grads)

            # Clip gradients by norm
            grads, _ = tf.clip_by_global_norm(grads, grad_clip)

            
            # Update parameters
            trainable_variables = model.trainable_variables
            optimizer.apply_gradients(zip(grads, trainable_variables))

            batch_loss /= batch_size
            print("batch_loss", batch_loss)
        epoch_loss += batch_loss
            
        print('(epoch {}/{}), train_loss = {:.3f}'.format(
                    epoch,
                    num_epochs,
                    epoch_loss))


batch_loss tf.Tensor(278.7958, shape=(), dtype=float32)
batch_loss tf.Tensor(15.299756, shape=(), dtype=float32)
batch_loss tf.Tensor(15.72596, shape=(), dtype=float32)
batch_loss tf.Tensor(117.12273, shape=(), dtype=float32)
batch_loss tf.Tensor(120.94258, shape=(), dtype=float32)
batch_loss tf.Tensor(6.3819237, shape=(), dtype=float32)
batch_loss tf.Tensor(60.49824, shape=(), dtype=float32)
batch_loss tf.Tensor(1.8956293, shape=(), dtype=float32)
batch_loss tf.Tensor(210.36592, shape=(), dtype=float32)
batch_loss tf.Tensor(14.9192295, shape=(), dtype=float32)
batch_loss tf.Tensor(1.4831622, shape=(), dtype=float32)
batch_loss tf.Tensor(20.914919, shape=(), dtype=float32)
batch_loss tf.Tensor(1.7142359, shape=(), dtype=float32)
batch_loss tf.Tensor(0.045264922, shape=(), dtype=float32)
batch_loss tf.Tensor(-0.43390432, shape=(), dtype=float32)
batch_loss tf.Tensor(46.42433, shape=(), dtype=float32)
batch_loss tf.Tensor(46.11577, shape=(), dtype=float32)
batch_loss tf.Tensor(32.57626, 