In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import torch


from tqdm import tqdm

In [47]:
import sys
sys.path.append('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/src/MA_Diffusion_base_trajectory_prediction')

from utils.data_utils import TDRIVE, GEOLIFE, load_data, calculate_bbox_and_filter, \
    plot_coordinates, plot_paths, load_new_format, find_cycles, split_cycle_in_paths, \
    plot_histograms_before_after_split, \
    get_edge_used_by_trajectories, modify_and_save_data

# Data Loading

In [48]:
# Root directory that holds the cleaned and filtered HDF5 datasets.
# Change this single value when the data lives somewhere else.
DATA_DIR = '/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data'

# Paths to cleaned and filtered data
GEOLIFE_PATH = f'{DATA_DIR}/geolife.h5'
TDRIVE_PATH = f'{DATA_DIR}/tdrive.h5'
MUNICH_PATH = f'{DATA_DIR}/munich.h5'
# pNEUMA dataset is not in the correct format yet
PNEUMA_PATH = f'{DATA_DIR}/pNEUMA.h5'

In [73]:
def load_new_format(file_path, edge_features, device):
    """Load the road graph, the trajectories and (optionally) road types from an HDF5 file.

    Args:
        file_path: Path to an HDF5 file with a ``graph`` group (datasets
            ``node_coordinates`` and ``edges``; ``edge_features/highway`` when road
            types are requested) and a ``trajectories`` group holding one sub-group
            per trajectory.
        edge_features: Container of requested edge feature names. Only the entry
            ``'road_type'`` changes what is returned.
        device: Torch device on which every tensor is created.

    Returns:
        ``(paths, nodes, edges, edge_coordinates)`` and, if ``'road_type'`` is in
        ``edge_features``, a fifth element with the one-hot encoded road type of
        every edge.

        * ``paths``: one dict per trajectory with the tensors stored under
          ``coordinates``, ``edge_idxs`` and ``edge_orientations`` (those present).
        * ``nodes``: list of ``(index, {'pos': tensor})`` tuples.
        * ``edges``: list of tuples, one per row of the ``edges`` dataset.
        * ``edge_coordinates``: ``node_coordinates`` indexed by the ``edges`` dataset.

    Raises:
        KeyError: If ``'road_type'`` is requested but the file has no
            ``graph/edge_features/highway`` dataset. (Previously this case fell
            through and returned ``None``.)
    """
    wanted_attrs = ('coordinates', 'edge_idxs', 'edge_orientations')

    with h5py.File(file_path, 'r') as new_hf:
        graph = new_hf['graph']
        node_coordinates = torch.tensor(graph['node_coordinates'][:], dtype=torch.float, device=device)

        # Rescale the first two coordinate columns to [0, 1] when any coordinate
        # exceeds 1, i.e. when the stored values are not normalized yet.
        if node_coordinates.max() > 1:
            max_values = node_coordinates.max(0)[0]
            min_values = node_coordinates.min(0)[0]
            node_coordinates[:, 0] = (node_coordinates[:, 0] - min_values[0]) / (max_values[0] - min_values[0])
            node_coordinates[:, 1] = (node_coordinates[:, 1] - min_values[1]) / (max_values[1] - min_values[1])

        edge_array = graph['edges'][:]
        edge_coordinates = node_coordinates[edge_array]
        # clone().detach() copies each row without the UserWarning that
        # torch.tensor(<tensor>) emits for copy-construction.
        nodes = [(i, {'pos': pos.clone().detach()}) for i, pos in enumerate(node_coordinates)]
        edges = [tuple(edge) for edge in edge_array]

        paths = []
        for key in tqdm(new_hf['trajectories'].keys()):
            path_group = new_hf['trajectories'][key]
            paths.append({
                attr: torch.tensor(path_group[attr][()], device=device)
                for attr in path_group.keys()
                if attr in wanted_attrs
            })

        if 'road_type' not in edge_features:
            return paths, nodes, edges, edge_coordinates

        edge_feature_group = graph['edge_features']
        if 'highway' not in edge_feature_group.keys():
            raise KeyError(
                f"'road_type' was requested but {file_path!r} has no "
                "'graph/edge_features/highway' dataset"
            )
        raw_road_types = edge_feature_group['highway'][()]

    # Each entry is a byte string; drop its first and last character and keep only
    # the first comma-separated value.
    # NOTE(review): presumably the stripped characters are enclosing delimiters
    # (e.g. brackets or quotes) - confirm against the file format.
    road_type_clean = [
        byte_string.decode('utf-8')[1:-1].split(',')[0] for byte_string in raw_road_types
    ]

    # Sort the labels so the label -> column mapping is identical on every run
    # (iteration order of a set of strings is not stable between interpreter runs).
    unique_labels = sorted(set(road_type_clean))
    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

    integer_encoded_tensor = torch.tensor(
        [label_to_index[label] for label in road_type_clean], dtype=torch.long, device=device
    )
    onehot_encoded_road_type = torch.nn.functional.one_hot(
        integer_encoded_tensor, num_classes=len(unique_labels)
    )
    return paths, nodes, edges, edge_coordinates, onehot_encoded_road_type

In [74]:
# Load the T-Drive file on the CPU. Because 'road_type' is requested, load_new_format
# returns a fifth value (`roads`) with the one-hot encoded road type per edge.
paths, node_coord, edges, edge_coordinates, roads = load_new_format(TDRIVE_PATH, ['coord', 'road_type'], device='cpu')

  nodes = [(i, {'pos': torch.tensor(pos, device=device)}) for i, pos in enumerate(node_coordinates)]
100%|██████████| 7218/7218 [00:06<00:00, 1054.71it/s]


In [75]:
print(roads.shape)

torch.Size([16784, 9])


## Features to include
### T-Drive
highway,
lanes (mostly 0),
length (mostly nan),
maxspeed (mostly nan)
### pNEUMA
highway,
lanes,
length,
maxspeed,
tunnel (maybe)
### Geolife
highway,
lanes,
length,
maxspeed
### Munich
None\

# --> Only use highway

In [11]:
print(paths[0])

{'coordinates': tensor([[0.7536, 0.5798],
        [0.7536, 0.5799],
        [0.7536, 0.5800],
        [0.7536, 0.5798],
        [0.7535, 0.5800],
        [0.7349, 0.5560],
        [0.7642, 0.4845],
        [0.7624, 0.3738],
        [0.7354, 0.3724],
        [0.7335, 0.3581],
        [0.7338, 0.3530],
        [0.7338, 0.3540],
        [0.7339, 0.3562],
        [0.7339, 0.3573]], dtype=torch.float64), 'edge_idxs': tensor([15116, 15115, 15117,   470,   471, 16749,   462,   466, 11607, 11606,
        11486, 11122,  5276,  3908,  2138,  2137,  4117,  4116,  8745,  9139,
         9138,  1169,  1170,  1541,  1540,  5274,    65,    67,   550,  3004,
         3001,  8710,  8709, 14955,  9668]), 'edge_orientations': tensor([-1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1,
         1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1])}


In [None]:
# The load_new_format defined earlier in this notebook takes
# (file_path, edge_features, device) and returns four values when no road types are
# requested, so pass all three arguments and unpack all four results.
# NOTE(review): `node_coordinates` is the list of (index, {'pos': tensor}) node tuples
# returned by that function - confirm that plot_paths accepts this layout.
paths, node_coordinates, edges, edge_coordinates = load_new_format(TDRIVE_PATH, [], device='cpu')

In [None]:
print()

In [None]:
# Size of the graph
num_edges = len(edges)
print(f"Number of Edges: {num_edges}")

num_nodes = len(node_coordinates)
print(f"Number of Nodes: {num_nodes}")

# Mean number of edges per path, rounded to two decimals
path_lengths = [len(trajectory['edge_idxs']) for trajectory in paths]
avg_length = round(sum(path_lengths) / len(path_lengths), 2)
print("Average length: {} edges".format(avg_length))

# Pick one path at random; `path` holds its index (it is reused by the plotting cell)
path = np.random.randint(len(paths))
print('Exemplary Path: {}'.format(path))
print(paths[path])

# The first three headings are printed without values; only the edge indexes follow
print('\nCoordinates:', '\nTimestamps:', '\nTaxi Index:', '\nEdge Indexes:', sep='\n')
print(paths[path]['edge_idxs'])

In [None]:
plot_paths(paths, node_coordinates, edges, num_paths_to_plot=4, random=False, start_id=path)

# Filtering

## Cycle Filtering

In [None]:
# find_cycles is imported from utils.data_utils. Its result is later zipped with
# `paths` and used as a filter, so presumably it holds one truthy/falsy flag per
# path - TODO confirm against the helper.
cycles = find_cycles(paths)
print("Number of paths with cycles:", np.sum(cycles))

### Plot Paths with Cycles

In [None]:
# Keep only the paths whose entry in `cycles` is truthy, then plot them
paths_with_cycles = [candidate for candidate, has_cycle in zip(paths, cycles) if has_cycle]
plot_paths(paths_with_cycles, node_coordinates, edges)

### Split paths with cycles

In [None]:
# Split the paths with split_cycle_in_paths and report how the path count changed
split_paths = split_cycle_in_paths(paths)
print(
    f"Number of paths before split: {len(paths)}, "
    f"after split: {len(split_paths)}, "
    f"ratio: {len(split_paths) / len(paths)}"
)
# Re-run the cycle check on the split result
print("Number of paths with cycles after split:", np.sum(find_cycles(split_paths)))

### Plot Split Paths

In [None]:
plot_paths(split_paths, node_coordinates, edges)

## Remove Short Paths (<5 edges)

In [None]:
# Number of edges of every path, before and after the cycle split
num_edges_before_split = np.array([len(trajectory['edge_idxs']) for trajectory in paths])
num_edges_after_split = np.array([len(trajectory['edge_idxs']) for trajectory in split_paths])

# Count and share of paths with fewer than 5 edges in each set
print(
    f"Number of paths smaller than 5 before split: {np.sum(num_edges_before_split < 5)} "
    f"out of {len(num_edges_before_split)}, "
    f"ratio: {np.sum(num_edges_before_split < 5) / len(num_edges_before_split)}"
)
print(
    f"Number of paths smaller than 5 after split: {np.sum(num_edges_after_split < 5)} "
    f"out of {len(num_edges_after_split)}, "
    f"ratio: {np.sum(num_edges_after_split < 5) / len(num_edges_after_split)}"
)

In [None]:
# Minimum number of edges a path must have to be kept
path_length_cutoff_value = 5
longer_paths = [path for path in split_paths if len(path['edge_idxs']) >= path_length_cutoff_value]
# The filter keeps paths with exactly `path_length_cutoff_value` edges as well, so the
# message says "at least" rather than "longer than".
print(
    f"Number of paths with at least {path_length_cutoff_value} edges: {len(longer_paths)} "
    f"out of {len(split_paths)}, ratio: {len(longer_paths) / len(split_paths)}"
)

In [None]:
import h5py
import torch
from tqdm import tqdm
def load_new_format(file_path, device):
    """Load the road graph and the trajectories from an HDF5 file.

    Args:
        file_path: Path to an HDF5 file with a ``graph`` group (datasets
            ``node_coordinates`` and ``edges``) and a ``trajectories`` group holding
            one sub-group per trajectory.
        device: Torch device on which every tensor is created.

    Returns:
        ``(paths, nodes, edges, edge_coordinates)`` where

        * ``paths`` is a list with one dict per trajectory, holding the tensors stored
          under ``coordinates``, ``edge_idxs`` and ``edge_orientations`` (those present),
        * ``nodes`` is a list of ``(index, {'pos': tensor})`` tuples,
        * ``edges`` is a list of tuples, one per row of the ``edges`` dataset,
        * ``edge_coordinates`` is ``node_coordinates`` indexed by the ``edges`` dataset.
    """
    wanted_attrs = ('coordinates', 'edge_idxs', 'edge_orientations')
    paths = []

    with h5py.File(file_path, 'r') as new_hf:
        node_coordinates = torch.tensor(new_hf['graph']['node_coordinates'][:], dtype=torch.float, device=device)

        # Rescale the first two coordinate columns to [0, 1] when any coordinate
        # exceeds 1, i.e. when the stored values are not normalized yet.
        if node_coordinates.max() > 1:
            max_values = node_coordinates.max(0)[0]
            min_values = node_coordinates.min(0)[0]
            node_coordinates[:, 0] = (node_coordinates[:, 0] - min_values[0]) / (max_values[0] - min_values[0])
            node_coordinates[:, 1] = (node_coordinates[:, 1] - min_values[1]) / (max_values[1] - min_values[1])

        edge_array = new_hf['graph']['edges'][:]
        edge_coordinates = node_coordinates[edge_array]
        # clone().detach() copies each row without the UserWarning that
        # torch.tensor(<tensor>) emits for copy-construction.
        nodes = [(i, {'pos': pos.clone().detach()}) for i, pos in enumerate(node_coordinates)]
        edges = [tuple(edge) for edge in edge_array]

        for key in tqdm(new_hf['trajectories'].keys()):
            path_group = new_hf['trajectories'][key]
            paths.append({
                attr: torch.tensor(path_group[attr][()], device=device)
                for attr in path_group.keys()
                if attr in wanted_attrs
            })

    return paths, nodes, edges, edge_coordinates
    
# paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/tdrive_train.h5', 'cpu')

In [None]:
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/tdrive_train.h5', 'cpu')

In [None]:
import matplotlib.pyplot as plt

# Load the T-Drive training split and collect the number of edges of every path
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/tdrive_train.h5', 'cpu')
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]

# Distribution of path lengths
plt.hist(lens)
plt.title('Histogram of Path lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Load the Geolife training split and collect the number of edges of every path
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/geolife_train.h5', 'cpu')
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]

# Distribution of path lengths
plt.hist(lens)
plt.title('Histogram of Path lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Load the pNEUMA training split and collect the number of edges of every path
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/pneuma_train.h5', 'cpu')
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]

# Distribution of path lengths
plt.hist(lens)
plt.title('Histogram of Path lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Load the Munich training split and collect the number of edges of every path
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/munich_train.h5', 'cpu')
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]

# Distribution of path lengths
plt.hist(lens)
plt.title('Histogram of Path lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
print(nodes)

In [None]:
print(edges)

In [None]:
print(edge_coordinates)

In [None]:
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/munich_train.h5', 'cpu')

In [None]:
print(paths)

In [None]:
print(nodes)

In [None]:
print(edges)

In [None]:
print(edge_coordinates.tolist())

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import torch
from tqdm import tqdm

def load_new_format(file_path, edge_features, device):
    """Load the road graph, the trajectories and (optionally) stored road types.

    Args:
        file_path: Path to an HDF5 file with a ``graph`` group (datasets
            ``node_coordinates`` and ``edges``; ``road_type`` when road types are
            requested) and a ``trajectories`` group holding one sub-group per
            trajectory.
        edge_features: Container of requested edge feature names. Only the entry
            ``'road_type'`` changes what is returned.
        device: Torch device on which the tensors are created.

    Returns:
        ``(paths, nodes, edges, edge_coordinates)`` and, if ``'road_type'`` is in
        ``edge_features``, a fifth element with the content of ``graph/road_type``.

        * ``paths``: one dict per trajectory with the tensors stored under
          ``coordinates``, ``edge_idxs`` and ``edge_orientations`` (those present).
        * ``nodes``: list of ``(index, {'pos': tensor})`` tuples.
        * ``edges``: list of tuples, one per row of the ``edges`` dataset.
        * ``edge_coordinates``: ``node_coordinates`` indexed by the ``edges`` dataset.
    """
    wanted_attrs = ('coordinates', 'edge_idxs', 'edge_orientations')
    paths = []

    with h5py.File(file_path, 'r') as new_hf:
        graph = new_hf['graph']
        node_coordinates = torch.tensor(graph['node_coordinates'][:], dtype=torch.float, device=device)

        # Rescale the first two coordinate columns to [0, 1] when any coordinate
        # exceeds 1, i.e. when the stored values are not normalized yet.
        if node_coordinates.max() > 1:
            max_values = node_coordinates.max(0)[0]
            min_values = node_coordinates.min(0)[0]
            node_coordinates[:, 0] = (node_coordinates[:, 0] - min_values[0]) / (max_values[0] - min_values[0])
            node_coordinates[:, 1] = (node_coordinates[:, 1] - min_values[1]) / (max_values[1] - min_values[1])

        edge_array = graph['edges'][:]
        edge_coordinates = node_coordinates[edge_array]
        # clone().detach() copies each row without the UserWarning that
        # torch.tensor(<tensor>) emits for copy-construction.
        nodes = [(i, {'pos': pos.clone().detach()}) for i, pos in enumerate(node_coordinates)]
        edges = [tuple(edge) for edge in edge_array]

        for key in tqdm(new_hf['trajectories'].keys()):
            path_group = new_hf['trajectories'][key]
            paths.append({
                attr: torch.tensor(path_group[attr][()], device=device)
                for attr in path_group.keys()
                if attr in wanted_attrs
            })

        if 'road_type' in edge_features:
            # Returned exactly as read from the file; it is not converted to a torch
            # tensor or moved to `device`.
            # NOTE(review): presumably already one-hot encoded, as the name suggests -
            # confirm against the code that writes the file.
            onehot_encoded_road_type = graph['road_type'][:]
            return paths, nodes, edges, edge_coordinates, onehot_encoded_road_type

    return paths, nodes, edges, edge_coordinates

In [37]:
paths, nodes, edges, edge_coordinates = load_new_format('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/geolife_val.h5', [], 'cpu')

  nodes = [(i, {'pos': torch.tensor(pos, device=device)}) for i, pos in enumerate(node_coordinates)]
100%|██████████| 1472/1472 [00:01<00:00, 956.02it/s]


In [13]:
# NOTE(review): `lens` is not computed in this cell; the value printed is the mean of
# whatever path lengths an earlier cell left in `lens`.
print("Munich Train", sum(lens) / len(lens), sep="\n")

Munich Train
24.094827586206897


In [16]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Munich Val")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Munich Val
25.18881118881119


In [18]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Munich Test")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Munich Test
24.29268292682927


In [20]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Tdrive train")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Tdrive train
23.970171149144253


In [22]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Tdrive Val")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Tdrive Val
24.305555555555557


In [24]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Tdrive Test")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Tdrive Test
24.199170124481327


In [26]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Pneuma Train")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Pneuma Train
11.47207140296061


In [28]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Pneuma Val")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Pneuma Val
11.061397779229262


In [30]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Pneuma Test")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Pneuma Test
10.489171835890739


In [32]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Geolife Train")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Geolife Train
11.996086105675147


In [38]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Geolife Val")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Geolife Val
14.472826086956522


In [36]:
# NOTE(review): `paths` is not loaded in this cell, so the label below is only correct
# if the load cell was last run on the matching file.
print("Geolife Test")
lens = [trajectory['edge_idxs'].shape[0] for trajectory in paths]
print(sum(lens) / len(lens))

Geolife Test
11.43569731930777
