In [4]:
import h5py
import networkx as nx
from tqdm import tqdm

In [5]:
import torch
def load_new_format(file_path, edge_features, device):
        paths = []
        with h5py.File(file_path, 'r') as new_hf:
            node_coordinates = torch.tensor(new_hf['graph']['node_coordinates'][:], dtype=torch.float, device=device)
            # Normalize the coordinates to (0, 1) if any of the coordinates is larger than 1
            if node_coordinates.max() > 1:
                max_values = node_coordinates.max(0)[0]
                min_values = node_coordinates.min(0)[0]
                node_coordinates[:, 0] = (node_coordinates[:, 0] - min_values[0]) / (max_values[0] - min_values[0])
                node_coordinates[:, 1] = (node_coordinates[:, 1] - min_values[1]) / (max_values[1] - min_values[1])
            edges = new_hf['graph']['edges'][:]
            edge_coordinates = node_coordinates[edges]
            nodes = [(i, {'pos': torch.tensor(pos, device=device)}) for i, pos in enumerate(node_coordinates)]
            edges = [tuple(edge) for edge in edges]
            for i in tqdm(new_hf['trajectories'].keys()):
                path_group = new_hf['trajectories'][i]
                path = {attr: torch.tensor(path_group[attr][()], device=device) for attr in path_group.keys() if attr in ['coordinates', 'edge_idxs', 'edge_orientations']}
                paths.append(path)
            print(new_hf['graph'].keys())
            if 'road_type' in edge_features:
                if 'highway' in new_hf['graph']['edge_features'].keys():
                    road_type = new_hf['graph']['edge_features']['highway'][()]
                    road_type = [byte_string.decode('utf-8')[1:-1] for byte_string in road_type]
                    road_type_clean = []
                    for string in road_type:
                        # Split the string by comma, remove duplicates, and join back
                        cleaned_string = string.split(',')[0]
                        road_type_clean.append(cleaned_string)
                    unique_labels = list(set(road_type_clean))
                    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

                    # Convert string labels to integer labels using the mapping
                    integer_encoded = [label_to_index[label] for label in road_type_clean]

                    integer_encoded_tensor = torch.tensor(integer_encoded, dtype=torch.long, device=device)
                    num_classes = len(unique_labels)
                    onehot_encoded_road_type = torch.nn.functional.one_hot(integer_encoded_tensor, num_classes=num_classes)
                    return paths, nodes, edges, edge_coordinates, onehot_encoded_road_type
                return paths, nodes, edges, edge_coordinates

In [6]:
def train_val_test_split(file_path, train_ratio=0.85, val_ratio=0.05, test_ratio=0.1, save=False):
    import numpy as np
    print("Load Data...")
    paths, nodes, edges, edge_coordinates, road_type = load_new_format(file_path, ['road_type'], device='cpu')
    if 'tdrive' in file_path.lower():
        dataset = 'tdrive'
    elif 'geolife' in file_path.lower():
        dataset = 'geolife'
    elif 'pneuma' in file_path.lower():
        dataset = 'pneuma'
    elif 'munich' in file_path.lower():
        dataset = 'munich'
    else:
        raise ValueError('Unknown dataset')
    
    n = len(paths)
    print("Dataset: ", dataset)
    print("Total number of paths: ", n)
    train_size = int(n * train_ratio)
    val_size = int(n * val_ratio)
    test_size = n - train_size - val_size

    train_paths = paths[:train_size]
    val_paths = paths[train_size:train_size + val_size]
    test_paths = paths[train_size + val_size:]
    
    if save:
        # Save the data
        print("Saving the data...")
        output_folder = '/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/'
        for output_file_path, split_paths in zip([f'{output_folder}{dataset}_train.h5', f'{output_folder}{dataset}_val.h5', f'{output_folder}{dataset}_test.h5'], [train_paths, val_paths, test_paths]):
            with h5py.File(output_file_path, 'w') as f:
                # Save graph structure
                grp_graph = f.create_group('graph')
                grp_graph.create_dataset('node_coordinates', data=[list(pos['pos']) for _, pos in nodes])
                grp_graph.create_dataset('edges', data=np.array(edges))
                grp_graph.create_dataset('road_type', data=road_type)

                # Save the selected trajectories
                grp_trajectories = f.create_group('trajectories')
                for i, path in enumerate(split_paths):
                    grp = grp_trajectories.create_group(f'trajectory_{i}')
                    print(grp)
                    for key, value in path.items():
                        grp.create_dataset(key, data=value)
        print("Data saved!")
    
    else:
        return train_paths, val_paths, test_paths

In [10]:
train_val_test_split('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/pNEUMA_filtered.h5', save=True)

Load Data...


  nodes = [(i, {'pos': torch.tensor(pos, device=device)}) for i, pos in enumerate(node_coordinates)]
100%|██████████| 91874/91874 [00:43<00:00, 2105.78it/s]


<KeysViewHDF5 ['coordinate_scale', 'coordinate_shift', 'edge_features', 'edge_used_by_trajectory', 'edges', 'node_coordinates', 'node_features']>
Dataset:  pneuma
Total number of paths:  91874
Saving the data...
<HDF5 group "/trajectories/trajectory_0" (0 members)>
<HDF5 group "/trajectories/trajectory_1" (0 members)>
<HDF5 group "/trajectories/trajectory_2" (0 members)>
<HDF5 group "/trajectories/trajectory_3" (0 members)>
<HDF5 group "/trajectories/trajectory_4" (0 members)>
<HDF5 group "/trajectories/trajectory_5" (0 members)>
<HDF5 group "/trajectories/trajectory_6" (0 members)>
<HDF5 group "/trajectories/trajectory_7" (0 members)>
<HDF5 group "/trajectories/trajectory_8" (0 members)>
<HDF5 group "/trajectories/trajectory_9" (0 members)>
<HDF5 group "/trajectories/trajectory_10" (0 members)>
<HDF5 group "/trajectories/trajectory_11" (0 members)>
<HDF5 group "/trajectories/trajectory_12" (0 members)>
<HDF5 group "/trajectories/trajectory_13" (0 members)>
<HDF5 group "/trajectories/t

In [9]:
train_val_test_split('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/munich.h5', save=True)

Load Data...


100%|██████████| 2866/2866 [00:06<00:00, 432.29it/s]


Dataset:  munich
Total number of paths:  2866
Saving the data...
<HDF5 group "/trajectories/trajectory_0" (0 members)>
<HDF5 group "/trajectories/trajectory_1" (0 members)>
<HDF5 group "/trajectories/trajectory_2" (0 members)>
<HDF5 group "/trajectories/trajectory_3" (0 members)>
<HDF5 group "/trajectories/trajectory_4" (0 members)>
<HDF5 group "/trajectories/trajectory_5" (0 members)>
<HDF5 group "/trajectories/trajectory_6" (0 members)>
<HDF5 group "/trajectories/trajectory_7" (0 members)>
<HDF5 group "/trajectories/trajectory_8" (0 members)>
<HDF5 group "/trajectories/trajectory_9" (0 members)>
<HDF5 group "/trajectories/trajectory_10" (0 members)>
<HDF5 group "/trajectories/trajectory_11" (0 members)>
<HDF5 group "/trajectories/trajectory_12" (0 members)>
<HDF5 group "/trajectories/trajectory_13" (0 members)>
<HDF5 group "/trajectories/trajectory_14" (0 members)>
<HDF5 group "/trajectories/trajectory_15" (0 members)>
<HDF5 group "/trajectories/trajectory_16" (0 members)>
<HDF5 grou

In [6]:
# Define the train_val_test_regional_split function
def train_val_test_regional_split(file_path, val_coords: list, test_coords: list, save=False):
    import numpy as np
    print("Load Data...")
    paths, nodes, edges, edge_coordinates = load_new_format(file_path)
    if 'tdrive' in file_path.lower():
        dataset = 'tdrive'
    elif 'geolife' in file_path.lower():
        dataset = 'geolife'
    elif 'pneuma' in file_path.lower():
        dataset = 'pneuma'
    elif 'munich' in file_path.lower():
        dataset = 'munich'
    else:
        raise ValueError('Unknown dataset')
    
    # Get the indices of the paths that have 'coordinates' only in the val_coords range
    val_indices = [i for i, path in enumerate(paths) if all((path['coordinates'][:, 0] >= val_coords[0][0]) & (path['coordinates'][:, 0] <= val_coords[0][1]) & (path['coordinates'][:, 1] >= val_coords[1][0]) & (path['coordinates'][:, 1] <= val_coords[1][1]))]
    
    # Get the indices of the paths that have 'coordinates' only in the test_coords range
    test_indices = [i for i, path in enumerate(paths) if all((path['coordinates'][:, 0] >= test_coords[0][0]) & (path['coordinates'][:, 0] <= test_coords[0][1]) & (path['coordinates'][:, 1] >= test_coords[1][0]) & (path['coordinates'][:, 1] <= test_coords[1][1]))]
    
    # Get the validation paths
    val_paths = []
    for i in (val_indices):
        val_paths.append(paths[i])
    
    # Get the test paths
    test_paths = []
    for i in test_indices:
        test_paths.append(paths[i])
    
    # Get the train paths
    train_indices = [i for i in range(len(paths)) if i not in val_indices and i not in test_indices]
    train_paths = []
    for i in train_indices:
        train_paths.append(paths[i])
    
    # Print the ratios
    print("Dataset: ", dataset)
    print("Total number of paths: ", len(paths))
    print("Ratio of training paths: ", len(train_paths) / len(paths))
    print("Ratio of validation paths: ", len(val_paths) / len(paths))
    print("Ratio of testing paths: ", len(test_paths) / len(paths))
    import matplotlib.pyplot as plt

    # Create a figure and axis
    fig, ax = plt.subplots()

    # Plot the rectangle
    rectangle = plt.Rectangle((0, 0), 1, 1, edgecolor='black', facecolor='grey')
    ax.add_patch(rectangle)

    # Plot the val area
    val_rectangle = plt.Rectangle((val_coords[0][0], val_coords[1][0]), 
                                  val_coords[0][1] - val_coords[0][0], 
                                  val_coords[1][1] - val_coords[1][0], 
                                  edgecolor='red', facecolor='red')
    ax.add_patch(val_rectangle)
    plt.text((val_coords[0][0] + val_coords[0][1])/2, (val_coords[1][0] + val_coords[1][1])/2, 'Validation', color='white', ha='center', va='center')

    # Plot the test area
    test_rectangle = plt.Rectangle((test_coords[0][0], test_coords[1][0]), 
                                   test_coords[0][1] - test_coords[0][0], 
                                   test_coords[1][1] - test_coords[1][0], 
                                   edgecolor='blue', facecolor='blue')
    ax.add_patch(test_rectangle)
    plt.text((test_coords[0][0] + test_coords[0][1])/2, (test_coords[1][0] + test_coords[1][1])/2, 'Test', color='White', ha='center', va='center')


    # Set the aspect ratio to equal
    ax.set_aspect('equal')

    # Set the x and y limits
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    # Show the plot
    plt.show()
    
    if save:
        # Save the data
        print("Saving the data...")
        output_folder = '/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/'
        for output_file_path, split_paths in zip([f'{output_folder}{dataset}_coordinate_split_train.h5', 
                                                f'{output_folder}{dataset}_coordinate_split_x_{val_coords[0][0]}_{val_coords[0][1]}_y_{val_coords[1][0]}_{val_coords[1][1]}_val.h5', 
                                                f'{output_folder}{dataset}_coordinate_split_x_{test_coords[0][0]}_{test_coords[0][1]}_y_{test_coords[1][0]}_{test_coords[1][1]}_test.h5'], 
                                                [train_paths, val_paths, test_paths]):
            with h5py.File(output_file_path, 'w') as f:
                # Save graph structure
                grp_graph = f.create_group('graph')
                grp_graph.create_dataset('node_coordinates', data=[list(pos['pos']) for _, pos in nodes])
                grp_graph.create_dataset('edges', data=np.array(edges))

                # Save the selected trajectories
                grp_trajectories = f.create_group('trajectories')
                for i, path in enumerate(split_paths):
                    grp = grp_trajectories.create_group(f'trajectory_{i}')
                    print(grp)
                    for key, value in path.items():
                        grp.create_dataset(key, data=value)
        print("Data saved!")
                        
    else:
        return train_paths, val_paths, test_paths, nodes, edges, edge_coordinates

In [7]:
train_val_test_regional_split('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/pNEUMA_filtered.h5', [[0.1, 0.25], [0.4, 0.55]], [[0.6, 1.0], [0.0, 0.4]], save=True)

Load Data...


100%|██████████| 91874/91874 [01:38<00:00, 937.44it/s] 


KeyError: 'coordinates'

In [None]:
train_val_test_regional_split('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/geolife.h5', [[0.1, 0.25], [0.4, 0.55]], [[0.6, 1.0], [0.0, 0.4]], save=True)

In [None]:
train_val_test_regional_split('/ceph/hdd/students/schmitj/MA_Diffusion_based_trajectory_prediction/data/munich.h5', [[0.1, 0.25], [0.4, 0.55]], [[0.6, 1.0], [0.0, 0.4]], save=True)