In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from read_data import read_data

original_data_path = './data/' # this is my path, change it to yours
# separated_data_path = 

In [8]:
# # Read data
# def read_data(setname):
#     data = pd.read_hdf(original_data_path+"train"+setname+'.zarr', key='data') # fill in your path and file name
#     return data[['case_id','time','x_leader','x_follower','v_leader','v_follower']]
from read_data import read_data


def read_data_wrapper(setname, type):
    # data = pd.read_hdf(
    #     f"{original_data_path}/{type}{setname}.zarr"
    # )  # fill in your path and file name
    data = read_data(setname, type)
    return data[["case_id", "time", "x_leader", "x_follower", "v_leader", "v_follower"]]
# data_HA = read_data('HA') # I guess this is HA_train or something in your PC
# data_HH = read_data('HH')

data_HA = read_data_wrapper("HA", "train")


In [9]:
data_HA.head()

Unnamed: 0,case_id,time,x_leader,x_follower,v_leader,v_follower
0,0,0.0,0.0,-9.428864,2.629775,0.784441
1,0,0.1,0.263154,-9.341833,2.66553,0.810204
2,0,0.2,0.535099,-9.265412,2.776808,0.828016
3,0,0.3,0.82892,-9.107064,2.966034,0.889182
4,0,0.4,1.147062,-8.933068,3.168645,0.979196


In [10]:
# Segment data to make 30 timesteps input and 10 timesteps output
def segment_data(data):
    data = data.copy()
    data['delta_velocity'] = data['v_follower'] - data['v_leader']
    data['delta_position'] = data['x_leader'] - data['x_follower']
    data = data.sort_values(by=['case_id','time']).set_index('case_id')
    features = []
    labels = []
    idx = 0
    for case_id in tqdm(data.index.unique()):
        df = data.loc[case_id]
        future_idx_end = np.arange(40,len(df),40) # This line creates samples without overlapping, do that if the data amount is not enough or as you wish
        # future_idx_end = np.concatenate((future_idx_end, future_idx_end[1:]-15)) # make 10 timesteps overlapping, of course running time will also double
        future_idx_start = future_idx_end - 10
        history_idx_end = future_idx_start
        history_idx_start = history_idx_end - 30
        for hstart,hend,fstart,fend in zip(history_idx_start,history_idx_end,future_idx_start,future_idx_end):
            feature = df.iloc[hstart:hend][['time','delta_velocity','delta_position','v_follower']].copy()
            feature['sample_id'] = idx
            label = df.iloc[fstart:fend][['time','v_follower']].copy()
            label['sample_id'] = idx
            features.append(feature)
            labels.append(label)
            idx += 1
    features = pd.concat(features).reset_index()
    # Standardize features
    scaler = StandardScaler()
    features[['delta_velocity','delta_position','v_follower']] = scaler.fit_transform(features[['delta_velocity','delta_position','v_follower']])
    # But do not standardize labels
    labels = pd.concat(labels).reset_index()
    return features, labels

features_HA, labels_HA = segment_data(data_HA.loc[data_HA['case_id']<(1e5+500)])
print('number of samples in HA:', labels_HA['sample_id'].nunique())
# features_HA.to_hdf(separated_data_path+'features_HA.h5', key='features') # or save in other format you are familiar with
# labels_HA.to_hdf(separated_data_path+'labels_HA.h5', key='labels')
# features_HH, labels_HH = segment_data(data_HH)
# print('number of samples in HH:', labels_HH['sample_id'].nunique())
# features_HH.to_hdf(separated_data_path+'features_HH.h5', key='features')
# labels_HH.to_hdf(separated_data_path+'labels_HH.h5', key='labels')

100%|██████████| 26394/26394 [01:17<00:00, 339.41it/s]


number of samples in HA: 159369


In [None]:
# Read features and labels in local
features_HA = pd.read_hdf(separated_data_path+'features_HA.h5', key='features')
labels_HA = pd.read_hdf(separated_data_path+'labels_HA.h5', key='labels')
features_HH = pd.read_hdf(separated_data_path+'features_HH.h5', key='features')
labels_HH = pd.read_hdf(separated_data_path+'labels_HH.h5', key='labels')

In [14]:
def data_split(features,labels):
    # Split data into training, validation, test set by idx
    # make sure the random choice of features and labels are the same!
    all_indices_HA = labels['sample_id'].unique()
    train_indices_HA = np.random.choice(all_indices_HA, size=int(0.7*len(all_indices_HA)), replace=False)
    test_indices_HA = np.random.choice(np.setdiff1d(all_indices_HA,train_indices_HA), size=int(0.3*len(all_indices_HA)), replace=False)
    # val_set you can apply the previous code to val_HA that is already existing
    train_features_HA = features[features['sample_id'].isin(train_indices_HA)]
    train_labels_HA = labels[labels['sample_id'].isin(train_indices_HA)]
    test_features_HA = features[features['sample_id'].isin(test_indices_HA)]
    test_labels_HA = labels[labels['sample_id'].isin(test_indices_HA)]

    return train_features_HA, train_labels_HA, test_features_HA, test_labels_HA

train_features_HA, train_labels_HA, test_features_HA, test_labels_HA = data_split(features_HA,labels_HA)
# the same for HH
# ...

In [15]:
# Create dataloader function
class CreateDataset:
    def __init__(self, features, labels):
        self.idx_list = labels['sample_id'].unique()
        self.labels = labels.sort_values(['sample_id','time']).set_index('sample_id')
        self.features = features.sort_values(['sample_id','time']).set_index('sample_id')        

    def __len__(self):        
        return len(self.idx_list)

    def __getitem__(self, idx):
        # idx is the index of items in the data and labels
        sample_id = self.idx_list[idx]
        history = self.features.loc[sample_id][['delta_velocity','delta_position','v_follower']].values
        history = torch.from_numpy(history).float()
        future = self.labels.loc[sample_id]['v_follower'].values
        future = torch.from_numpy(future).float()
        return history, future
    
# Create dataloader to be used
train_dataloader_HA = DataLoader(CreateDataset(train_features_HA, train_labels_HA), batch_size=64, shuffle=False) # batch_size can also be 128
# ... the same for others



In [16]:
# Test if the dataloader works
history, future = next(iter(train_dataloader_HA))
print(f"Feature batch shape: {history.size()}")
print(f"Labels batch shape: {future.size()}")


Feature batch shape: torch.Size([64, 30, 3])
Labels batch shape: torch.Size([64, 10])


In [17]:
history

tensor([[[-1.5460, -0.8946, -1.7200],
         [-1.5539, -0.8764, -1.7146],
         [-1.6281, -0.8562, -1.7108],
         ...,
         [-1.2533, -0.5123, -0.7601],
         [-1.0633, -0.5150, -0.6872],
         [-0.8279, -0.5253, -0.6057]],

        [[-0.4961, -0.5692, -0.3384],
         [-0.6173, -0.5619, -0.3544],
         [-0.7183, -0.5502, -0.3641],
         ...,
         [-1.8279, -0.3294, -0.2099],
         [-1.8093, -0.3104, -0.1864],
         [-1.7922, -0.2925, -0.1580]],

        [[ 0.2521, -0.2318,  0.5332],
         [ 0.2888, -0.2482,  0.5483],
         [ 0.2394, -0.2512,  0.5467],
         ...,
         [ 0.4438, -0.2793,  0.5700],
         [ 0.5182, -0.3038,  0.5849],
         [ 0.5689, -0.3327,  0.5922]],

        ...,

        [[-0.1149, -1.1914, -1.8954],
         [-0.1117, -1.1853, -1.8945],
         [-0.0882, -1.1851, -1.8883],
         ...,
         [-0.0885, -1.1987, -1.8883],
         [-0.0877, -1.1996, -1.8881],
         [-0.1005, -1.1951, -1.8915]],

        [[