In [11]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

original_data_path = './CFdata/' # this is my path, change it to yours
separated_data_path = '../super_data/'

In [7]:
from read_data import read_data


def read_data_wrapper(setname, type):
    # data = pd.read_hdf(
    #     f"{original_data_path}/{type}{setname}.zarr"
    # )  # fill in your path and file name
    data = read_data(setname, type)
    return data[["case_id", "time", "x_leader", "x_follower", "v_leader", "v_follower"]]





In [8]:
# Segment data to make 30 timesteps input and 10 timesteps output
def segment_data(data):
    data = data.copy()
    data['delta_velocity'] = data['v_follower'] - data['v_leader']
    data['delta_position'] = data['x_leader'] - data['x_follower']
    data = data.sort_values(by=['case_id','time']).set_index('case_id')
    features = []
    labels = []
    idx = 0
    for case_id in tqdm(data.index.unique()):
        df = data.loc[case_id]
        future_idx_end = np.arange(40,len(df),40) # This line creates samples without overlapping, do that if the data amount is not enough or as you wish
        # future_idx_end = np.concatenate((future_idx_end, future_idx_end[1:]-15)) # make 10 timesteps overlapping, of course running time will also double
        future_idx_start = future_idx_end - 10
        history_idx_end = future_idx_start
        history_idx_start = history_idx_end - 30
        for hstart,hend,fstart,fend in zip(history_idx_start,history_idx_end,future_idx_start,future_idx_end):
            feature = df.iloc[hstart:hend][['time','delta_velocity','delta_position','v_follower']].copy()
            feature['sample_id'] = idx
            label = df.iloc[fstart:fend][['time','v_follower']].copy()
            label['sample_id'] = idx
            features.append(feature)
            labels.append(label)
            idx += 1
    features = pd.concat(features).reset_index()
    # Standardize features
    scaler = StandardScaler()
    features[['delta_velocity','delta_position','v_follower']] = scaler.fit_transform(features[['delta_velocity','delta_position','v_follower']])
    # But do not standardize labels
    labels = pd.concat(labels).reset_index()
    return features, labels




100%|██████████| 26394/26394 [01:43<00:00, 254.87it/s]


number of samples in HA: 159369


In [13]:
## HA

data_HA = read_data_wrapper(
    "HA", "train"
)  # I guess this is HA_train or something in your PC
features_HA, labels_HA = segment_data(data_HA.loc[data_HA["case_id"] < (1e5 + 500)])
print("number of samples in HA:", labels_HA["sample_id"].nunique())
features_HA.to_hdf(
    separated_data_path + "features_HA.h5", key="features"
)  # or save in other format you are familiar with

## HH

data_HH = read_data_wrapper("HH", "train")
labels_HA.to_hdf(separated_data_path + "labels_HA.h5", key="labels")
features_HH, labels_HH = segment_data(data_HH)
print("number of samples in HH:", labels_HH["sample_id"].nunique())
features_HH.to_hdf(separated_data_path + "features_HH.h5", key="features")
labels_HH.to_hdf(separated_data_path + "labels_HH.h5", key="labels")

100%|██████████| 26394/26394 [01:45<00:00, 251.19it/s]


number of samples in HA: 159369


100%|██████████| 38683/38683 [01:49<00:00, 351.73it/s]


number of samples in HH: 166365


In [9]:
# Read features and labels in local
features_HA = pd.read_hdf(separated_data_path+'features_HA.h5', key='features')
labels_HA = pd.read_hdf(separated_data_path+'labels_HA.h5', key='labels')
features_HH = pd.read_hdf(separated_data_path+'features_HH.h5', key='features')
labels_HH = pd.read_hdf(separated_data_path+'labels_HH.h5', key='labels')

FileNotFoundError: File ../separated/features_HA.h5 does not exist

In [None]:
# Split data into training, validation, test set by idx
# make sure the random choice of features and labels are the same!
all_indices_HA = labels_HA['sample_id'].unique()
train_indices_HA = np.random.choice(all_indices_HA, size=int(0.7*len(all_indices_HA)), replace=False)
test_indices_HA = np.random.choice(np.setdiff1d(all_indices_HA,train_indices_HA), size=int(0.3*len(all_indices_HA)), replace=False)
# val_set you can apply the previous code to val_HA that is already existing
train_features_HA = features_HA[features_HA['sample_id'].isin(train_indices_HA)]
train_labels_HA = labels_HA[labels_HA['sample_id'].isin(train_indices_HA)]
test_features_HA = features_HA[features_HA['sample_id'].isin(test_indices_HA)]
test_labels_HA = labels_HA[labels_HA['sample_id'].isin(test_indices_HA)]
# the same for HH
# ...

In [None]:
# Create dataloader function
class CreateDataset:
    def __init__(self, features, labels):
        self.idx_list = labels['sample_id'].unique()
        self.labels = labels.sort_values(['sample_id','time']).set_index('sample_id')
        self.features = features.sort_values(['sample_id','time']).set_index('sample_id')        

    def __len__(self):        
        return len(self.idx_list)

    def __getitem__(self, idx):
        # idx is the index of items in the data and labels
        sample_id = self.idx_list[idx]
        history = self.features.loc[sample_id][['delta_velocity','delta_position','v_follower']].values
        history = torch.from_numpy(history).float()
        future = self.labels.loc[sample_id]['v_follower'].values
        future = torch.from_numpy(future).float()
        return history, future
    
# Create dataloader to be used
train_dataloader_HA = DataLoader(CreateDataset(train_features_HA, train_labels_HA), batch_size=64, shuffle=False) # batch_size can also be 128
# ... the same for others



In [None]:
# Test if the dataloader works
history, future = next(iter(train_dataloader_HA))
print(f"Feature batch shape: {history.size()}")
print(f"Labels batch shape: {future.size()}")
