# Data preprocessing

In [1]:
import numpy as np
import pandas as pd
import os

def get_n_element_slices(dataframe: pd.DataFrame, n: int) -> list[pd.DataFrame]:
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")
    if not isinstance(n, int) or n <= 0:
        raise ValueError("n must be a positive integer.")
    if n > len(dataframe):
        return [] # No possible slices of length n if n is larger than the DataFrame

    slices = []
    for i in range(len(dataframe) - n + 1):
        slice_df = dataframe.iloc[i : i + n].copy()
        regularize_timestamp(slice_df)
        slices.append(slice_df)
    return slices

# reset start to zero
# TODO: 
def regularize_timestamp(df: pd.DataFrame):
    start = df.iloc[0, 0]
    df["timestamp"] = df["timestamp"] - start


all_slices = {}

data_path = "../data/clean"
for filename in os.listdir(data_path):
    if filename.endswith(".csv"):
        df = pd.read_csv(os.path.join(data_path, filename), 
                         sep=' ',
                         comment="#", 
                         header=None,
                         names=["timestamp", "tx", "ty", "tz", "qx", "qy", "qz", "qw"])

        slices = get_n_element_slices(df, 200)
        all_slices[filename] = slices


In [2]:
total_slices = 0
for name, slices in all_slices.items():
    curr_slices =len(slices) 
    total_slices += curr_slices

print("Total slices: ", total_slices)
# Prevent data leakage by spliting by path
test_slices = 0
test_set = {}
val_slices = 0
val_set = {}
train_slices = 0
train_set = {}
for name, slices in all_slices.items():
    if test_slices / total_slices < 0.15:
        test_slices += len(slices)
        test_set[name] = slices
    elif val_slices / total_slices < 0.15:
        val_slices += len(slices)
        val_set[name] = slices
    else:
        train_slices += len(slices)
        train_set[name] = slices

print("Train slices: ", round(train_slices / total_slices, 2))
for name, slices in train_set.items():
    print(name, len(slices))
print("Val slices: ", round(val_slices / total_slices, 2))
for name, slices in val_set.items():
    print(name, len(slices))
print("Test slices: ", round(test_slices / total_slices, 2))
for name, slices in test_set.items():
    print(name, len(slices))

Total slices:  306316
Train slices:  0.68
indoor_forward_6_snapdragon_with_gt.csv 14901
indoor_forward_5_snapdragon_with_gt.csv 9451
indoor_forward_7_snapdragon_with_gt.csv 33151
indoor_45_12_snapdragon_with_gt.csv 19951
indoor_forward_9_snapdragon_with_gt.csv 14201
indoor_45_9_snapdragon_with_gt.csv 10501
indoor_forward_10_snapdragon_with_gt.csv 14751
outdoor_forward_1_snapdragon_with_gt.csv 21101
indoor_45_14_snapdragon_with_gt.csv 18301
indoor_45_2_snapdragon_with_gt.csv 24501
outdoor_forward_5_snapdragon_with_gt.csv 8601
indoor_45_13_snapdragon_with_gt.csv 18501
Val slices:  0.15
indoor_45_4_snapdragon_with_gt.csv 21401
indoor_forward_3_snapdragon_with_gt.csv 24551
Test slices:  0.17
outdoor_45_1_snapdragon_with_gt.csv 9801
outdoor_forward_3_snapdragon_with_gt.csv 42651


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TrajectoryDataset(Dataset):
    def __init__(self, slices):
        # print(slices)
        self.data = torch.tensor(np.array(slices))
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # X, y
        return self.data[idx, 0:len(self.data)-1], self.data[idx, len(self.data)-1]

train_loader = DataLoader(TrajectoryDataset([x for lst in train_set.values() for x in lst]), batch_size=32, shuffle=True)
val_loader = DataLoader(TrajectoryDataset([x for lst in val_set.values() for x in lst]), batch_size=32, shuffle=False)
test_loader = DataLoader(TrajectoryDataset([x for lst in test_set.values() for x in lst]), batch_size=32, shuffle=False)