# Consolidating data

This file consists of functions that consolidate our disparate datasets into one large dataset that is useful in training our model. 

The goal is to generate a file with 30 columns (this number should be variable), such that each column is a state in time. 

Ideally, this will be done with heirachical data, ie `p1` is the first point in time, and within `p1` you have an x component, y component, etc.

https://pandas.pydata.org/docs/user_guide/advanced.html

## Input data format

It is assumed that the input data with have the columns: `[timestamp,tx,ty,tz,qx,qy,qz,qw]`

## Extracting the data we want

This function will create velocity and acceleration columns.

In [7]:
import numpy as np
import pandas as pd

def extract_features(raw: pd.DataFrame, dropna: bool = False) -> None:
    raw['vx'] = raw['tx'].diff() / raw['timestamp'].diff()
    raw['vy'] = raw['ty'].diff() / raw['timestamp'].diff()
    raw['vz'] = raw['tz'].diff() / raw['timestamp'].diff()

    raw['ax'] = raw['vx'].diff() / raw['timestamp'].diff()
    raw['ay'] = raw['vy'].diff() / raw['timestamp'].diff()
    raw['az'] = raw['vz'].diff() / raw['timestamp'].diff()

    if dropna: raw.dropna(inplace=True)

# test the above functions

# df = pd.read_csv("../data/fpv_uzh/indoor_forward_3_davis_with_gt.txt")
# extract_features(df, dropna=True)

# print(df.head())
# print(df['timestamp'])

# trial = df[0:4] 
# trial.head()

# temp = np.DataFrame.arange(4)
# print(temp)
# trial.loc[:, "trajNum"] = np.arange(len(trial))
# trial.loc[:, "slice"] = 1
# trial.head()

# trial.set_index(['slice', 'trajNum'], inplace=True)
# trial.head()

## Slicing the data

Now, we want rows of data that represent a specific range of time. In this case, we want 30 points for each new row.

In [8]:
def generate_slices(data: pd.DataFrame, n: int) -> pd.DataFrame:
    # each row in the original data is a "point". Each row in the output 
    # is a list of points of size n. 
    cols = [f"{col}_{i}" for i in range(n) for col in data.columns]
    slices = []
    for i in range(len(data) - n):
        flattened = pd.DataFrame([data[i:i+n].to_numpy().flatten()])
        flattened.columns = cols
        slices.append(flattened)
    return pd.concat(slices, ignore_index=False)

# test the above function for 4 pints in each row

# slices = generate_slices(df, 4)
# # print(slices.head())
# slices.head()

In [9]:
def multiIndex(data: pd.DataFrame, n: int, filename: str) -> pd.DataFrame:
    # each row in the original data is a "point". Each row in the output 
    # is a list of points of size n. 
    slices = []
    # i is the number of slices we want
    for i in range(len(data) - n):
        trial = data.copy()[i:i+n]
        trial.loc[:, "trajNum"] = np.arange(len(trial))
        trial.loc[:, "slice"] = filename + ":" + str(i)
        trial.set_index(['slice', 'trajNum'], inplace=True)
        slices.append(trial)
    return pd.concat(slices, ignore_index=False)

# test
# slices = multiIndex(df, 30, 0)
# # print(slices.head())
# print(slices.head(40))
# type(slices)

## Consolidate all our original data

Now, we want to consolidate our data from all the other sources.

In [10]:
import os
from datasets import load_dataset

ds = load_dataset("riotu-lab/Synthetic-UAV-Flight-Trajectories")
print(ds)

n = 30
is_multi_index = True
slices = []
output_path = "../data/output"
if not os.path.exists(output_path):
    os.makedirs(output_path)



consolidated = pd.concat(slices, ignore_index=False)
consolidated.to_csv(os.path.join(output_path, "consolidated.csv"))

Resolving data files:   0%|          | 0/5093 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['timestamp', 'tx', 'ty', 'tz'],
        num_rows: 766140
    })
})


ValueError: No objects to concatenate