# Consolidating data

This file consists of functions that consolidate our disparate datasets into one large dataset that is useful in training our model. 

The goal is to generate a file with 30 columns (this number should be variable), such that each column is a state in time. 

Ideally, this will be done with heirachical data, ie `p1` is the first point in time, and within `p1` you have an x component, y component, etc.

https://pandas.pydata.org/docs/user_guide/advanced.html

## Input data format

It is assumed that the input data with have the columns: `[timestamp,tx,ty,tz,qx,qy,qz,qw]`

## Extracting the data we want

This function will create velocity and acceleration columns.

In [18]:
import numpy as np
import pandas as pd

def extract_features(raw: pd.DataFrame) -> None:
    raw['vx'] = raw['tx'].diff() / raw['timestamp'].diff()
    raw['vy'] = raw['ty'].diff() / raw['timestamp'].diff()
    raw['vz'] = raw['tz'].diff() / raw['timestamp'].diff()

    raw['ax'] = raw['vx'].diff() / raw['timestamp'].diff()
    raw['ay'] = raw['vy'].diff() / raw['timestamp'].diff()
    raw['az'] = raw['vz'].diff() / raw['timestamp'].diff()

In [19]:
# test the above functions

df = pd.read_csv("../data/fpv_uzh/indoor_forward_3_davis_with_gt.txt")

extract_features(df)

print(df.head())
print(df['timestamp'])

      timestamp        tx        ty        tz        qx        qy        qz  \
0  1.540820e+09  7.605101  0.240696 -0.754351 -0.269284 -0.661755  0.641923   
1  1.540820e+09  7.604394  0.241403 -0.754263 -0.269227 -0.661660  0.641935   
2  1.540820e+09  7.603792  0.242395 -0.753890 -0.269081 -0.661483  0.642058   
3  1.540820e+09  7.603930  0.243608 -0.753434 -0.269001 -0.661389  0.642086   
4  1.540820e+09  7.604787  0.244973 -0.752511 -0.268968 -0.661503  0.641952   

         qw        vx        vy        vz        ax        ay        az  
0  0.278390       NaN       NaN       NaN       NaN       NaN       NaN  
1  0.278642 -0.010881  0.010879  0.001350       NaN       NaN       NaN  
2  0.278923 -0.006013  0.009916  0.003732  0.048684 -0.009624  0.023827  
3  0.279157  0.001377  0.012131  0.004557  0.073898  0.022147  0.008241  
4  0.279228  0.008571  0.013648  0.009230  0.071941  0.015168  0.046731  
0      1.540820e+09
1      1.540820e+09
2      1.540820e+09
3      1.540820e+09
4

## Slicing the data

Now, we want rows of data that represent a specific range of time. In this case, we want 30 points for each new row.

In [20]:
def generate_slices(data: pd.DataFrame, n: int) -> pd.DataFrame:
    # each row in the original data is a "point". Each row in the output 
    # is a list of points of size n. 
    cols = [f"{col}_{i}" for i in range(n) for col in data.columns]
    out = pd.DataFrame(columns=cols)
    for i in range(len(data) - n):
        flattened = pd.DataFrame([data[i:i+n].to_numpy().flatten()])
        flattened.columns = cols
        out = pd.concat([out if not out.empty else None, flattened], ignore_index=False)
    return out

In [21]:
# test the above function

slices = generate_slices(df, 4)
print(slices.head())

    timestamp_0      tx_0      ty_0      tz_0      qx_0      qy_0      qz_0  \
0  1.540820e+09  7.605101  0.240696 -0.754351 -0.269284 -0.661755  0.641923   
0  1.540820e+09  7.604394  0.241403 -0.754263 -0.269227 -0.661660  0.641935   
0  1.540820e+09  7.603792  0.242395 -0.753890 -0.269081 -0.661483  0.642058   
0  1.540820e+09  7.603930  0.243608 -0.753434 -0.269001 -0.661389  0.642086   
0  1.540820e+09  7.604787  0.244973 -0.752511 -0.268968 -0.661503  0.641952   

       qw_0      vx_0      vy_0  ...      qx_3      qy_3      qz_3      qw_3  \
0  0.278390       NaN       NaN  ... -0.269001 -0.661389  0.642086  0.279157   
0  0.278642 -0.010881  0.010879  ... -0.268968 -0.661503  0.641952  0.279228   
0  0.278923 -0.006013  0.009916  ... -0.269007 -0.661649  0.641821  0.279146   
0  0.279157  0.001377  0.012131  ... -0.268870 -0.661571  0.641922  0.279229   
0  0.279228  0.008571  0.013648  ... -0.269057 -0.662028  0.641452  0.279046   

       vx_3      vy_3      vz_3      ax_3   

## Consolidate all our original data

Now, we want to consolidate our data from all the other sources.

In [22]:
import os

fpv_data = "../data/fpv_uzh"
random_traj_data = "../data/random_trajectory_100ms"
output_path = "../data/output"
n = 30 # we want 30 points per row
slices = []

# consolidate the fpv data
for filename in filter(lambda p: p.endswith("txt"), os.listdir(fpv_data)):
    filepath = os.path.join(fpv_data, filename)
    df = pd.read_csv(filepath)
    
    extract_features(df)
    # the columns should be handled in the generate slices funciton
    slices.append(generate_slices(df, n))
    
# consolidate the synthetic data
for filename in filter(lambda p: p.endswith("txt"), os.listdir(random_traj_data)):
    filepath = os.path.join(random_traj_data, filename)
    df = pd.read_csv(filepath)
    
    extract_features(df)
    # the columns should be handled in the generate slices funciton
    slices.append(generate_slices(df, n))

consolidated = pd.concat(slices, ignore_index=False)
consolidated.to_csv(os.path.join(output_path, "consolidated.csv"))