# Consolidating data

This file consists of functions that consolidate our disparate datasets into one large dataset that is useful in training our model. 

The goal is to generate a file with 30 columns (this number should be variable), such that each column is a state in time. 

Ideally, this will be done with hierarchical data, i.e., `p1` is the first point in time, and within `p1` you have an x component, y component, etc.

https://pandas.pydata.org/docs/user_guide/advanced.html

## Input data format

It is assumed that the input data will have the columns: `[timestamp,tx,ty,tz,qx,qy,qz,qw]`

## Extracting the data we want

In our case, we want just the velocity data (for now).

In [None]:
import numpy as np
import pandas as pd

def generate_velocity(position_data: pd.DataFrame) -> pd.DataFrame:
    """Derive per-axis velocities from consecutive position samples.

    Each velocity component is the row-to-row difference of the matching
    position column (`tx`, `ty`, `tz`) divided by the row-to-row difference
    of `timestamp`.

    Args:
        position_data: Frame containing at least the columns `timestamp`,
            `tx`, `ty` and `tz`.

    Returns:
        A frame with columns `timestamp`, `vx`, `vy`, `vz`. Rows containing
        NaN (such as the first row, which has no predecessor to difference
        against) are dropped and the index is renumbered from 0.
    """
    stamps = position_data['timestamp']
    elapsed = stamps.diff()

    columns = {'timestamp': stamps}
    for axis in ('x', 'y', 'z'):
        columns[f'v{axis}'] = position_data[f't{axis}'].diff() / elapsed

    velocity_frame = pd.DataFrame(columns)
    velocity_frame = velocity_frame.dropna()
    return velocity_frame.reset_index(drop=True)

## Slicing the data

Now, we want rows of data that represent a specific range of time. In this case, we want 30 points for each new row.

In [None]:
def generate_slices(data: pd.DataFrame, n: int) -> pd.DataFrame:
    """Flatten every run of `n` consecutive rows of `data` into one row.

    Each row of `data` is a "point". Output row `k` holds points
    `k .. k + n - 1` laid out side by side, so the output columns are named
    `{col}_{i}`, where `col` is an input column name and `i` (0 .. n-1) is
    the position of the point inside the window.

    Args:
        data: Frame whose rows are the points to be windowed.
        n: Number of consecutive points per output row; must be >= 1.

    Returns:
        A frame with `len(data) - n + 1` rows and `n * len(data.columns)`
        columns, indexed 0, 1, 2, ... If `data` has fewer than `n` rows the
        result is empty but still carries the expected column names.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    cols = [f"{col}_{i}" for i in range(n) for col in data.columns]

    # Convert once up front instead of once per window.
    values = data.to_numpy()

    # A sequence of length L contains L - n + 1 windows of size n; the "+ 1"
    # keeps the final window (and the single window when L == n).
    window_count = len(values) - n + 1

    # Row-major flatten puts point 0's columns first, then point 1's, ...,
    # which matches the ordering of `cols`.
    rows = [values[start:start + n].flatten() for start in range(window_count)]

    # Build the frame in one go rather than growing it with pd.concat inside
    # the loop, which re-copies the accumulated rows on every iteration.
    return pd.DataFrame(rows, columns=cols)

## Consolidate all our original data

Now, we want to consolidate our data from all the other sources.

In [None]:
import os

fpv_data = "../data/fpv_uzh"
random_traj_data = "../data/random_trajectory_100ms"
output_path = "../data/output"
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(output_path, exist_ok=True)

n = 30  # we want 30 points per row
slices = []

# The fpv data and the synthetic data go through exactly the same pipeline
# (read positions -> velocities -> n-point windows), so one loop handles both.
for source_dir in (fpv_data, random_traj_data):
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the consolidated file reproducible between runs.
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith("txt"):
            continue
        filepath = os.path.join(source_dir, filename)
        pos_df = pd.read_csv(filepath)

        vel_df = generate_velocity(pos_df)
        # column naming for the windows is handled inside generate_slices
        slices.append(generate_slices(vel_df, n))

# ignore_index=True gives every consolidated row a unique label instead of
# repeating the per-file labels in the CSV's index column.
consolidated = pd.concat(slices, ignore_index=True)
consolidated.to_csv(os.path.join(output_path, "consolidated.csv"))