# Preprocessing Overview
This notebook preprocesses the provided raw recordings and turns them into standardized, synchronized tables.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.integrate import cumtrapz
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Root folder of the published data; the glob below looks for .txt files one folder level beneath it.
BASE_DIR = Path('..', 'input', 'data_publish_v2', 'data_publish_v2')

# One row per .txt file, annotated with what can be read off its location and name.
txt_paths = [txt_path for txt_path in BASE_DIR.glob('*/*.txt')]
all_files_df = pd.DataFrame({'path': txt_paths}).assign(
    # experiment code: name of the folder that contains the file
    exp_code=lambda df: df['path'].map(lambda p: p.parent.stem),
    # activity: everything after the first underscore of the experiment code
    activity=lambda df: df['exp_code'].map(lambda code: code.partition('_')[2]),
    # person: everything before the first underscore of the experiment code
    person=lambda df: df['exp_code'].map(lambda code: code.partition('_')[0]),
    # data source: file name without its extension
    data_src=lambda df: df['path'].map(lambda p: p.stem),
)
all_files_df.sample(5)

In [None]:
# One row per (activity, person) pair and one column per data source; each cell holds the
# path of the matching file ('first' keeps the first path when several files match).
paths_by_source = all_files_df.pivot_table(
    index=['activity', 'person'],
    columns='data_src',
    values='path',
    aggfunc='first',
)
# dropna(axis=1) discards every data source that is missing for at least one row
data_df = paths_by_source.reset_index().dropna(axis=1)
data_df.head(5)

# Generate the Data
Here we run the [gen_dataset script](https://github.com/higerra/ridi_imu/blob/master/python/gen_dataset.py) on all of the files in the project.

In [None]:
!pip install -qq numpy-quaternion

In [None]:
import sys
import os
import subprocess
import numpy as np
import scipy.interpolate
import quaternion
import quaternion.quaternion_time_series
import pandas
def interpolate_quaternion_linear(quat_data, input_timestamp, output_timestamp):
    """
    Interpolate a quaternion time series onto new time stamps using slerp.

    The input samples are traversed with a single cursor that only moves forward, so both
    time stamp arrays are expected to be sorted in ascending order.

    Args:
        quat_data: Nx4 array containing N quaternions; each row is unpacked into quaternion.quaternion.
        input_timestamp: N-sized array containing the time stamp of each input quaternion.
        output_timestamp: M-sized array containing output time stamps.
    Return:
        quat_inter: Mx4 array containing M quaternions.
    Raises:
        ValueError: if output time stamps are requested but fewer than two input samples are
            given, or if an output time stamp lies outside
            [input_timestamp[0], input_timestamp[-1]].
    """
    n_input = quat_data.shape[0]
    assert input_timestamp.shape[0] == n_input
    assert quat_data.shape[1] == 4
    n_output = output_timestamp.shape[0]

    quat_inter = np.zeros([n_output, 4])
    if n_output == 0:
        return quat_inter
    if n_input < 2:
        raise ValueError('Need at least two input quaternions to interpolate, got {}'.format(n_input))

    # Refuse to extrapolate: outside the covered range there is no bracketing pair of samples.
    t_first, t_last = input_timestamp[0], input_timestamp[-1]
    if np.any(output_timestamp < t_first) or np.any(output_timestamp > t_last):
        raise ValueError(
            'Output time stamps [{}, {}] are not covered by the input time stamps [{}, {}]'.format(
                np.min(output_timestamp), np.max(output_timestamp), t_first, t_last))

    lower = 0
    for i in range(n_output):
        t_out = output_timestamp[i]
        # Forward to the first interval [lower, lower + 1] whose upper time stamp is not before t_out
        while lower < n_input - 2 and input_timestamp[lower + 1] < t_out:
            lower += 1
        # Always bracket with two distinct samples; interpolating between a sample and itself
        # would divide by a zero time span.
        upper = lower + 1
        q1 = quaternion.quaternion(*quat_data[lower])
        q2 = quaternion.quaternion(*quat_data[upper])
        quat_inter[i] = quaternion.as_float_array(
            quaternion.quaternion_time_series.slerp(q1, q2, input_timestamp[lower],
                                                    input_timestamp[upper], t_out))
    return quat_inter


def interpolate_3dvector_linear(input, input_timestamp, output_timestamp):
    """
    Linearly resample a series of vectors onto a new set of time stamps.

    Works for vectors of any dimension d, not only 3. The sampling itself is delegated to
    scipy.interpolate.interp1d with its default settings.

    Args:
        input: Nxd array with one d-dimensional vector per row.
        input_timestamp: N-sized array with the time stamp of each input row.
        output_timestamp: M-sized array with the time stamps to sample at.
    Return:
        Mxd array with one interpolated vector per output time stamp.
    """
    n_samples = input.shape[0]
    n_stamps = input_timestamp.shape[0]
    assert n_samples == n_stamps
    return scipy.interpolate.interp1d(input_timestamp, input, axis=0)(output_timestamp)

In [None]:
# Display the table of source-file paths (one row per activity/person pair) built above
data_df

In [None]:
# Number of pose samples dropped from the start of each recording
skip_front = 800
# Number of pose samples dropped from the end of each recording
skip_end = 800
# Collects one processed DataFrame per recording
agg_pandas = []

In [None]:
# Resample every sensor stream of every recording at the pose time stamps, write one CSV
# per recording and keep the resulting tables in agg_pandas.
from warnings import warn

# Divisor that converts the recorded time stamps to seconds (they are treated as nanoseconds)
nano_to_sec = 1000000000.0

# Column names of the per-recording table, in the order the arrays are concatenated below
column_list = ('time,gyro_x,gyro_y,gyro_z,acce_x,acce_y,acce_z,'
               'linacce_x,linacce_y,linacce_z,grav_x,grav_y,grav_z,'
               'magnet_x,magnet_y,magnet_z,'
               'pos_x,pos_y,pos_z,ori_w,ori_x,ori_y,ori_z,rv_w,rv_x,rv_y,rv_z').split(',')


def _sample_rate(sensor_data):
    """Mean sample rate in Hz of an array whose first column holds the time stamps."""
    return (sensor_data.shape[0] - 1.0) * nano_to_sec / (sensor_data[-1, 0] - sensor_data[0, 0])


# Start from an empty list so that re-running this cell does not append every recording twice
agg_pandas = []

for _, c_row in data_df.iterrows():
    motion_type = c_row['activity']
    out_root = '{activity}_{person}'.format(**c_row)
    data_root = c_row['acce'].parent
    print('------------------\nProcessing {}, type {}'.format(data_root, motion_type))

    # drop the head and tail; the end index is computed explicitly because a plain
    # [skip_front:-skip_end] slice would return nothing for skip_end == 0
    pose_data = np.genfromtxt(data_root / 'pose.txt')
    pose_data = pose_data[skip_front:max(pose_data.shape[0] - skip_end, 0), :]

    # swap tango's orientation from [x,y,z,w] to [w,x,y,z]
    pose_data[:, [-4, -3, -2, -1]] = pose_data[:, [-1, -4, -3, -2]]

    # the pose time stamps define the output time base
    output_timestamp = pose_data[:, 0]
    output_samplerate = output_timestamp.shape[0] * nano_to_sec / (output_timestamp[-1] - output_timestamp[0])
    if not (195 < output_samplerate < 205):
        warn('Wrong output sample rate: %f' % output_samplerate)

    print('Pose sample rate: {:.2f}Hz'.format(output_samplerate))

    # Load the raw sensor streams (first column: time stamp, remaining columns: values)
    acce_data = np.genfromtxt(data_root / 'acce.txt')
    print('Acceleration found. Sample rate:{:.2f} Hz'.format(_sample_rate(acce_data)))
    gyro_data = np.genfromtxt(data_root / 'gyro.txt')
    print('Gyroscope found. Sample rate:{:.2f} Hz'.format(_sample_rate(gyro_data)))
    linacce_data = np.genfromtxt(data_root / 'linacce.txt')
    print('Linear acceleration found. Sample rate:{:.2f} Hz'.format(_sample_rate(linacce_data)))
    gravity_data = np.genfromtxt(data_root / 'gravity.txt')
    print('Gravity found. Sample rate:{:.2f} Hz'.format(_sample_rate(gravity_data)))

    magnet_data = np.genfromtxt(data_root / 'magnet.txt')
    print('Magnetometer: {:.2f}Hz'.format(_sample_rate(magnet_data)))

    orientation_data = np.genfromtxt(data_root / 'orientation.txt')
    print('Orientation found. Sample rate:{:.2f}'.format(_sample_rate(orientation_data)))

    # Generate dataset: resample every stream at the pose time stamps
    output_gyro_linear = interpolate_3dvector_linear(gyro_data[:, 1:], gyro_data[:, 0], output_timestamp)
    output_accelerometer_linear = interpolate_3dvector_linear(acce_data[:, 1:], acce_data[:, 0],
                                                              output_timestamp)
    output_linacce_linear = interpolate_3dvector_linear(linacce_data[:, 1:], linacce_data[:, 0],
                                                        output_timestamp)
    output_gravity_linear = interpolate_3dvector_linear(gravity_data[:, 1:], gravity_data[:, 0],
                                                        output_timestamp)
    output_magnet_linear = interpolate_3dvector_linear(magnet_data[:, 1:], magnet_data[:, 0],
                                                       output_timestamp)
    # swap from x,y,z,w to w,x,y,z
    orientation_data[:, [1, 2, 3, 4]] = orientation_data[:, [4, 1, 2, 3]]
    # Interpolate the orientation quaternions at the pose time stamps
    output_orientation = interpolate_quaternion_linear(orientation_data[:, 1:], orientation_data[:, 0],
                                                       output_timestamp)

    # construct a Pandas DataFrame
    data_mat = np.concatenate([output_timestamp[:, None], output_gyro_linear,
                               output_accelerometer_linear,
                               output_linacce_linear,
                               output_gravity_linear,
                               output_magnet_linear,
                               pose_data[:, 1:4],
                               pose_data[:, -4:],
                               output_orientation], axis=1)

    data_pandas = pd.DataFrame(data_mat, columns=column_list)

    data_pandas.to_csv(out_root + '_data.csv')

    # duration of the recording in seconds and the resulting rows-per-second figure
    length = (data_pandas['time'].values[-1] - data_pandas['time'].values[0]) / nano_to_sec
    hertz = data_pandas.shape[0] / length
    agg_pandas.append(data_pandas.assign(activity=c_row['activity'], person=c_row['person'], hertz=hertz))

In [None]:
# Stack the per-recording tables, renumber the rows from zero and write a single file;
# pandas picks the compression from the '.zip' extension of the file name.
stacked_readings = pd.concat(agg_pandas)
all_readings_df = stacked_readings.reset_index(drop=True)
all_readings_df.to_csv('all_readings.csv.zip', index=False)