Import relevant packages

In [1]:
import os
import pandas as pd
import math
import numpy as np
from IPython.display import Image
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.preprocessing import MinMaxScaler

List all .csv files in the data folder (each file is loaded into its own dataframe in a later cell)

In [2]:
# List the contents of the data folder. The listing is sorted because os.listdir returns
# entries in arbitrary order, and the order of this list decides the row order of
# everything that is built from it.
path = r'Raw Data'
files = sorted(os.listdir(path))
print('All files in Data folder:', files)
# Pick out the '.csv' files - match the full extension (including the dot), ignoring case
files_csv = [f for f in files if os.path.splitext(f)[1].lower() == '.csv']
print('All .csv files in Data folder:',files_csv)

All files in Data folder: ['ID_1_TRIAL_1.csv', 'ID_1_TRIAL_2.csv', 'ID_2_TRIAL_1.csv', 'ID_3_TRIAL_1.csv', 'ID_4_TRIAL_1.csv', 'ID_5_TRAIL_1.csv', 'ID_5_TRAIL_2.csv', 'ID_6_TRIAL_1.csv', 'ID_7_TRAIL_1.csv', 'ID_8_TRAIL_1.csv', 'ID_8_TRIAL_2.csv', 'ID_9_TRAIL_1.csv', 'ID_9_TRIAL_2.csv']
All .csv files in Data folder: ['ID_1_TRIAL_1.csv', 'ID_1_TRIAL_2.csv', 'ID_2_TRIAL_1.csv', 'ID_3_TRIAL_1.csv', 'ID_4_TRIAL_1.csv', 'ID_5_TRAIL_1.csv', 'ID_5_TRAIL_2.csv', 'ID_6_TRIAL_1.csv', 'ID_7_TRAIL_1.csv', 'ID_8_TRAIL_1.csv', 'ID_8_TRIAL_2.csv', 'ID_9_TRAIL_1.csv', 'ID_9_TRIAL_2.csv']


In [3]:
def interpolate(query, x, y):
    """Linearly interpolate the samples (x, y) at the positions given in `query`.

    Thin wrapper around `np.interp`. The sample positions `x` should already be
    sorted in ascending order (time order for this notebook), because `np.interp`
    does not verify this itself.

    Parameters
    ----------
    query : array-like
        Positions at which interpolated values are wanted.
    x : array-like
        Positions of the known samples, ascending.
    y : array-like
        Values of the known samples, same length as `x`.

    Returns
    -------
    numpy.ndarray
        One interpolated value per entry of `query`.
    """
    return np.interp(query, x, y)

Load each .csv, remove unnecessary columns, engineer the derived features, standardise the time step, and append the result to the overall input and output dataframes shared by all .csv files.

In [4]:
# Every .csv file listed in `files_csv` is loaded, cleaned, extended with finite-difference
# accelerations and resampled onto a constant time step through linear interpolation. The
# per-file results are then stacked into the large matrices 'input' and 'output' ready for
# further processing: row k of 'input' is the state at one time step and row k of 'output'
# is the state one time step later.
# NOTE: 'input' shadows the Python builtin of the same name; the name is kept because the
# export cell further down refers to it.

time_step = 0.02  # seconds - spacing of the resampled time grid
min_rows = 3      # two trailing rows are discarded, and at least one row must remain
unused_columns = ['Position y', 'Velocity y', 'Rotation x', 'Rotation z',
                  'Angular Velocity x', 'Angular Velocity z']


def _prepare_trial(raw, time_step=time_step):
    """Clean one raw trial and resample it onto a constant time step.

    Parameters
    ----------
    raw : pandas.DataFrame
        One trial exactly as read from its .csv file. It must contain 'Time',
        'Velocity x', 'Velocity z', 'Angular Velocity y', 'Rotation y' and the
        columns named in `unused_columns` (leading spaces in names are tolerated).
    time_step : float
        Spacing, in seconds, of the resampled time grid (grid starts at 0).

    Returns
    -------
    pandas.DataFrame or None
        Frame with 'Time' as first column, followed by the remaining original
        columns and 'Acceleration x', 'Acceleration z', 'Angular Acceleration y'.
        None when fewer than `min_rows` rows survive the rotation trimming.

    Raises
    ------
    ValueError
        If 'Time' is not strictly increasing.
    """
    # remove spaces at the beginning of column names, then drop the unused columns.
    # (drop returns a new frame - its result has to be kept for the columns to go away)
    trial = raw.rename(columns=str.lstrip).drop(columns=unused_columns)

    # convert Rotation y to an angle in DEGREES: 2*asin(value)*180/pi
    # (presumably the value is the y component of a unit quaternion - confirm with the logger)
    trial['Rotation y'] = 2 * np.arcsin(trial['Rotation y']) * (180 / math.pi)

    # Cut the trial short one sample before the first reading beyond +/-179 degrees
    # (looks like a guard against the +/-180 degree wrap-around - TODO confirm).
    # The test is on whether a match EXISTS, so a match at row 0 is handled too.
    angle = trial['Rotation y'].to_numpy()
    beyond_limit = np.flatnonzero((angle < -179) | (angle > 179))
    if beyond_limit.size:
        keep = max(int(beyond_limit[0]) - 1, 0)
        trial = trial.iloc[:keep]

    if len(trial) < min_rows:
        return None

    # np.interp gives meaningless results unless the sample positions increase, and a
    # repeated time stamp would make the finite differences divide by zero.
    dt = np.diff(trial['Time'].to_numpy())
    if (dt <= 0).any():
        raise ValueError("'Time' must be strictly increasing within a trial")

    # feature engineering - forward differences: (value[i+1] - value[i]) / (t[i+1] - t[i])
    derived = {
        'Acceleration x': np.diff(trial['Velocity x'].to_numpy()) / dt,
        'Acceleration z': np.diff(trial['Velocity z'].to_numpy()) / dt,
        'Angular Acceleration y': np.diff(trial['Angular Velocity y'].to_numpy()) / dt,
    }

    # get rid of the last 2 entries of the original values and the last entry of the
    # engineered features, so that both have the same number of rows
    trial = trial.iloc[:-2].copy()
    for name, values in derived.items():
        trial[name] = values[:-1]

    # generate linearly spaced query times 0, time_step, 2*time_step, ... below the last time
    t_end = trial['Time'].max()
    query = [0]
    while query[-1] + time_step < t_end:
        query.append(len(query) * time_step)

    # interpolate every column onto the query times - assume linearity over a small time step
    sample_times = trial['Time']
    resampled = {'Time': query}
    for col in trial.columns.drop('Time'):
        resampled[col] = interpolate(query, sample_times, trial[col])
    return pd.DataFrame(resampled)


input_parts = []
output_parts = []

for f in files_csv:
    print(f)
    # read from the same folder the file list was built from
    data = pd.read_csv(os.path.join(path, f), index_col=False)

    df = _prepare_trial(data)
    if df is None:
        print('  skipped - fewer than', min_rows, 'usable rows after trimming')
        continue

    # input = every row but the last, output = every row but the first
    input_parts.append(df.iloc[:-1])
    output_parts.append(df.iloc[1:])

if not input_parts:
    raise ValueError('no usable .csv trial was found - nothing to assemble')

# stack all trials once (much cheaper than appending inside the loop, and
# DataFrame.append no longer exists in pandas 2.x), then drop unnecessary columns
input = pd.concat(input_parts).drop(columns=['Time'])
output = pd.concat(output_parts).drop(columns=['Time', 'Throttle', 'Brake', 'Steering'])


ID_1_TRIAL_1.csv
ID_1_TRIAL_2.csv
ID_2_TRIAL_1.csv
ID_3_TRIAL_1.csv
ID_4_TRIAL_1.csv
ID_5_TRAIL_1.csv
ID_5_TRAIL_2.csv
ID_6_TRIAL_1.csv
ID_7_TRAIL_1.csv
ID_8_TRAIL_1.csv
ID_8_TRIAL_2.csv
ID_9_TRAIL_1.csv
ID_9_TRIAL_2.csv


In [5]:
# write the assembled input and output matrices to csv files, leaving out the index column
for frame, target in ((input, 'complete_input.csv'), (output, 'complete_output.csv')):
    frame.to_csv(target, index=False)

In [6]:
# Normalise the input data to values between 0 and 1
# scaler = MinMaxScaler()
# norm_input = scaler.fit_transform(input)
# norm_input = pd.DataFrame(norm_input, index = input.index, columns = input.columns)


In [7]:
# norm_input.to_csv('complete_input_normalised.csv',index=False)