## preparation

- place all test data (in csv format) in `/grab_ai_challenge/data/safety/test` folder

## import packages

In [None]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import numpy as np
import time
import joblib
from io import StringIO

## prepare notebook

In [None]:
# config project folder
os.chdir('../')

# config notebook display
%matplotlib inline
pd.set_option("display.max_rows", None)
pd.set_option('display.max_columns', None)

## load test features

In [None]:
# read all CSV files in the data/safety/<kind> folder at once

def load_data(kind):
    """Load every CSV file of one data split into a single DataFrame.

    Parameters
    ----------
    kind : str
        Name of the sub-folder of ``data/safety/`` to read (e.g. ``"test"``).
        The path is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all ``*.csv`` files in the folder, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` files.
    """
    path = 'data/safety/' + kind
    # glob returns matches in arbitrary (filesystem-dependent) order; sort so
    # the concatenated row order is reproducible between runs and machines.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        # pd.concat would raise an opaque ValueError here; keep the exception
        # type but say which folder was searched.
        raise ValueError(
            "No CSV files found in '{}' (working directory: '{}')".format(path, os.getcwd())
        )
    features = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    return features

In [None]:
# Read every CSV under data/safety/test into one raw features table
features = load_data("test")

## data cleaning

* some records have abnormally large values for "second" (data error) --> remove these records from features

In [None]:
# Clean a deep copy so the raw `features` table stays untouched
features_df = features.copy(deep=True)

In [None]:
# Keep only records whose "second" is at most 1e5; larger values are treated as data errors
abnormality_threshold = 10 ** 5
features_df = features_df[features_df['second'].le(abnormality_threshold)]

* since the training data only includes trips <= 1 hr, split test trips longer than 1 hr into multiple **sub-trips** and label them with a sub ID
    * second in [0, 3600) : subID = 1
    * second in [3600, 7200) : subID = 2
    * second in [7200, 10800) : subID = 3
* bookingID_new = bookingID + "-" + subID (e.g. `123-2`)

In [None]:
# subID is the 1-based index of the hour a record falls into
features_df['subID'] = 1 + np.floor(features_df['second'] / 60 / 60).astype(int)

# make "second" relative to the start of its own sub-trip
features_df['second'] -= (features_df['subID'] - 1) * 60 * 60

# sub-trip identifier: "<bookingID>-<subID>"
features_df['bookingID_new'] = features_df['bookingID'].astype(str).str.cat(
    features_df['subID'].astype(str), sep="-"
)

* sort bookings by bookingID, subID and second

In [None]:
# Order records by booking, then sub-trip, then time within the sub-trip (in place)
sort_keys = ['bookingID', 'subID', 'second']
features_df.sort_values(by=sort_keys, inplace=True)

## feature scaling
- the features have many outliers and these outliers are important (indicative of dangerous driving behaviour), hence, we do not want to clip the data
- StandardScaler and MinMaxScaler are sensitive to the presence of outliers, hence, we'll use RobustScaler

#### acceleration
* shift acceleration_y by the gravitational acceleration g = 9.81
* since acceleration_x is already in the range of [-0.5, 0.5], we don't want to scale acceleration again
* since 0 acceleration has special meaning, we don't want to center acceleration

In [None]:
g = 9.81  # gravitational acceleration subtracted from acceleration_y
# pop() removes the raw column; re-assigning appends the shifted values as the
# last column under the same name.
features_df['acceleration_y'] = features_df.pop('acceleration_y') - g

#### GPS
- `scaler_gps = RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True)`

In [None]:
# use scaler fitted on training features to transform test features

gps_columns = ['Accuracy', 'Bearing', 'Speed']
# Load the previously saved GPS scaler and apply it to the test columns (no re-fitting here)
scaler_gps = joblib.load('model/scaler_gps.save')
features_df[gps_columns] = scaler_gps.transform(features_df[gps_columns])

#### gyro
- `scaler_gyro = RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=False, with_scaling=True)`
- since 0 gyro has special meaning, we don't want to center gyro (it's already quite centered)

In [None]:
# Apply the gyro scaler fitted on the training features to the test features (no re-fitting here)
gyro_columns = ['gyro_x', 'gyro_y', 'gyro_z']
scaler_gyro = joblib.load('model/scaler_gyro.save')
features_df[gyro_columns] = scaler_gyro.transform(features_df[gyro_columns])

## padding

to prepare for CNN, pad all sub-trips with 0 so that they have the same length (1 hour)

In [None]:
# Key tables for the padding grid:
#   - one row per distinct sub-trip id, sorted by the id string
#   - one row per second 0.0, 1.0, ..., 3599.0
unique_bookingID = (
    features_df['bookingID_new']
    .drop_duplicates()
    .to_frame()
    .sort_values(by='bookingID_new')
)
unique_second = pd.DataFrame({"second": np.arange(0, 3600, 1.0)})

In [None]:
# Cartesian product of sub-trips and seconds: give both key tables the same
# constant join key, merge on it, then discard the key from the result.
# The 'key' column stays on both input frames.
for key_table in (unique_bookingID, unique_second):
    key_table['key'] = 0
bookingID_second = pd.merge(unique_bookingID, unique_second, how='outer', on='key').drop('key', axis=1)

In [None]:
# Attach the measurements to the full (sub-trip, second) grid; grid rows with
# no matching record are kept and carry NaN in every feature column.
features_df = pd.merge(
    bookingID_second,
    features_df,
    how='outer',
    on=['bookingID_new', 'second'],
)

In [None]:
# Replace every missing value (the padded grid rows) with 0
features_df = features_df.fillna(0)

In [None]:
# Sanity check: after padding there should be exactly one row per (sub-trip, second) pair
len(features_df) == len(unique_bookingID) * len(unique_second)

## convert features df to 3d array

* array dimension = (#bookings, #seconds, #features)

In [None]:
# GPS features: pivot each signal into a matrix with one row per bookingID_new
# and one column per second.
gps_steps = (
    (": start Accuracy", "Accuracy"),
    (": start Bearing", "Bearing"),
    (": start Speed", "Speed"),
)
gps_matrices = []
for progress_msg, signal in gps_steps:
    print(time.strftime("%H:%M:%S"), progress_msg)
    wide = features_df.pivot_table(index="bookingID_new", columns="second", values=signal)
    gps_matrices.append(wide.values)

accuracy_arr, bearing_arr, speed_arr = gps_matrices

In [None]:
# Acceleration: pivot each axis into a matrix with one row per bookingID_new
# and one column per second.
acceleration_steps = (
    (": start acceleration_x", "acceleration_x"),
    (": start acceleration_y", "acceleration_y"),
    (": start acceleration_z", "acceleration_z"),
)
acceleration_matrices = []
for progress_msg, signal in acceleration_steps:
    print(time.strftime("%H:%M:%S"), progress_msg)
    wide = features_df.pivot_table(index="bookingID_new", columns="second", values=signal)
    acceleration_matrices.append(wide.values)

acceleration_x_arr, acceleration_y_arr, acceleration_z_arr = acceleration_matrices

In [None]:
# Gyro: pivot each axis into a matrix with one row per bookingID_new
# and one column per second.
gyro_steps = (
    (": start gyro_x", "gyro_x"),
    (": start gyro_y", "gyro_y"),
    (": start gyro_z", "gyro_z"),
)
gyro_matrices = []
for progress_msg, signal in gyro_steps:
    print(time.strftime("%H:%M:%S"), progress_msg)
    wide = features_df.pivot_table(index="bookingID_new", columns="second", values=signal)
    gyro_matrices.append(wide.values)

gyro_x_arr, gyro_y_arr, gyro_z_arr = gyro_matrices

In [None]:
# Join the nine per-feature matrices along a new last axis; the feature order
# is GPS (accuracy, bearing, speed), then acceleration x/y/z, then gyro x/y/z.
feature_planes = [
    accuracy_arr, bearing_arr, speed_arr,
    acceleration_x_arr, acceleration_y_arr, acceleration_z_arr,
    gyro_x_arr, gyro_y_arr, gyro_z_arr,
]
test_arr = np.stack(feature_planes, axis=-1)

In [None]:
# Display the dimensions of the stacked array
np.shape(test_arr)

## save processed test data

In [None]:
# Write the stacked test array to disk in NumPy's .npy format
save_path = 'data/safety/test_arr.npy'
np.save(file=save_path, arr=test_arr)

In [None]:
# save test bookingID and subID
# 'key' was only a helper column for the cartesian join; tolerate its absence
# so this cell can be re-run.
unique_bookingID.drop(columns=['key'], inplace=True, errors='ignore')

# Split "<bookingID>-<subID>" using unique_bookingID's own ids. Parsing
# features_df['bookingID_new'] instead would produce one row per padded second
# with a fresh 0..n-1 index; pandas aligns DataFrame assignment on the index,
# so those values would land on unrelated rows (or become NaN).
# rsplit with n=1 splits on the last "-" only.
id_parts = unique_bookingID['bookingID_new'].str.rsplit('-', n=1, expand=True)
unique_bookingID['bookingID'] = pd.to_numeric(id_parts[0])
unique_bookingID['subID'] = pd.to_numeric(id_parts[1])

In [None]:
# Write the sub-trip id table as CSV, without the DataFrame index
unique_bookingID.to_csv(path_or_buf='data/safety/test_id.csv', index=False)