In [71]:
import numpy as np
import pandas as pd
import os
import glob
import xgboost as xgb
import pickle

In [55]:
# Load the test data. Change `path` if the feature files are not under the
# default 'safety/features'; change the glob pattern if they are not CSV.
path = r'safety/features'
# glob returns files in arbitrary order; sorting makes the row order (and the
# tie-breaking of the sort below) reproducible between runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError("no CSV files found in {!r}".format(path))

# ignore_index=True gives every row a unique label. Each part file is read with
# its own default 0..n-1 index, and the feature columns assigned later are
# aligned on the index, so duplicate labels across files must be avoided.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
# Order rows chronologically within each trip; the window-2 rolling features
# computed later compare each row with the row before it.
df.sort_values(['bookingID', 'second'], inplace=True)
# label = pd.read_csv('safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')

In [3]:
# load the model
# NOTE: unpickling can execute arbitrary code; only load a safety.pickle that
# comes from a trusted source.
# The with-block closes the file handle as soon as the model is deserialized.
with open("safety.pickle", "rb") as model_file:
    bst = pickle.load(model_file)

In [61]:
# generate features for test data
# Every per-trip statistic below is computed with transform(), i.e. it is
# broadcast back onto every row of the same bookingID.
# NOTE: the window-2 rolling statistics compare each row with the previous row
# of the same trip, so df must already be sorted by bookingID, then second.
booking_ids = df['bookingID']

# approximate the jerk along x/y/z-axis: the (sample) std of two consecutive
# readings equals |a1 - a0| / sqrt(2), i.e. it is proportional to the change in
# acceleration. The first reading of a trip has no predecessor -> 0.
for axis in ('x', 'y', 'z'):
    df['jerk_{}_pre'.format(axis)] = (
        df.groupby('bookingID')['acceleration_{}'.format(axis)]
        .rolling(window=2).std()
        .reset_index(level=0, drop=True)
        .fillna(0)
    )

# since the data is not guaranteed to be second-by-second,
# this enables the calculation of time window during which the jerk occurred.
# The window-2 minimum is the earlier of the current and previous timestamp;
# the first row of a trip has no previous timestamp and is filled with -1.
df['elapsed_time_min'] = (
    df.groupby('bookingID')['second']
    .rolling(window=2).min()
    .reset_index(level=0, drop=True)
    .fillna(-1)
)
time_delta = df['second'] - df['elapsed_time_min']
# NOTE(review): two readings of one trip with the same `second` make
# time_delta 0, so the division yields inf/NaN - confirm whether the input can
# contain such duplicates.
for axis in ('x', 'y', 'z'):
    df['jerk_' + axis] = df['jerk_{}_pre'.format(axis)] / time_delta

# calculate the max jerk along the y-axis
df['jerk_y_max'] = df['jerk_y'].groupby(booking_ids).transform('max')

# calculate the standard deviation of the jerk along x/y/z-axis
for axis in ('x', 'y', 'z'):
    df['jerk_{}_std'.format(axis)] = df['jerk_' + axis].groupby(booking_ids).transform('std')

# calculate the 95th percentile for acceleration and gyrometer along the z-axis.
# q is passed as a scalar so that each trip reduces to exactly one value that
# transform() can broadcast; a list-valued q adds an extra index level to the
# quantile result.
df['acc_z_95'] = df['acceleration_z'].groupby(booking_ids).transform('quantile', q=0.95)
df['gyro_z_95'] = df['gyro_z'].groupby(booking_ids).transform('quantile', q=0.95)

# calculate the mean of the jerk along x/y/z-axis
for axis in ('x', 'y', 'z'):
    df['jerk_{}_mean'.format(axis)] = df['jerk_' + axis].groupby(booking_ids).transform('mean')

# count, per trip, the readings whose jerk is strictly greater than the trip
# mean plus 2/3/4/6 trip standard deviations. For each (axis, n_std) pair a 0/1
# flag column `jerk_<axis>_std_<n>` and its per-trip sum `..._count` are added.
for axis, n_std in (('z', 2), ('z', 3),
                    ('y', 3), ('y', 4), ('y', 6),
                    ('x', 2), ('x', 3), ('x', 4)):
    jerk = 'jerk_' + axis
    flag = '{}_std_{}'.format(jerk, n_std)
    threshold = df[jerk + '_mean'] + n_std * df[jerk + '_std']
    df[flag] = np.where(df[jerk] > threshold, 1, 0)
    df[flag + '_count'] = df[flag].groupby(booking_ids).transform('sum')

In [63]:
# Collapse the frame to a single row per trip: the first row of every
# bookingID is kept. The per-trip features were broadcast to all rows of a
# trip with transform(), so that first row carries the trip-level values.
df2 = df.drop_duplicates(subset='bookingID', keep='first')

# Select only the derived features that are handed to the model.
# Presumably this is the column order the model was trained with - keep it
# unchanged.
feature_columns = [
    'jerk_y_std_3_count',
    'jerk_y_std_4_count',
    'gyro_z_95',
    'jerk_z_std_3_count',
    'jerk_z_std_2_count',
    'acc_z_95',
    'jerk_y_max',
    'jerk_x_std_3_count',
    'jerk_z_std',
    'jerk_y_std_6_count',
    'jerk_x_std_2_count',
    'jerk_x_std_4_count',
]
df3 = df2[feature_columns]

In [102]:
# Score every trip (one row of df3 each) with the loaded booster.
dtest = xgb.DMatrix(df3)
ypred = bst.predict(dtest)

# A constant bias is added to each score before rounding to the nearest
# integer, to correct for class imbalance during training.
bias = 0.38
predictions = []
for score in ypred:
    predictions.append(round(score + bias))
pred = pd.DataFrame({'prediction': predictions})

# Write the predictions to CSV, one row per trip in the row order of df3.
# NOTE(review): the file has no bookingID column, so rows can only be matched
# to trips by position - confirm this is what the consumer expects.
pred.to_csv('predictions.csv', index=False)