In [6]:
import pandas as pd
import numpy as np
import xgboost
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

In [7]:
train = pd.read_csv('../data/acc_data/training_large.csv')#,header = None)
test = pd.read_csv('../data/acc_data/validation_large.csv')#,header = None)

In [8]:
train.head()

Unnamed: 0,vx,vy,vz,dx,dy,vfx,vfy,vfz,afx,afy,...,num_v_labels,ax,ay,az,vx_lag_1,vy_lag_1,vz_lag_1,vx_lag_2,vy_lag_2,vz_lag_2
0,-10.738884,-2.270775,-0.017484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-0.370861,0.324686,0.063559,-10.776157,-2.238142,-0.011096,-10.822652,-2.210152,-0.011221
1,-10.702093,-2.299289,-0.027242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-0.36606,0.283704,0.097093,-10.738884,-2.270775,-0.017484,-10.776157,-2.238142,-0.011096
2,-10.649046,-2.326863,-0.015441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-0.527806,0.27436,-0.117417,-10.702093,-2.299289,-0.027242,-10.738884,-2.270775,-0.017484
3,-10.598294,-2.345125,-0.018072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-0.504967,0.181704,0.026169,-10.649046,-2.326863,-0.015441,-10.702093,-2.299289,-0.027242
4,-10.558764,-2.369425,-0.021834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-0.393321,0.241773,0.037435,-10.598294,-2.345125,-0.018072,-10.649046,-2.326863,-0.015441


In [32]:
X = train.loc[:,['vx', 'vy', 'vz', 'dx', 'dy', 'vfx', 'vfy', 'vfz', 'afx', 'afy', 'afz', 'num_v_labels',
                 'vx_lag_1','vy_lag_1','vz_lag_1','vx_lag_2','vy_lag_2','vz_lag_2']]
Y_ax = train.loc[:,['ax']].values.ravel() # flatten to 1d array
Y_ay = train.loc[:,['ay']].values.ravel()

# y = Y_ax
X_train_ax, X_test_ax, y_train_ax, y_test_ax = train_test_split(X, Y_ax)
X_train_ay, X_test_ay, y_train_ay, y_test_ay = train_test_split(X, Y_ay)

#### Baseline 

In [33]:
model_XGB = make_pipeline(StandardScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, max_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score is: ' ,-np.mean(cross_val_score(model_XGB,X_train_ax,y_train_ax, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

 Extreme Gradient Boosting score is:  0.0942819031076736 



#### Improved model

In [34]:
train.describe()

Unnamed: 0,vx,vy,vz,dx,dy,vfx,vfy,vfz,afx,afy,...,num_v_labels,ax,ay,az,vx_lag_1,vy_lag_1,vz_lag_1,vx_lag_2,vy_lag_2,vz_lag_2
count,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,...,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0,154371.0
mean,-0.045204,0.280796,0.007444,15.898143,-0.02032,-0.155993,-0.3725322,0.008820057,0.00697,-0.004353,...,30.397743,0.01867,-0.005758,-0.000761,-0.043328,0.280218,0.007367,-0.041451,0.279619,0.007289
std,6.792466,6.762571,0.254348,20.432149,0.639231,7.362743,4.845964,0.1935924,3.903491,5.7012,...,22.375148,0.588218,0.58993,0.303629,6.791483,6.761767,0.254404,6.790503,6.760985,0.25444
min,-29.72599,-28.805597,-1.870755,0.0,-2.82959,-36.599137,-44.39295,-2.205683,-247.073979,-376.752497,...,0.0,-4.877404,-5.027894,-5.537979,-29.72599,-28.805597,-1.870755,-29.72599,-28.805597,-1.870755
25%,-1.07136,-1.11134,-0.035837,0.0,0.0,-0.002183,0.0,0.0,0.0,0.0,...,15.0,-0.160188,-0.185948,-0.091154,-1.071304,-1.114275,-0.035877,-1.070603,-1.117803,-0.035905
50%,0.0,0.0,0.0,8.789853,0.0,0.0,0.0,0.0,0.0,0.0,...,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.443867,1.446956,0.041129,27.917728,0.053655,0.0,3.985386e-13,5.387028e-14,0.0,0.0,...,41.0,0.19452,0.178761,0.091352,1.447644,1.444984,0.040991,1.449968,1.443632,0.040942
max,29.348547,29.198553,2.02115,82.528374,2.869635,62.529255,36.54719,3.251867,229.525634,345.904727,...,170.0,3.580917,4.984998,4.449774,29.348547,29.198553,2.02115,29.348547,29.198553,2.02115


In [35]:
# First we will try to see if RobustScaler helps.
model_XGB = make_pipeline(RobustScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, max_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score is: ' ,-np.mean(cross_val_score(model_XGB,X_train_ax,y_train_ax, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

 Extreme Gradient Boosting score is:  0.09414597141827319 



We can see a small increase in accuracy, so we will keep it. Now we will try three different models and see how they work, this models are:
* XGboost
* Gradient Boosting Regressor
* LightGBM Regressor

In [36]:
def boosting_models(X_train,y_train):
    
    categorical = X_train.dtypes == object
    
    model_GBC = make_pipeline(RobustScaler(), GradientBoostingRegressor(alpha = 0.85, n_estimators = 1000, max_depth = 3))
    model_LGB = make_pipeline(RobustScaler(), LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=800))
    model_XGB = make_pipeline(RobustScaler(), xgboost.XGBRegressor(gamma=0.05, learning_rate=0.05, max_depth=3, n_estimators=2500, reg_alpha=0.5, reg_lambda=0.85))

    print('Gradient Boosting score is: ' , -np.mean(cross_val_score(model_GBC,X_train,y_train, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')
    print('Light Gradient Boosting score is: ' , -np.mean(cross_val_score(model_LGB,X_train,y_train, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')
    print(' Extreme Gradient Boosting score is: ' , -np.mean(cross_val_score(model_XGB,X_train,y_train, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')
    
    
    return model_GBC, model_LGB,model_XGB

In [37]:
model_GBC, model_LGB, model_XGB = boosting_models(X_train_ax,y_train_ax)

Gradient Boosting score is:  0.10011845147698209 

Light Gradient Boosting score is:  0.13272166297633842 

 Extreme Gradient Boosting score is:  0.09414597141827319 



In [38]:
def stacking_model(X_train,y_train):
    
    lasso = Lasso(alpha = 1e-3)
    GBC = GradientBoostingRegressor(alpha = 0.85,n_estimators = 1000,max_depth = 3)
    LGB = LGBMRegressor(objective='regression',num_leaves=5, learning_rate=0.05, n_estimators=800)
    XGB = xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, max_depth=3, n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85)

    categorical = X_train.dtypes == object
    model_SR = make_pipeline(RobustScaler(), StackingRegressor(regressors=[LGB,lasso,XGB], meta_regressor=lasso))

    scores = cross_val_score(model_SR, X_train, y_train, scoring='neg_median_absolute_error',cv=5, n_jobs = -1)
    print("Stacking Regressor score: ", -scores.mean(), ' std: ', scores.std())
    
    return model_SR

In [39]:
model_SR = stacking_model(X_train_ax,y_train_ax)

Stacking Regressor score:  0.07403871356478103  std:  0.000714188625620322


#### Model validation

In [40]:
test.head(2)

Unnamed: 0,vx,vy,vz,dx,dy,vfx,vfy,vfz,afx,afy,...,ax,ay,az,vx_lag_1,vy_lag_1,vz_lag_1,pred_ax,vx_lag_2,vy_lag_2,vz_lag_2
0,-1.399296,-0.110872,-0.017601,19.351571,1.713411,1.385692,0.098483,0.017623,0.201597,-0.248766,...,-0.201692,0.248871,-0.039894,-1.419567,-0.085859,-0.02161,-0.311843,-1.481712,-0.096803,-0.014761
1,-1.369596,-0.089351,-0.016559,19.215781,1.675868,1.356016,0.076936,0.016581,0.295266,0.214385,...,-0.295503,-0.214125,-0.010365,-1.399296,-0.110872,-0.017601,-0.3873,-1.419567,-0.085859,-0.02161


In [42]:
X_val = test.loc[:,['vx', 'vy', 'vz', 'dx', 'dy', 'vfx', 'vfy', 'vfz', 'afx', 'afy', 'afz', 'num_v_labels',
                    'vx_lag_1','vy_lag_1','vz_lag_1','vx_lag_2','vy_lag_2','vz_lag_2']]
y_val_ax = test.loc[:, ['ax']].values.ravel()
y_val_ay = test.loc[:, ['ay']].values.ravel()

In [43]:
model_SR.fit(X_train_ax,y_train_ax)
model_GBC.fit(X_train_ax,y_train_ax)
model_LGB.fit(X_train_ax,y_train_ax)
model_XGB.fit(X_train_ax,y_train_ax)
y_pred_SR = model_SR.predict(X_val)
y_pred_GBC = model_GBC.predict(X_val)
y_pred_LGB = model_LGB.predict(X_val)
y_pred_XGB = model_XGB.predict(X_val)

  positive)




In [44]:
print("MAE of SR:", mean_absolute_error(y_val_ax,y_pred_SR))
print("MAE of GBC:", mean_absolute_error(y_val_ax,y_pred_GBC))
print("MAE of LGB:", mean_absolute_error(y_val_ax,y_pred_LGB))
print("MAE of XGB:", mean_absolute_error(y_val_ax,y_pred_XGB))

MAE of SR: 0.13397015262406234
MAE of GBC: 0.23470056772516265
MAE of LGB: 0.31482797508678595
MAE of XGB: 0.21734635940659566
