In [0]:
import pandas as pd
import numpy as np
import xgboost
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

In [0]:
# Chelsea's path
training_path='/content/training_data_1_12.csv'
test_path='/content/validation_data_1_12.csv'

train = pd.read_csv(training_path, header = 0)
test = pd.read_csv(test_path, header = 0)

In [5]:
train.head()

Unnamed: 0,vx,vy,vz,dx,dy,vfx,vfy,vfz,afx,afy,afz,num_v_labels,ax,ay,az
0,6.792706,13.956196,-0.19724,48.522705,-0.589755,-16.459713,-1.624721,0.347638,0.0,0.0,0.0,20.0,0.0,0.0,0.0
1,6.79724,13.99315,-0.194239,48.528838,-0.60414,-16.445887,-1.687644,0.312599,-0.137566,0.626051,0.348616,20.0,-0.045113,-0.367673,-0.029864
2,6.793256,14.003455,-0.178358,48.533003,-0.599635,-16.419858,-1.721146,0.269505,-0.258971,0.333337,0.42877,21.0,0.039639,-0.102534,-0.158003
3,6.800904,14.004451,-0.168657,48.535982,-0.576815,-16.401916,-1.742591,0.226056,-0.178512,0.213361,0.432287,22.0,-0.076089,-0.009906,-0.096522
4,6.812319,14.013153,-0.179739,48.531692,-0.565499,-16.386284,-1.770749,0.222269,-0.155528,0.280159,0.037679,22.0,-0.113578,-0.086583,0.110262


In [7]:
X = train.loc[:,['vx', 'vy', 'vz', 'dx', 'dy', 'vfx', 'vfy', 'vfz', 'afx', 'afy', 'afz', 'num_v_labels']]
Y_ax = train.loc[:,['ax']].values.ravel() # flatten to 1d array
Y_ay = train.loc[:,['ay']].values.ravel()

# y = Y_ax
X_train_ax, X_test_ax, y_train_ax, y_test_ax = train_test_split(X, Y_ax)
X_train_ay, X_test_ay, y_train_ay, y_test_ay = train_test_split(X, Y_ay)

X_test = test.loc[:,['vx', 'vy', 'vz', 'dx', 'dy', 'vfx', 'vfy', 'vfz', 'afx', 'afy', 'afz', 'num_v_labels']]
Y_test_ax = test.loc[:, ['ax']].values.ravel()
Y_test_ay = test.loc[:, ['ay']].values.ravel()

print("X_train_ax shape: {}, X_test_ax shape:{}".format(X_train_ax.shape, X_test_ax.shape))
print("y_train_ax shape: {}, y_test_ax shape:{}".format(y_train_ax.shape, y_test_ax.shape))

print("X_train_ay shape: {}, X_test_ay shape:{}".format(X_train_ay.shape, X_test_ay.shape))
print("y_train_ay shape: {}, y_test_ay shape:{}".format(y_train_ay.shape, y_test_ay.shape))


X_train_ax shape: (13240, 12), X_test_ax shape:(4414, 12)
y_train_ax shape: (13240,), y_test_ax shape:(4414,)
X_train_ay shape: (13240, 12), X_test_ay shape:(4414, 12)
y_train_ay shape: (13240,), y_test_ay shape:(4414,)


# Cross validation

#### Baseline 

In [9]:
model_XGB_ax = make_pipeline(StandardScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, max_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score wrt ax is: ' ,-np.mean(cross_val_score(model_XGB_ax,X_train_ax,y_train_ax, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

model_XGB_ay = make_pipeline(StandardScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, may_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score wrt ay is: ' ,-np.mean(cross_val_score(model_XGB_ay,X_train_ay,y_train_ay, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

 Extreme Gradient Boosting score wrt ax is:  0.07232612804085899 

 Extreme Gradient Boosting score wrt ay is:  0.06316856767965322 



#### Improved model

In [10]:
train.describe()

Unnamed: 0,vx,vy,vz,dx,dy,vfx,vfy,vfz,afx,afy,afz,num_v_labels,ax,ay,az
count,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0,17654.0
mean,0.895922,0.512241,-0.027057,19.094545,-0.030745,-0.6444828,-0.463074,0.022544,-0.042441,0.003143,0.00151,40.00929,0.026266,8.8e-05,-0.000867
std,4.180982,4.017711,0.225891,15.604483,0.652574,5.141496,3.445494,0.18076,3.282429,4.25011,0.289947,22.87581,0.452855,0.526465,0.232663
min,-19.150709,-20.314184,-1.397452,0.0,-2.581826,-16.45971,-14.104065,-1.460764,-127.339546,-199.099836,-7.169926,2.0,-2.827884,-2.567549,-2.496956
25%,-0.096572,-0.186617,-0.035276,9.540328,-0.244661,-2.062258,-0.248682,-0.001611,-0.203056,-0.125376,-0.027802,23.0,-0.096205,-0.136065,-0.056459
50%,0.0,0.0,0.0,14.958324,0.0,-9.418571e-14,0.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0
75%,1.998405,1.586876,0.010086,27.423361,0.261646,0.001632117,0.315125,0.033351,0.089273,0.109812,0.031741,52.0,0.144699,0.16754,0.055185
max,20.264639,14.041661,1.450144,81.817653,2.60249,37.50918,23.87147,1.361739,129.366037,186.041649,7.546433,164.0,2.621215,2.352465,2.783943


In [11]:
# First we will try to see if RobustScaler helps.
model_XGB_ax = make_pipeline(RobustScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, max_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score is: ' ,-np.mean(cross_val_score(model_XGB_ax,X_train_ax,y_train_ax, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

model_XGB_ay = make_pipeline(RobustScaler(), xgboost.XGBRegressor(gamma=0.05,learning_rate=0.05, may_depth=3,
                                            n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85))
print(' Extreme Gradient Boosting score is: ' ,-np.mean(cross_val_score(model_XGB_ay,X_train_ay,y_train_ay, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1)),'\n')

 Extreme Gradient Boosting score is:  0.06989822038791553 

 Extreme Gradient Boosting score is:  0.0651028377981756 



We can see a small increase in accuracy, so we will keep it. Now we will try three different models and see how they work, this models are:
* XGboost
* Gradient Boosting Regressor
* LightGBM Regressor

In [0]:
def generate_boosting_models(X_train):
    
    categorical = X_train.dtypes == object
    
    model_GBC = make_pipeline(RobustScaler(), GradientBoostingRegressor(alpha = 0.85, n_estimators = 1000, max_depth = 3))
    model_LGB = make_pipeline(RobustScaler(), LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=800))
    model_XGB = make_pipeline(RobustScaler(), xgboost.XGBRegressor(objective ='reg:squarederror', gamma=0.05, learning_rate=0.05, max_depth=3, n_estimators=2500, reg_alpha=0.5, reg_lambda=0.85))
  
    return model_GBC, model_LGB,model_XGB

def generate_stacking_model(X_train):
    
    lasso = Lasso(alpha = 1e-3)
    GBC = GradientBoostingRegressor(alpha = 0.85,n_estimators = 1000,max_depth = 3)
    LGB = LGBMRegressor(objective='regression',num_leaves=5, learning_rate=0.05, n_estimators=800)
    XGB = xgboost.XGBRegressor(objective ='reg:squarederror', gamma=0.05,learning_rate=0.05, max_depth=3, n_estimators=2500,reg_alpha=0.5, reg_lambda=0.85)

    categorical = X_train.dtypes == object
    model_SR = make_pipeline(RobustScaler(), StackingRegressor(regressors=[LGB,lasso,XGB], meta_regressor=lasso))
    
    return model_SR

def evaluate_model_by_cross_validation(model, X_train, y_train):
  return -np.mean(cross_val_score(model,X_train,y_train, scoring='neg_median_absolute_error', cv = 5, n_jobs = -1))


In [0]:
# Generate models for cross validation wrt ax and ay
model_GBC_cross_ax, model_LGB_cross_ax, model_XGB_cross_ax = generate_boosting_models(X_train_ax)
model_GBC_cross_ay, model_LGB_cross_ay, model_XGB_cross_ay = generate_boosting_models(X_train_ay)
model_SR_cross_ax = generate_stacking_model(X_train_ax)
model_SR_cross_ay = generate_stacking_model(X_train_ay)

In [16]:
print('Gradient Boosting score for ax is: ', evaluate_model_by_cross_validation(model_GBC_cross_ax, X_train_ax, y_train_ax))
print('Light Gradient Boosting score for ax is: ' , evaluate_model_by_cross_validation(model_LGB_cross_ax, X_train_ax, y_train_ax))
print('Extreme Gradient Boosting score for ax is: ' , evaluate_model_by_cross_validation(model_XGB_cross_ax, X_train_ax, y_train_ax))
print("Stacking Regressor score for ax is: ", evaluate_model_by_cross_validation(model_SR_cross_ax, X_train_ax, y_train_ax))

print("\n\n")

print('Gradient Boosting score for ay is: ', evaluate_model_by_cross_validation(model_GBC_cross_ay, X_train_ay, y_train_ay))
print('Light Gradient Boosting score for ay is: ' , evaluate_model_by_cross_validation(model_LGB_cross_ay, X_train_ay, y_train_ay))
print('Extreme Gradient Boosting score for ay is: ' , evaluate_model_by_cross_validation(model_XGB_cross_ay, X_train_ay, y_train_ay))
print("Stacking Regressor score for ay is: ", evaluate_model_by_cross_validation(model_SR_cross_ay, X_train_ay, y_train_ay))

Gradient Boosting score for ax is:  0.06692848397457403
Light Gradient Boosting score for ax is:  0.0798259959651597
Extreme Gradient Boosting score for ax is:  0.06989822038791553
Stacking Regressor score for ax is:  0.07090951816094951



Gradient Boosting score for ay is:  0.058563565158533816
Light Gradient Boosting score for ay is:  0.07099243864227897
Extreme Gradient Boosting score for ay is:  0.0651028377981756
Stacking Regressor score for ay is:  0.0652450543892687


In [0]:
aux = [i if abs(np.mean((train.iloc[i,:10] - train.iloc[i+1,:10]))) else 0 for i in range(0,len(train))] 

## Testing

In [0]:
# Generate models for testing out on test data
model_GBC_ax, model_LGB_ax, model_XGB_ax = generate_boosting_models()
model_GBC_ay, model_LGB_ay, model_XGB_ay = generate_boosting_models()
model_SR_ax = generate_stacking_model()
model_SR_ay = generate_stacking_model()

In [0]:
from sklearn.metrics import mean_absolute_error

# Testing model wrt ax
# Fitting models
model_XGB_ax.fit(X, Y_ax)
model_GBC_ax.fit(X, Y_ax)
model_LGB_ax.fit(X, Y_ax)
model_SR_ax.fit(X, Y_ax)

# Predict
pred_XGB_Y_ax = model_XGB_ax.predict(X_val)
pred_GBC_Y_ax = model_GBC_ax.predict(X_val)
pred_LGB_Y_ax = model_LGB_ax.predict(X_val)
pred_SR_Y_ax = model_SR_ax.predict(X_val)

# Evaluate
print("MAE of XGB model for ax is:", mean_absolute_error(pred_XGB_Y_ax, Y_val_ax))
print("MAE of GBC model for ax is:", mean_absolute_error(pred_GBC_Y_ax, Y_val_ax))
print("MAE of LGB model for ax is:", mean_absolute_error(pred_LGB_Y_ax, Y_val_ax))
print("MAE of SR model for ax is:", mean_absolute_error(pred_SR_Y_ax, Y_val_ax))


MAE of XGB model for ax is: 0.3686276244769425
MAE of GBC model for ax is: 0.3914869858510271
MAE of LGB model for ax is: 0.3331275368083274
MAE of SR model for ax is: 0.3881150978628632


In [0]:
# Testing model wrt ay
# Fitting models
model_XGB_ay.fit(X, Y_ay)
model_GBC_ay.fit(X, Y_ay)
model_LGB_ay.fit(X, Y_ay)
model_SR_ay.fit(X, Y_ay)

# Predict
pred_XGB_Y_ay = model_XGB_ay.predict(X_val)
pred_GBC_Y_ay = model_GBC_ay.predict(X_val)
pred_LGB_Y_ay = model_LGB_ay.predict(X_val)
pred_SR_Y_ay = model_SR_ay.predict(X_val)

# Evaluate
print("MAE of XGB model for ay is:", mean_absolute_error(pred_XGB_Y_ay, Y_val_ay))
print("MAE of GBC model for ay is:", mean_absolute_error(pred_GBC_Y_ay, Y_val_ay))
print("MAE of LGB model for ay is:", mean_absolute_error(pred_LGB_Y_ay, Y_val_ay))
print("MAE of SR model for ay is:", mean_absolute_error(pred_SR_Y_ay, Y_val_ay))

MAE of XGB model for ay is: 0.3619189745640704
MAE of GBC model for ay is: 0.3909493338559913
MAE of LGB model for ay is: 0.26933650078551474
MAE of SR model for ay is: 0.398083355688693
