In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Read the raw training set and report its (rows, columns) before cleaning.
data = pd.read_csv('../../train.csv')
print(data.shape)

# Columns that must be populated for a row to be kept: every column used
# below as a feature, plus the 'target' label.
required_columns = [
  'stock_id',
  'date_id',
  'seconds_in_bucket',
  'imbalance_size',
  'imbalance_buy_sell_flag',
  'reference_price',
  'matched_size',
  'far_price',
  'near_price',
  'bid_price',
  'bid_size',
  'ask_price',
  'ask_size',
  'wap',
  'target',
]

# Discard every row that has a missing value in any required column, then
# report the reduced shape so the amount of data removed is visible.
data = data.dropna(subset=required_columns)
print(data.shape)
data

(5237980, 17)
(2343638, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
5730,0,0,300,0.00,0,1.000241,26670118.88,1.000241,1.000241,1.000026,19319.31,1.000241,16149.55,1.000143,-10.039806,30,0_300_0
5731,1,0,300,242332.96,-1,1.000073,3242054.27,0.981974,0.994490,0.999544,43205.40,1.000308,2042.76,1.000273,-1.329780,30,0_300_1
5732,2,0,300,0.00,0,1.000193,4671376.00,1.000193,1.000193,0.999035,18971.00,1.001036,59688.26,0.999518,14.009476,30,0_300_2
5733,3,0,300,2914730.16,1,0.999870,41057776.66,1.003870,1.002279,0.999827,25569.50,1.000042,37897.50,0.999914,-4.339814,30,0_300_3
5734,4,0,300,3396923.02,1,0.998496,38356174.88,1.007894,1.007894,0.998357,14446.00,0.998703,2601.18,0.998650,10.850430,30,0_300_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [2]:
# Separate the label from the features. 'target' is the label; 'row_id' and
# 'time_id' are also left out of the feature matrix.
y = data['target']
X = data.drop(columns=['target', 'row_id', 'time_id'])

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): the split is a random shuffle, so rows sharing a date_id can
# land in both train and test — confirm that is acceptable for this data.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=0,
)

In [3]:
# Baseline: an XGBoost regressor trained with hand-picked (not tuned)
# hyperparameters, used as the reference point for the later models.
baseline_params = {
  'colsample_bytree': 0.3,
  'learning_rate': 0.1,
  'max_depth': 5,
  'alpha': 10,  # L1 regularisation; XGBoost documents alpha as an alias of reg_alpha
  'n_estimators': 10,
  'eval_metric': 'mae',
}
model1 = xgb.XGBRegressor(**baseline_params)
model1.fit(X_train, y_train)

In [4]:
# Score the baseline model on the held-out split and print its mean
# absolute error.
predictions1 = model1.predict(X_test)
mae1 = mean_absolute_error(y_true=y_test, y_pred=predictions1)
print("MAE1: %f" % mae1)

MAE1: 5.679939


In [5]:
# Use an exhaustive grid search with 5-fold cross-validation on the training
# split to find the best hyperparameters.
from sklearn.model_selection import GridSearchCV

# 3 * 3 * 3 * 2 = 54 candidate settings, each fitted once per fold.
param_grid = dict(
  max_depth=[3, 4, 5],
  learning_rate=[0.01, 0.05, 0.1],
  n_estimators=[100, 200, 300],
  colsample_bytree=[0.3, 0.7],
)

model2 = xgb.XGBRegressor(eval_metric='mae')
search = GridSearchCV(
  model2,
  param_grid,
  scoring='neg_mean_absolute_error',
  cv=5,
  n_jobs=1,
)
search.fit(X_train, y_train)

# best_score_ is the negated MAE (the scorer is 'neg_mean_absolute_error'),
# so values closer to zero are better.
print(search.best_params_)
print(search.best_score_)

{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
-5.584612473434407


In [8]:
# Final model: train on the training split with the chosen hyperparameters
# and report mean absolute error on the held-out split.
# NOTE(review): colsample_bytree, learning_rate and n_estimators match the
# best_params_ printed by the grid search, but max_depth=12 lies outside the
# searched grid (3, 4, 5) and alpha was not searched at all. Confirm how 12
# was chosen — if it was tuned against X_test, final_mae is an optimistic
# estimate.
final_params = {
  'colsample_bytree': 0.7,
  'learning_rate': 0.1,
  'max_depth': 12,
  'alpha': 10,
  'n_estimators': 300,
  'eval_metric': 'mae',
}
final_model = xgb.XGBRegressor(**final_params)
final_model.fit(X_train, y_train)

final_predictions = final_model.predict(X_test)
final_mae = mean_absolute_error(y_true=y_test, y_pred=final_predictions)
print("final_mae: %f" % final_mae)

final_mae: 5.206287
