In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

# load in the dataset
data = pd.read_csv('../../train.csv')
print(data.shape)
data.dropna(
  subset=[
    'stock_id', 
    'date_id', 
    'seconds_in_bucket', 
    'imbalance_size', 
    'imbalance_buy_sell_flag',
    'reference_price',
    'matched_size',
    'far_price',
    'near_price',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
    'target',
  ], 
  inplace=True
)
print(data.shape)
data

(5237980, 17)
(2343638, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
5730,0,0,300,0.00,0,1.000241,26670118.88,1.000241,1.000241,1.000026,19319.31,1.000241,16149.55,1.000143,-10.039806,30,0_300_0
5731,1,0,300,242332.96,-1,1.000073,3242054.27,0.981974,0.994490,0.999544,43205.40,1.000308,2042.76,1.000273,-1.329780,30,0_300_1
5732,2,0,300,0.00,0,1.000193,4671376.00,1.000193,1.000193,0.999035,18971.00,1.001036,59688.26,0.999518,14.009476,30,0_300_2
5733,3,0,300,2914730.16,1,0.999870,41057776.66,1.003870,1.002279,0.999827,25569.50,1.000042,37897.50,0.999914,-4.339814,30,0_300_3
5734,4,0,300,3396923.02,1,0.998496,38356174.88,1.007894,1.007894,0.998357,14446.00,0.998703,2601.18,0.998650,10.850430,30,0_300_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [2]:
# split into test and train for model fitting
X = data.drop(['target', 'row_id', 'time_id'], axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
# split into a smaller subset for hyperparameter tuning
tune_data = data.sample(
  n=int(0.0001*len(data)), 
  random_state=0
)
X_tune = tune_data.drop(['target', 'row_id', 'time_id'], axis=1)
y_tune = tune_data['target']
X_tune_train, X_tune_test, y_tune_train, y_tune_test = train_test_split(X_tune, y_tune, test_size=0.2, random_state=0)

In [5]:
# random forest tree
from sklearn.ensemble import RandomForestRegressor
random_forest_search = RandomForestRegressor(criterion="absolute_error", random_state=0)
"""
n_estimators: number of trees in the forest
max_features: number of features to consider when looking for the best split
"""
parameters = {
  "n_estimators": [100, 300, 500, 700, 900],
  "max_features": [0.2, 0.4, 0.6, 0.8, 1.0],
}
random_forest_search = GridSearchCV(
  estimator=random_forest_search,
  param_grid=parameters,
  scoring='neg_mean_absolute_error',
  n_jobs=1,
  cv=5,
)
random_forest_search.fit(X_tune_train, y_tune_train)
print(random_forest_search.best_params_)
print(random_forest_search.best_score_)

{'max_features': 0.2, 'n_estimators': 100}
-5.36325392492532


In [None]:
# refit on full data
random_forest = RandomForestRegressor(
  criterion="absolute_error", 
  random_state=0,
  n_estimators=100,
  max_features=0.2,
)
random_forest.fit(X_train, y_train)
random_forest_predictions = random_forest.predict(X_test)
random_forest_mae = mean_absolute_error(y_test, random_forest_predictions)
print("random_forest_mae: %f" % (random_forest_mae))