In [3]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr

In [2]:
# import sys
# sys.path.append('g-research-crypto-forecasting')
# import gresearch_crypto

In [3]:
# from ctypes import cdll
# gresearch_crypto =cdll.LoadLibrary('../gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so')

# <span style='color:blue'>Read In Data</span>

In [4]:
df_ethereum = pd.read_csv('../Data/partitioned_coins/Ethereum.csv').drop('Unnamed: 0', axis=1)

In [5]:
df_ethereum.head(3)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Datetime
0,1514764860,6,173.0,738.3025,746.0,732.51,738.5075,335.987856,738.839291,-0.004809,2017-12-31 16:01:00
1,1514764920,6,192.0,738.5075,745.14,732.49,738.26,232.793141,738.268967,-0.004441,2017-12-31 16:02:00
2,1514764980,6,120.0,738.3325,745.12,730.0,737.5025,174.138031,737.994457,-0.004206,2017-12-31 16:03:00


# <span style='color:blue'>Train-Test Split</span>

In [6]:
# remove all null
df_ethereum = df_ethereum[df_ethereum['Target'].notnull()]

In [7]:
df_reduced = df_ethereum[len(df_ethereum)-8000:]

len_df = len(df_reduced)
thresh = int(np.ceil(len_df)*.7)

# train
x_train = df_reduced[:thresh][['Open', 'Close', 'High', 'Low', 'Volume', 'VWAP']]
y_train = np.array(df_reduced[:thresh]['Target']).reshape(-1,1)

# test
x_test = df_reduced[thresh:][['Open', 'Close', 'High', 'Low', 'Volume', 'VWAP']]
y_test = np.array(df_reduced[thresh:]['Target']).reshape(-1,1)

# <span style='color:blue'>Scale Output</span>

In [8]:
scaler = MinMaxScaler()

y_train_scaled = scaler.fit_transform(y_train)

y_test_scaled = scaler.transform(y_test)

# <span style='color:blue'>Model Training</span>

In [12]:
xgb_reg = xgb.XGBRegressor(objective= "reg:squarederror")

### Baseline Model

In [23]:
xgb_reg.fit(x_train, y_train_scaled)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
predictions = xgb_reg.predict(x_test)

In [25]:
predictions = scaler.inverse_transform(predictions.reshape(-1,1))

In [26]:
np.sqrt(mean_squared_error(predictions, y_test))

0.0018283844101351927

### Tuned Model

In [13]:
parameters = {
    'colsample_bytree': [0.3, 0.7],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'n_estimators': [100],
    'subsample': [0.2, 0.5, 0.8],
    'max_depth': [2, 3, 5]
}

In [14]:
gridsearch = GridSearchCV(estimator=xgb_reg, param_grid=parameters, scoring='neg_mean_squared_error',
                    cv=5, n_jobs=-1)

In [15]:
gridsearch.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parame

In [16]:
predictions = gridsearch.predict(x_test)

In [17]:
np.sqrt(mean_squared_error(predictions, y_test))

0.001694561367769246