In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import glob
import os
from tqdm import tqdm
from scipy.stats import pearsonr
import gresearch_crypto

In [None]:
# Create the competition environment handle. Later cells use it to obtain the
# test iterator (env.iter_test) and to submit predictions (env.predict).
env = gresearch_crypto.make_env()

In [None]:
# Print the full path of every file found anywhere below the Kaggle input folder.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# Read

In [None]:
# Full training history.
train_csv_path = '../input/g-research-crypto-forecasting/train.csv'
df_all = pd.read_csv(train_csv_path, low_memory=False)

# Hold-out set: a fixed-seed random draw of 20,000 rows from the supplemental file.
supplemental_csv_path = '../input/g-research-crypto-forecasting/supplemental_train.csv'
df_test = pd.read_csv(supplemental_csv_path, low_memory=False).sample(n=20000, random_state=42)

# Preprocessing

In [None]:
def _drop_unusable_rows(frame):
    """Return the rows of `frame` that have a non-null Target and a finite VWAP.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the 'Target' and 'VWAP' columns.

    Returns
    -------
    pd.DataFrame
        The subset of `frame` whose Target is present and whose VWAP is
        neither NaN nor +/-inf.
    """
    # np.isfinite is False for NaN, +inf and -inf, so one mask covers all three.
    usable = frame['Target'].notnull() & np.isfinite(frame['VWAP'])
    return frame[usable]


# The hold-out rows are pushed through the same scaler as the training rows,
# so they get exactly the same cleaning (previously only df_all had its
# NaN/inf VWAP rows removed).
df_all = _drop_unusable_rows(df_all)
df_test = _drop_unusable_rows(df_test)

In [None]:
# Distinct asset identifiers found in the training data, in ascending order.
unique_assets = np.sort(df_all['Asset_ID'].unique())

### Sample

In [None]:
# Build one fixed-seed random subset per asset so that every asset contributes
# (at most) the same number of rows to the training sample.
SAMPLE_ROWS_PER_ASSET = 100000

dfs_reduced = []
for i in tqdm(unique_assets):
    # Sorting first makes the seeded draw independent of the file's row order.
    asset_type = df_all[df_all['Asset_ID'] == i].sort_values('timestamp')
    # Sampling without replacement raises if more rows are requested than
    # exist, so cap the request at what this asset actually has.
    n_rows = min(SAMPLE_ROWS_PER_ASSET, len(asset_type))
    dfs_reduced.append(asset_type.sample(n_rows, random_state=42))

In [None]:
# Stack the per-asset samples into a single training frame. The original row
# labels are kept here (ignore_index is not set); later cells reset the index
# before combining columns.
df_reduced = pd.concat(dfs_reduced)

### One-Hot Encode

In [None]:
# Asset IDs laid out as single-column 2-D arrays, the shape handed to the encoder.
np_train = df_reduced['Asset_ID'].to_numpy().reshape(-1, 1)
np_test = df_test['Asset_ID'].to_numpy().reshape(-1, 1)

# Learn the category set from the training assets only.
encoder = OneHotEncoder(categories='auto').fit(np_train)

# One column label per learned category, in the encoder's own order.
asset_cols = [*encoder.categories_[0]]

In [None]:
# Dense one-hot matrices for the training and hold-out asset IDs.
encoded_train, encoded_test = (
    encoder.transform(asset_ids).toarray() for asset_ids in (np_train, np_test)
)

# Wrap each matrix in a frame labelled with the encoder's category columns.
df_encoded_train, df_encoded_test = (
    pd.DataFrame(dense, columns=asset_cols) for dense in (encoded_train, encoded_test)
)

### Min-Max Scaler

In [None]:
# Numeric input columns. These are the columns passed through the scaler and,
# together with the one-hot asset columns, they form the model inputs.
feats = ['Open', 'Close', 'High', 'Low', 'Volume', 'VWAP']

In [None]:
# Fit the min-max scaler on the numeric columns of the training sample only.
scaler = MinMaxScaler().fit(df_reduced[feats])
scaler

In [None]:
# Apply the fitted scaler to the training and hold-out numeric columns and
# re-wrap each result as a frame with the original feature names.
df_train_transformed, df_test_transformed = (
    pd.DataFrame(scaler.transform(frame[feats]), columns=feats)
    for frame in (df_reduced, df_test)
)

### Concat

In [None]:
# Column-wise assembly: one-hot asset columns, scaled features, then Target.
# Every piece gets a fresh positional index first so rows line up by position.
train_parts = [df_encoded_train, df_train_transformed, df_reduced['Target']]
df_train_pro = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)

test_parts = [df_encoded_test, df_test_transformed, df_test['Target']]
df_test_pro = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

# X and Y Split

In [None]:
# Model inputs are the one-hot asset columns followed by the scaled features.
model_inputs = asset_cols + feats

x_train, y_train = df_train_pro[model_inputs], df_train_pro['Target']
x_test, y_test = df_test_pro[model_inputs], df_test_pro['Target']

# Base Model

In [None]:
# Baseline regressor: squared-error objective, every other setting left at the
# library default.
xgb_reg = xgb.XGBRegressor(objective= "reg:squarederror")

In [None]:
# Train the baseline model on the prepared training inputs and targets.
xgb_reg.fit(x_train, y_train)

In [None]:
# Predict targets for the hold-out rows; scored against y_test in the next cell.
pred = xgb_reg.predict(x_test)

In [None]:
# Pearson correlation between the predictions and the true hold-out targets.
corr, _ = pearsonr(pred, y_test)
print('Correlation Score:', corr)

# RMSE computed as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and is slated for
# removal, so it is avoided here; arguments are passed as (y_true, y_pred).
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE:', rmse)

# Submit

In [None]:
# Iterator over the competition test batches; the submission loop unpacks each
# item as a (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

In [None]:
# Score every test batch with the same preprocessing used for training:
# one-hot asset columns first, then the min-max scaled numeric features.
for (test_df, sample_prediction_df) in iter_test:
    # one-hot encode
    np_asset_submit = np.array(test_df['Asset_ID']).reshape(-1,1)
    encoded_asset_encoded = encoder.transform(np_asset_submit).toarray()
    df_asset_encoded = pd.DataFrame(encoded_asset_encoded, columns=asset_cols)

    # min-max scale
    # The scaler raises on +/-inf but passes NaN through, and infinite rows
    # were dropped from the training data. Map infinities to NaN so a bad
    # value in a live batch cannot abort the whole submission.
    finite_feats = test_df[feats].replace([np.inf, -np.inf], np.nan)
    df_feats_transformed = pd.DataFrame(scaler.transform(finite_feats), columns = feats)

    # concat
    # Both frames were just built from arrays, so they already share the same
    # positional index and can be joined column-wise directly.
    input_row = pd.concat([df_asset_encoded, df_feats_transformed], axis=1)

    # NOTE(review): predictions are assigned by position, which assumes
    # sample_prediction_df rows are in the same order as test_df — confirm.
    sample_prediction_df['Target'] = xgb_reg.predict(input_row)
    env.predict(sample_prediction_df)