In [None]:
from datetime import datetime
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import gc

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import gresearch_crypto
# Environment handle from the gresearch_crypto module; it supplies the test
# iterator here and receives the predictions via env.predict() further down.
env = gresearch_crypto.make_env()
# Iterated by the prediction loop, where each item unpacks to
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
def add_time_features(df):
    """Return a copy of ``df`` extended with calendar columns derived from its index.

    The index values are cast to ``datetime64[s]`` (integer index values are
    therefore read as seconds since the Unix epoch) and stored in a ``ds``
    column. From ``ds`` the columns ``month``, ``dayofweek``, ``dayofmonth``,
    ``dayofyear``, ``hour`` and ``minute`` are derived, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the timestamps. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns above added; columns that already exist
        under those names are overwritten.
    """
    stamps = df.index.values.astype('datetime64[s]')

    def _from_ds(field):
        # Deferred lookup: assign() evaluates callables in keyword order, so
        # the freshly written ``ds`` column is available when this runs.
        return lambda frame: getattr(frame['ds'].dt, field)

    return df.assign(
        ds=stamps,
        month=_from_ds('month'),
        dayofweek=_from_ds('dayofweek'),
        dayofmonth=_from_ds('day'),
        dayofyear=_from_ds('dayofyear'),
        hour=_from_ds('hour'),
        minute=_from_ds('minute'),
    )


def plot_ts(df, c='Target', n_cols = 4):
    """Plot column ``c`` of every asset in ``df`` on a grid of subplots.

    One panel is drawn per distinct ``Asset_ID``, filling the grid row by row.
    Each panel's legend shows the asset's name, looked up in the module-level
    ``asset_details_df`` frame (columns ``Asset_ID`` and ``Asset_Name``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Asset_ID`` column and the column named by ``c``.
    c : str, default 'Target'
        Name of the column to plot.
    n_cols : int, default 4
        Number of panels per grid row.

    Returns
    -------
    None
        The figure is created through pyplot and left to the active backend.
    """
    assets = df.Asset_ID.unique()
    if len(assets) == 0:
        # A zero-row grid is rejected by plt.subplots; there is nothing to draw.
        return

    n_rows = int(np.ceil(len(assets) / n_cols))

    # squeeze=False keeps the axes array two-dimensional even when the grid
    # has a single row or column; the default squeezing would hand back a 1-D
    # array (or a bare Axes) and break per-panel indexing.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)
    panels = axes.ravel()  # row-major: left to right, then top to bottom

    for panel, asset in zip(panels, assets):
        # Retrieve the name for this asset id; fall back to the id itself when
        # the details frame has no entry for it.
        names = asset_details_df.loc[asset_details_df.Asset_ID == asset, 'Asset_Name']
        asset_name = names.iloc[0] if len(names) else str(asset)

        # Restrict the frame to the rows of this asset.
        sub_df = df[df.Asset_ID == asset]

        panel.plot(sub_df[c], label=asset_name)
        panel.legend()

    # Hide the grid cells left over when the asset count does not fill the
    # last row.
    for panel in panels[len(assets):]:
        panel.set_visible(False)

In [None]:
# Location of the competition input files.
folder = '/kaggle/input/g-research-crypto-forecasting/'

# Read the four CSV files used by this notebook, in this order. The example
# sample submission file is not loaded.
train_df, asset_details_df, example_test_df, supplemental_train_df = (
    pd.read_csv(folder + csv_name)
    for csv_name in (
        'train.csv',
        'asset_details.csv',
        'example_test.csv',
        'supplemental_train.csv',
    )
)


In [None]:
# Show the first row of the example test file to inspect its columns.
example_test_df.head(1)

In [None]:
# Show the first row of the training data to inspect its columns.
train_df.head(1)

In [None]:
# Show the first row of the asset details table (used by plot_ts for asset names).
asset_details_df.head(1)

In [None]:
# Show the first row of the supplemental training data to inspect its columns.
supplemental_train_df.head(1)

In [None]:
# Only rows stamped on or after this date are kept for training.
cutoff_ds = '2021-01-01'

train_df = (
    train_df
    .set_index("timestamp")
    # Apply the cutoff straight from the index; add_time_features creates the
    # ``ds`` column afterwards, so no temporary copy of it is needed here.
    .pipe(lambda frame: frame[
        frame.index.values.astype('datetime64[s]') >= np.datetime64(cutoff_ds)
    ])
    .pipe(add_time_features)
)

In [None]:
# Rows usable for training: the target is present and VWAP is not +inf.
# np.inf replaces np.float('inf'): the np.float alias was removed in NumPy 1.24
# and raises AttributeError there.
train_mask = train_df.Target.notna() & (train_df.VWAP != np.inf)

# Features are every column except the raw datetime and the label.
X = train_df.loc[train_mask].drop(['ds', 'Target'], axis=1)
y = train_df.loc[train_mask, 'Target']

# LightGBM hyper-parameters, passed to the estimator as keyword arguments.
lgb_params = {
    "objective": "regression",
    "n_estimators" : 2000,
    "num_leaves" : 300,
    "learning_rate" : 0.09,
    "random_seed" : 50,
#     "device": 'gpu'
}

model = LGBMRegressor(**lgb_params)
model.fit(X, y)

In [None]:
# Serve predictions batch by batch through the competition environment.
for (test_df, sample_prediction_df) in iter_test:
    # Index by timestamp and derive the same calendar features as in training
    # (add_time_features creates ``ds`` itself).
    test_df = add_time_features(test_df.set_index("timestamp"))

    # Select exactly the columns the model was fitted on, in the same order.
    # Dropping known extras instead would let any additional or reordered
    # column in the test batch reach the model misaligned.
    test_features = test_df[X.columns]

    # NOTE(review): assumes sample_prediction_df rows are in the same order as
    # test_df rows — confirm against the API.
    sample_prediction_df['Target'] = model.predict(test_features)
    env.predict(sample_prediction_df)