In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import gc
from datetime import datetime

directory = '/kaggle/input/g-research-crypto-forecasting/'
file_path = os.path.join(directory, 'train.csv')
dtypes = {
    'timestamp': np.int64,
    'Asset_ID': np.int8,
     'Count': np.int32,
     'Open': np.float64,
     'High': np.float64,
     'Low': np.float64,
    'Close': np.float64,
     'Volume': np.float64,
     'VWAP': np.float64,
    'Target': np.float64,
}

data = pd.read_csv(file_path, dtype=dtypes, usecols=list(dtypes.keys()))
data.set_index('timestamp', inplace = True)
data_recent = data.loc[datetime.timestamp(datetime(2021,1,1)):datetime.timestamp(datetime(2021,6,13))]

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

def prepare_df(df, with_target = True):
    df_feat = df[['Count', 'Close', 'Volume']].copy()
    df_feat.columns = [col_name + '15' for col_name in df_feat.columns]
    df_feat = df_feat.diff(15)

    df = pd.concat([df_feat, df], axis = 1) 
    df.dropna(inplace = True)

    X = df[['Count15', 'Close15', 'Volume15']]
    if with_target:
        y = df['Target']
    else:
        y = None
    return X, y, df.index

def get_model(df_train):
    X_train, y_train, idx = prepare_df(df_train)
    model = GradientBoostingRegressor(n_estimators = 10)
    model.fit(X_train, y_train)
    return model

def make_prediction(df_pred, model):
    X_train, _, idx = prepare_df(df_pred, with_target = False)
    y_pred = model.predict(X_train)
    return(idx, y_pred)

ids = data_recent.Asset_ID.unique()
models = {}

for id_ in ids:
    df_train = data_recent[data_recent.Asset_ID == id_]
    models[id_] = get_model(df_train)
    print(id_)

In [None]:
# data up to the start from the test data
data_memory = data_recent.drop(['Target'], axis = 1)
data_memory = data_memory.iloc[-10000:-14]

In [None]:
import gresearch_crypto
from datetime import datetime

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

start_time = datetime.now()
for df_test, df_pred in iter_test:

    # add new data and memorize how much data we need to drop from the beginning
    data_to_drop = len(df_test)
    data_memory.loc[:, 'row_id'] = np.nan

    # set timestamp index for new test data
    df_test.set_index('timestamp', inplace = True)
    data_memory = pd.concat([data_memory, df_test], axis = 0)
    
    # for each id ...
    for id_ in ids:

        # step two, a asset specific id with row_ids for which targets are needed
        need_pred = df_test[df_test.Asset_ID ==  id_]

        # get asset data for prediction
        data_memory_id = data_memory[data_memory.Asset_ID == id_]
        # get predictions from the model
        idx, y_pred = make_prediction(data_memory_id.drop(['row_id'], axis = 1), models[id_])
        # make dataframe for asset specific predictions
        df_pred_id = pd.DataFrame(index = idx, data = y_pred, columns = ['Target'])

        # reduce all asset predictions to the ones we need for that asset
        need_pred = need_pred.merge(df_pred_id, left_index = True, right_index = True)[['row_id', 'Target']]

        try:
            all_current_preds = pd.concat([all_current_preds, need_pred], axis = 0)
        except:
            all_current_preds = need_pred.copy()

    df_pred = df_pred.drop(['Target'], axis = 1).merge(all_current_preds, on = 'row_id', how = 'left')
    del all_current_preds
    #print(df_pred)
    
    # Send submissions
    env.predict(df_pred)
    
    data_memory = data_memory.iloc[data_to_drop:, :]
    data_memory.drop(['row_id'], axis = 1, inplace = True)
    
time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))