In [None]:
import numpy as np
import pandas as pd
import gresearch_crypto
from lightgbm import LGBMRegressor
import datetime as dt

# Create the competition environment object via the gresearch_crypto module.
env = gresearch_crypto.make_env()
# Iterator over test batches; it is consumed by the prediction loop further down.
iter_test = env.iter_test()
# Seed passed as `random_state` to the LightGBM regressor built in model_train.
SEED = 202112

In [None]:
# Feature columns, in the exact order returned by feature_engneering.
# 'Mean' and 'log_price_change' are currently disabled.
cols_used = ['Asset_ID', 'Count', 'Open', 'Close', 'Volume', 'VWAP',
             'weekday', 'hour', 'upper_shadow', 'lower_shadow',
             "Close/Open", 'hlco_ratio', 'spread', 'mean_trade',
             'Count_g', 'Volume_g', 'VWAP_g']


def feature_engneering(df):
    """Derive the candle-based features and return the columns in `cols_used`.

    The input frame is NOT modified; all derived columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an index exposing `.weekday` and `.hour` (e.g. a
        DatetimeIndex), the raw columns 'Count', 'Open', 'High', 'Low',
        'Close', 'Volume', and every non-derived column named in `cols_used`
        ('Asset_ID', 'VWAP', 'Count_g', 'Volume_g', 'VWAP_g').

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns of `cols_used`, in that order.

    Raises
    ------
    KeyError
        If a required raw column is missing from `df`.

    Notes
    -----
    The arithmetic is done on float64 views of the raw columns so that
    object-dtype input (e.g. frames assembled row by row) behaves like float
    input: a zero denominator (`Close == Open`, or `Count == 0`) yields
    inf/NaN instead of raising ZeroDivisionError.
    """
    # Work on float64 copies of the raw inputs; the raw columns themselves are
    # passed through untouched so their dtypes stay whatever the caller had.
    open_ = df['Open'].astype(np.float64)
    high = df['High'].astype(np.float64)
    low = df['Low'].astype(np.float64)
    close = df['Close'].astype(np.float64)
    volume = df['Volume'].astype(np.float64)
    count = df['Count'].astype(np.float64)

    # Copy only the pass-through columns so the caller's frame is never mutated.
    features = df[[col for col in cols_used if col in df.columns]].copy()

    # Calendar features taken from the index.
    features['weekday'] = df.index.weekday
    features['hour'] = df.index.hour

    # Candle wicks: distance from the body (open/close) to the high and low.
    features['upper_shadow'] = high - np.maximum(close, open_)
    features['lower_shadow'] = np.minimum(close, open_) - low

    features["Close/Open"] = close / open_
    # Full range relative to the body; inf/NaN when the body is zero.
    features['hlco_ratio'] = (high - low) / (close - open_)
    features['spread'] = high - low
    # Average volume per trade.
    features['mean_trade'] = volume / count

    # Selecting by cols_used fixes the column order and raises KeyError if a
    # required raw column was absent from the input.
    return features[cols_used]

def model_train(df_X, df_y):
    """Fit a LightGBM regressor on the given features and target.

    Parameters
    ----------
    df_X : feature matrix passed to `fit` as the training inputs.
    df_y : target values passed to `fit` as the training labels.

    Returns
    -------
    tuple
        `(df_X, df_y, model)`: the two inputs unchanged, followed by the
        fitted `LGBMRegressor`.
    """
    # The boosting='dart' option is left disabled.
    hyper_params = dict(
        n_estimators=800,
        learning_rate=0.05,
        max_bin=15,
        num_leaves=4,
        random_state=SEED,  # module-level seed
    )
    model = LGBMRegressor(**hyper_params)
    model.fit(df_X, df_y)
    return df_X, df_y, model

In [None]:
# Load the raw training data.
df = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv')
#asset_dt = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv')

# Turn the epoch-second timestamp into a datetime index named 'datetime' and
# keep a "YYYY-MM-DD" string column as the daily grouping key.
timestamps = pd.to_datetime(df['timestamp'], unit='s')
df['datetime_g'] = timestamps.dt.strftime("%Y-%m-%d")
df = df.drop('timestamp', axis=1)
df.index = pd.Index(timestamps, name='datetime')

# Keep January-May 2021 only, and only rows that have a target.
in_window = (df.index.year == 2021) & (df.index.month < 6)
df = df.loc[in_window].dropna(subset=["Target"])

# Per-asset, per-day means of Count / Volume / VWAP, stored with the day string
# as index and 'Asset_ID' as a regular column.
df_g = (
    df.groupby(by=["Asset_ID", 'datetime_g'])
      .mean()
      .loc[:, ['Count', 'Volume', 'VWAP']]
      .rename(columns={'Count': 'Count_g', 'Volume': 'Volume_g', 'VWAP': 'VWAP_g'})
      .reset_index()
      .set_index('datetime_g')
)

# merge() discards the datetime index, so remember it and put it back after
# attaching the daily aggregates to every row.
df_index = df.index
df = df.merge(df_g, how='left', on=["Asset_ID", 'datetime_g'])
df.index = df_index

# Build the model inputs, then release the large intermediate frame.
df_train_X = feature_engneering(df)
df_train_y = df['Target']
del df

In [None]:
# Fit the regressor; model_train returns its two inputs unchanged plus the fitted model.
X, y, model = model_train(df_train_X, df_train_y)

In [None]:
# import time
# time_start = time.time()

# The daily per-asset aggregates computed from the training data do not change
# between batches, so build one date-indexed lookup table per known asset once,
# instead of re-filtering and re-parsing dates for every test row.
agg_cols = ['Count_g', 'Volume_g', 'VWAP_g']
raw_cols = ['Count', 'Volume', 'VWAP']
col_int = ['Asset_ID', 'weekday', 'hour']

daily_by_asset = {}
for asset_id, asset_daily in df_g.groupby('Asset_ID'):
    asset_daily = asset_daily[agg_cols].copy()
    asset_daily.index = pd.to_datetime(asset_daily.index, format="%Y-%m-%d")
    # A nearest-date lookup needs a monotonic index.
    daily_by_asset[asset_id] = asset_daily.sort_index()

for i, (df_test, df_pred) in enumerate(iter_test):
    df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
    df_test['datetime_g'] = df_test['datetime'].dt.strftime("%Y-%m-%d")

    # Calendar day of every test row, used to find the closest training day.
    test_days = pd.DatetimeIndex(df_test['datetime']).normalize()
    asset_ids = df_test['Asset_ID'].to_numpy()

    # Default for assets never seen in training: fall back to the row's own
    # Count / Volume / VWAP. np.array copies, so df_test is not written to.
    agg_values = np.array(df_test[raw_cols], dtype=np.float64)

    for asset_id in pd.unique(asset_ids):
        asset_daily = daily_by_asset.get(asset_id)
        if asset_daily is None:
            print('New Asset')
            continue
        rows = asset_ids == asset_id
        # get_indexer(..., method='nearest') is the vectorised replacement for
        # the removed Index.get_loc(..., method='nearest').
        nearest = asset_daily.index.get_indexer(test_days[rows], method='nearest')
        agg_values[rows] = asset_daily.to_numpy()[nearest]

    batch = df_test.set_index('datetime').drop('timestamp', axis=1)
    for pos, agg_col in enumerate(agg_cols):
        batch[agg_col] = agg_values[:, pos]

    features = feature_engneering(batch)

    # Cast explicitly with astype: assigning through .loc[:, cols] does not
    # reliably change column dtypes.
    col_float = features.columns.drop(col_int).to_list()
    dtype_map = {**dict.fromkeys(col_float, np.float64), **dict.fromkeys(col_int, int)}
    features = features.astype(dtype_map)

    y_pred = model.predict(features)
    # Predictions are assigned positionally, i.e. this relies on df_pred rows
    # being in the same order as df_test rows -- TODO confirm against the API.
    df_pred['Target'] = y_pred
    df_pred['Target'] = df_pred['Target'].fillna(0)
    #print(df, df_pred)
    env.predict(df_pred)
    
# time_end = time.time()
# print('Time cost = %fs' % (time_end - time_start))

In [None]:
# Display the predictions from the last test batch handled by the loop above.
y_pred

In [None]:
# for df_test, df_pred in iter_test:
#     df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
#     df_test['datetime_g'] = df_test['datetime'].dt.strftime("%Y-%m-%d")
#     for _, row in df_test.iterrows():
#         row_ID = row['Asset_ID']
#         try:
#             df_temp = df_g[df_g['Asset_ID'] == row_ID]
#         except:
#             df_temp = df_g[df_g['Asset_ID'] == 1]
            
#         # closest_train_sample = df.iloc[df.index.get_loc(row['datetime'], method='nearest')]
#         df_temp.index = pd.to_datetime(df_temp.index, format = "%Y-%m-%d")
#         row_time = pd.to_datetime(row['datetime_g'], format = "%Y-%m-%d")
#         row = pd.DataFrame(row).T
#         try:
#             closest_train_sample = df_temp.iloc[df_temp.index.get_loc(row_time, method='nearest')]
#             row.loc[:, 'Count_g'],  row.loc[:, 'Volume_g'], row.loc[:, 'VWAP_g'] = closest_train_sample['Count_g'], closest_train_sample['Volume_g'], closest_train_sample['VWAP_g']
#         except:
#             #print('New Asset')
#             row.loc[:, 'Count_g'],  row.loc[:, 'Volume_g'], row.loc[:, 'VWAP_g'] = row['Count'],  row['Volume'], row['VWAP']
#     row = row.set_index('datetime').drop('timestamp', axis=1)
#     row = feature_engneering(row)
#     col_float = row.columns.drop(['Asset_ID', 'weekday', 'hour']).to_list()
#     col_int = ['Asset_ID', 'weekday', 'hour']
#     row.loc[:, col_float] = row[col_float].astype(np.float64)
#     row.loc[:, col_int] = row[col_int].astype(int)
#     df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = model.predict(row)
#     df_pred['Target'] = df_pred['Target'].fillna(0)
#     env.predict(df_pred)