In [None]:
# Root folder of the competition dataset; every input file lives directly under it,
# so relocating the data only requires changing this one constant.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'

data_train = f'{INPUT_DIR}/train.csv'
data_asset_details = f'{INPUT_DIR}/asset_details.csv'
data_supplemental_train = f'{INPUT_DIR}/supplemental_train.csv'

In [None]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import datetime
import plotly.graph_objects as go

In [None]:
%%time
# Read the training CSV with explicit dtypes for the listed columns.
# Each column appears exactly once in the mapping (a repeated key in a dict
# literal is silently overridden by its last occurrence).
df_train = pd.read_csv(
    data_train,
    dtype={
        'Asset_ID': 'int8',
        'Count': 'int32',
        'row_id': 'int32',
        'Open': 'float64',
        'High': 'float64',
        'Low': 'float64',
        'Close': 'float64',
        'Volume': 'float64',
        'VWAP': 'float64',
    },
)
df_train.head()

In [None]:
import gresearch_crypto

In [None]:
# Asset metadata table, ordered by Asset_ID so later loops visit assets in ID order.
df_assets = (
    pd.read_csv(data_asset_details)
    .sort_values(by='Asset_ID')
)
df_assets.head()

In [None]:
# For each asset ID 0..13, report the share of rows whose Target is present.
for i in range(14):
    dfcrop = df_train.loc[df_train['Asset_ID'] == i]
    missing_targets = dfcrop['Target'].isnull().sum()
    pct_present = (1 - (missing_targets / len(dfcrop))) * 100
    print('Percentage of values not nan', pct_present)
  

In [None]:
import xgboost as xgb

def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    ``df`` must support ``df['High']``, ``df['Close']`` and ``df['Open']``;
    the comparison is element-wise via ``np.maximum``.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    ``df`` must support ``df['Close']``, ``df['Open']`` and ``df['Low']``;
    the comparison is element-wise via ``np.minimum``.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']


def get_features(df):
    """Build the model feature frame from a frame of candle columns.

    A copy of the ``Count``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume``
    and ``VWAP`` columns is extended, in this order, with:
    ``Upper_Shadow``, ``Lower_Shadow``, ``Close/Open``, ``Close-Open``,
    ``High-Low``, ``High/Low``, ``Mean`` and ``Median`` (the last two taken
    row-wise over Open/High/Low/Close).

    The input frame is not modified. Ratios are plain divisions, so a zero
    ``Open`` or ``Low`` produces inf/NaN in the result.
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    price_columns = ['Open', 'High', 'Low', 'Close']

    feats = df[base_columns].copy()
    open_, high, low, close = (feats[col] for col in price_columns)
    prices = feats[price_columns]

    # Insertion order of this mapping fixes the column order of the output.
    derived = {
        'Upper_Shadow': upper_shadow(feats),
        'Lower_Shadow': lower_shadow(feats),
        'Close/Open': close / open_,
        'Close-Open': close - open_,
        'High-Low': high - low,
        'High/Low': high / low,
        'Mean': prices.mean(axis=1),
        'Median': prices.median(axis=1),
    }
    for column_name, values in derived.items():
        feats[column_name] = values

    return feats

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Tuned XGBoost hyperparameters per Asset_ID, stored as
# (n_estimators, max_depth, learning_rate, subsample, colsample_bytree).
_ASSET_XGB_PARAMS = {
    0: (317, 8, 0.008967159857886885, 0.8074685834714562, 0.6156249507619749),
    1: (637, 13, 0.09253396014321574, 0.700624738784116, 0.73896289605807),
    2: (452, 8, 0.4167642609563461, 0.582006239504628, 0.6829989973347863),
    3: (452, 8, 0.4167642609563461, 0.5820062395046286, 0.6829989973347863),
    4: (471, 9, 0.05918488024797159, 0.7693819367697938, 0.7266084230952958),
    5: (452, 8, 0.4167642609563461, 0.582006239504628, 0.6829989973347863),
    6: (476, 8, 0.4202769113980745, 0.5563315209270074, 0.6951993738259458),
    7: (520, 15, 0.19184853364231427, 0.8869731830313443, 0.6855158027999262),
    8: (471, 9, 0.05918488024797159, 0.7693819367697938, 0.7266084230952958),
    9: (229, 9, 0.23016519709096778, 0.7928998128269837, 0.5299924747454009),
    10: (229, 9, 0.23016519709096778, 0.7928998128269837, 0.5299924747454009),
    11: (229, 9, 0.23016519709096778, 0.7928998128269837, 0.5299924747454009),
    12: (229, 9, 0.23016519709096778, 0.7928998128269837, 0.5299924747454009),
    13: (121, 6, 0.3062149270836522, 0.7361485971162751, 0.685244452888315),
}


def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit and return an XGBoost regressor for a single asset.

    Steps: select the rows of ``df_train`` whose ``Asset_ID`` equals
    ``asset_id``, build features with ``get_features``, attach ``Target`` as
    ``y``, discard every row containing NaN or +/-inf, then fit an
    ``xgb.XGBRegressor`` configured from ``_ASSET_XGB_PARAMS``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with an ``Asset_ID`` column, a ``Target`` column and
        the columns required by ``get_features``.
    asset_id : int
        Asset identifier; must be a key of ``_ASSET_XGB_PARAMS``.

    Returns
    -------
    xgb.XGBRegressor
        The fitted model.

    Raises
    ------
    ValueError
        If no hyperparameters are configured for ``asset_id``.
    """
    if asset_id not in _ASSET_XGB_PARAMS:
        raise ValueError(f"No XGBoost hyperparameters configured for asset_id={asset_id!r}")

    df = df_train[df_train["Asset_ID"] == asset_id]

    df_proc = get_features(df)
    df_proc['y'] = df['Target']

    # Keep only fully finite rows. A boolean row mask is used on purpose:
    # the filtered frame keeps df_train's original index labels, so row
    # *positions* (as returned by np.where) must not be passed to
    # DataFrame.drop, which interprets them as index *labels*.
    finite_rows = np.isfinite(df_proc).all(axis=1)
    df_proc = df_proc[finite_rows]

    X = df_proc.drop("y", axis=1)
    y = df_proc["y"]

    n_estimators, max_depth, learning_rate, subsample, colsample_bytree = _ASSET_XGB_PARAMS[asset_id]
    # NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build and
    # may be spelled differently in newer releases — confirm against the
    # installed version.
    model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        missing=-999,
        random_state=2022,
        tree_method='gpu_hist',
    )
    model.fit(X, y)
    return model

In [None]:
def get_Xy_and_model_for_asset_with_reset_index(df_train, asset_id):
    """Fit an XGBoost regressor for one asset, renumbering rows before cleaning.

    The asset's rows are converted to features with ``get_features`` and the
    ``Target`` column is attached as ``y``. The frame is then re-indexed to
    0..n-1 so that the row positions reported by ``np.where`` are also valid
    index labels for ``DataFrame.drop``. Rows containing +/-inf or NaN are
    removed, the index is reset again, and a fixed-hyperparameter
    ``xgb.XGBRegressor`` is fitted.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with ``Asset_ID``, ``Target`` and the columns required
        by ``get_features``.
    asset_id : int
        Asset identifier to select.

    Returns
    -------
    xgb.XGBRegressor
        The fitted model.
    """
    asset_rows = df_train.loc[df_train["Asset_ID"] == asset_id]

    frame = get_features(asset_rows)
    frame['y'] = asset_rows['Target']  # aligned on the original index, before renumbering
    frame = frame.reset_index(drop=True)

    inf_row_positions = np.where(np.isinf(frame))[0]
    frame = (
        frame
        .drop(labels=inf_row_positions, axis=0)
        .dropna(how="any")
        .reset_index(drop=True)
    )

    target = frame["y"]
    features = frame.drop("y", axis=1)

    hyperparams = dict(
        n_estimators=229,
        max_depth=9,
        learning_rate=0.23016519709096778,
        subsample=0.7928998128269837,
        colsample_bytree=0.5299924747454009,
        missing=-999,
        random_state=2022,
        tree_method='gpu_hist',
    )
    model = xgb.XGBRegressor(**hyperparams)
    model.fit(features, target)
    return model

In [None]:
%%time
# Xs / ys are not filled by this cell; they are kept as empty dicts in case
# other cells still reference the names.
Xs = {}
ys = {}
# asset_id -> fitted model, or None when training that asset failed.
models = {}

for asset_id, asset_name in zip(df_assets['Asset_ID'], df_assets['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    # Asset 10 is trained through the variant that resets the index before cleaning.
    if asset_id == 10:
        train_one_asset = get_Xy_and_model_for_asset_with_reset_index
    else:
        train_one_asset = get_Xy_and_model_for_asset
    try:
        models[asset_id] = train_one_asset(df_train, asset_id)
    except Exception as exc:
        # Best effort: keep training the remaining assets, but report the
        # failure. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the loop.
        print(f"  -> training failed for {asset_name} (ID={asset_id}): {exc!r}")
        models[asset_id] = None

In [None]:
import traceback

In [None]:
# Show the asset_id -> model mapping; an asset whose training raised is stored as None.
models

In [None]:
# Print each asset's feature importances. Assets without a fitted model
# (stored as None when training failed) are reported instead of raising
# AttributeError.
for asset_id, asset_name in zip(df_assets['Asset_ID'], df_assets['Asset_Name']):
    model = models.get(asset_id)
    if model is None:
        print(f"Feature importances for {asset_name:<16} (ID={asset_id:<2}) - unavailable (no trained model)")
        continue
    print(f"Feature importances for {asset_name:<16} (ID={asset_id:<2}) - {model.feature_importances_}")

In [None]:
%%time
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# For every test batch, predict row by row and write the result into df_pred.
# A row falls back to a prediction of 0 when its asset has no trained model
# or when feature building / prediction raises.
for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        y_pred = 0
        # .get() so an asset ID absent from `models` uses the fallback
        # instead of raising KeyError outside the try block.
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = get_features(pd.DataFrame([row]))
                y_pred = model.predict(x_test)[0]
            except Exception:
                # Keep the submission loop alive but make the failure visible;
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                traceback.print_exc()
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
    env.predict(df_pred)