In [None]:
import gresearch_crypto
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import xgboost as xgb
import traceback

In [None]:
# Seed passed to the models below so that runs are repeatable.
SEED = 137

# Kaggle input directory that holds the competition CSV files.
path = "/kaggle/input/g-research-crypto-forecasting/"

# Plain CSV loads; the generator is consumed by the tuple unpacking.
df_train, df_test, df_supp_train = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "example_test.csv", "supplemental_train.csv")
)

# Asset metadata, keyed by Asset_ID so it can be looked up by label.
df_asset_details = pd.read_csv(path + "asset_details.csv").set_index("Asset_ID")

# Competition API handle; the submission cell calls env.iter_test() and
# env.predict() on it.
env = gresearch_crypto.make_env()

In [None]:
# Display the asset metadata table (indexed by Asset_ID) as the cell output.
df_asset_details

In [None]:
# Columns handed to the models, in this order: the raw candle fields plus the
# two shadow features computed by `enrichment_features`. The commented entries
# are features that are currently switched off.
used_columns = ['Count', 'Open', 'High', 'Low', 'Close',
                'Volume', 'VWAP', 'upper_shadow', 'lower_shadow']
#                 'Close/Open',
#                 'spread', 'mean_trade', 'log_price_change', 'high_div_low',
#                 'trade', 'gtrade', 'shadow1', 'shadow3', 'shadow5', 'upper_shadow_log',
#                 'lower_shadow_log']

def enrichment_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return the model input columns for ``df`` with the shadow features added.

    Two derived features are computed:

    * ``upper_shadow`` = ``High - max(Close, Open)``
    * ``lower_shadow`` = ``min(Close, Open) - Low``

    Only label-based indexing is used, so the function accepts either a
    DataFrame of candles or a single row passed as a Series (which is how the
    submission loop calls it).

    The input is never modified: the features are written to a copy, so a
    filtered slice of a larger frame can be passed in without mutating it or
    triggering pandas' SettingWithCopyWarning.

    Args:
        df: Candle data containing at least ``Open``, ``High``, ``Low``,
            ``Close`` and the raw fields listed in ``used_columns``.

    Returns:
        A new object of the same kind as ``df`` restricted to ``used_columns``,
        in that order.

    Raises:
        KeyError: If a required column/label is missing from ``df``.
    """
    # Work on a private copy so the caller's data is left untouched.
    df = df.copy()

    df['upper_shadow'] = df['High'] - np.maximum(df['Close'], df['Open'])
    df['lower_shadow'] = np.minimum(df['Close'], df['Open']) - df['Low']
#     df["Close/Open"] = df["Close"] / df["Open"]
#     # df['hlco_ratio'] = (df['High'] - df['Low'])/(df['Close']-df['Open'])
#     df['spread'] = df['High'] - df['Low']
#     df['mean_trade'] = df['Volume']/df['Count']
#     df['log_price_change'] = np.log(df['Close']/df['Open'])
#     df["high_div_low"] = df["High"] / df["Low"]
#     df['trade']=df['Close']-df['Open']
#     df['gtrade']=df['trade']/df['Count']
#     df['shadow1']=df['trade']/df['Volume']
#     df['shadow3']=df['upper_shadow']/df['Volume']
#     df['shadow5']=df['lower_shadow']/df['Volume']
#     df['upper_shadow_log']=np.log(df['upper_shadow'])
#     df['lower_shadow_log']=np.log(df['lower_shadow'])

    return df[used_columns]

Validation test

In [None]:
# from scipy.stats import pearsonr

# def log(model,X_train, X_valid, y_train, y_valid,train_split=1.0):
#     if train_split > 0:
#         X_train=X_train[:int(train_split*X_train.shape[0])]
#         y_train=y_train[:int(train_split*y_train.shape[0])]
    
#         pred=model.predict(X_train)
#         print('Training :- ')
#         print(f'MSE : {np.mean((y_train-pred)**2)}')
#         print(f'CV : {pearsonr(pred,y_train)[0]}')
#     pred=model.predict(X_valid)
#     print('Validation :- ')
#     print(f'MSE : {np.mean((y_valid-pred)**2)}')
#     print(f'CV : {pearsonr(pred,y_valid)[0]}')

In [None]:
# def validation(df_train, asset_id):
#     asset_name = df_asset_details.iloc[asset_id]["Asset_Name"]
#     df = df_train[df_train["Asset_ID"] == asset_id]
   
#     currency_df = df_train[df_train["Asset_ID"] == asset_id]
#     currency_df = currency_df.dropna()
#     currency_y = currency_df["Target"]
#     currency_x = enrichment_features(currency_df)
    
#     X_train=currency_x[:int(0.7*currency_x.shape[0])]
#     y_train=currency_y[:int(0.7*currency_y.shape[0])]
#     X_test=currency_x[int(0.7*currency_x.shape[0]):]
#     y_test=currency_y[int(0.7*currency_x.shape[0]):]

#     model = LGBMRegressor(**params)
#     model.fit(X_train, y_train)
#     print(f'[Finished Training] evaluating for {asset_name}')
#     log(model,X_train, X_test, y_train, y_test,0.3)

In [None]:
# validation(df_train, 2)

LGBM

1000

[Finished Training] evaluating for Bitcoin
Training :- 
MSE : 1.8713016691819038e-05
CV : 0.47574778046270555
Validation :- 
MSE : 1.7527629595927818e-05
CV : 0.004341025230372545

3000

[Finished Training] evaluating for Bitcoin
Training :- 
MSE : 1.266277321245002e-05
CV : 0.7357600141817864
Validation :- 
MSE : 1.9788559920537545e-05
CV : 0.002839122533392276

5000

[Finished Training] evaluating for Bitcoin
Training :- 
MSE : 7.1893931407124685e-06
CV : 0.8870647218100292
Validation :- 
MSE : 1.890073088293228e-05
CV : 0.0017710093865425422

Train and submit

In [None]:
# Keyword settings for a LightGBM regressor. The training code visible in this
# notebook builds XGBoost models and does not read this dict.
lgbm_params = {
    "n_estimators": 5000, "num_leaves" : 300, "learning_rate" : 0.09,
    "random_seed" : SEED, "n_jobs": 4,
}

def init_model(x, y):
    """Create an XGBoost regressor with the notebook's fixed settings and fit it.

    Args:
        x: Feature matrix passed straight to ``XGBRegressor.fit``.
        y: Target values passed straight to ``XGBRegressor.fit``.

    Returns:
        The fitted ``xgb.XGBRegressor`` instance.
    """
    hyper_params = dict(
        n_estimators=500,
        max_depth=15,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.7,
        missing=-999,
        random_state=SEED,
        tree_method='gpu_hist',  # GPU histogram algorithm
    )

    regressor = xgb.XGBRegressor(**hyper_params)
    regressor.fit(x, y)
    return regressor

In [None]:
# One fitted model per asset, keyed by Asset_ID. An asset with no usable
# training rows is stored as None, which the submission loop treats as
# "predict 0".
models = {}

# df_asset_details is indexed by Asset_ID, so walk its index labels and look the
# name up with label-based .loc. Positional .iloc[asset_id] returns the name of
# whatever row happens to sit at that position, which is a different asset
# whenever the CSV is not sorted by Asset_ID.
for asset_id in df_asset_details.index:
    asset_name = df_asset_details.loc[asset_id, "Asset_Name"]

    # Candles of this asset only, without rows that have any missing value
    # (including a missing Target).
    currency_df = df_train[df_train["Asset_ID"] == asset_id].dropna()

    if currency_df.empty:
        # Nothing to fit on: record the gap instead of failing inside fit().
        models[asset_id] = None
        print(f"{asset_name} has no complete training rows; no model was fitted.")
        continue

    currency_y = currency_df["Target"]
    currency_x = enrichment_features(currency_df)

    models[asset_id] = init_model(currency_x, currency_y)

    print(f"{asset_name} model was initialized.")

In [None]:
iter_test = env.iter_test()

# Each iteration yields a batch of test rows and the matching submission frame.
# Note that this rebinds the module-level `df_test` that was loaded from
# example_test.csv earlier in the notebook.
for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        # Rows of the submission frame that belong to this test row.
        is_this_row = df_pred['row_id'] == row['row_id']

        # Fallback prediction, used when there is no model for the asset or
        # when feature building / prediction fails.
        y_pred = 0

        # .get() instead of indexing: an Asset_ID without an entry in `models`
        # falls back to 0 rather than aborting the submission with a KeyError.
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = enrichment_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
            except Exception:
                # Best effort: keep the submission loop alive and log the
                # failure. `Exception` (not a bare except) lets
                # KeyboardInterrupt / SystemExit still stop the run.
                y_pred = 0
                traceback.print_exc()

        df_pred.loc[is_this_row, 'Target'] = y_pred

    env.predict(df_pred)