In [30]:
# basics
from datetime import datetime
import time
import os 
import random
# import gresearch_crypto

# plotting
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

# numerics and ML
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as stats
# models
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor


# Directory prefix (relative to the notebook) from which train.csv and
# asset_details.csv are read; it is concatenated directly with the file
# name, so it must end with a path separator.
data_root = "../data/"
# Seed handed to fix_all_seeds().
SEED = 2021
# When True, training rows dated 2021-06-13 or later are dropped before
# any model is fitted.
REMOVE_LB_TEST_OVERLAPPING_DATA = True

In [32]:
def fix_all_seeds(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: CPython reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here does not change hashing in the already-running kernel;
    it is only inherited by processes started afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

# Seed the RNGs once, up front, before any data is loaded or any model is fit.
fix_all_seeds(SEED)

In [33]:
# Load the training rows and the asset lookup table (Asset_ID / Asset_Name)
# from data_root, then preview the first training rows.
train = pd.read_csv(f"{data_root}train.csv")
asset_d = pd.read_csv(f"{data_root}asset_details.csv")
train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


# Keep only the rows dated before the leaderboard (LB) test period

In [34]:
if REMOVE_LB_TEST_OVERLAPPING_DATA:
    # Convert the epoch-second timestamps to datetimes, then keep only the
    # rows that fall strictly before the cutoff date.
    train["datetime"] = pd.to_datetime(train["timestamp"], unit="s")
    is_before_cutoff = train["datetime"] < "2021-06-13 00:00:00"
    train = train[is_before_cutoff]


# training

In [35]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return High minus the larger of Close and Open.

    ``df`` may be anything indexable by the labels 'High', 'Close' and
    'Open' (e.g. a DataFrame, giving a Series, or a single row, giving a
    scalar).
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open, minus Low.

    ``df`` may be anything indexable by the labels 'Low', 'Close' and
    'Open' (e.g. a DataFrame, giving a Series, or a single row, giving a
    scalar).
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def get_features(df):
    """Build the model input features from raw candle columns.

    Selects Count, Open, High, Low, Close, Volume and VWAP from ``df`` (a
    copy, so the caller's data is not modified) and appends two derived
    entries, ``Upper_shadow`` and ``Lower_shadow``.

    Returns an object of the same kind as the label-based selection on
    ``df`` produces (a DataFrame for a DataFrame, a Series for a row).
    """
    base_columns = ["Count", "Open", "High", "Low", "Close", "Volume", "VWAP"]
    features = df[base_columns].copy()
    features["Upper_shadow"] = upper_shadow(features)
    features["Lower_shadow"] = lower_shadow(features)
    return features

def Xy_model_asset(train, asset_id):
    """Fit a LightGBM regressor on the rows of a single asset.

    Parameters
    ----------
    train : DataFrame with at least ``Asset_ID``, ``Target`` and the
        columns consumed by ``get_features``.
    asset_id : value compared against ``train["Asset_ID"]`` to select rows.

    Returns
    -------
    (X, y, model) : the feature frame, the target series (both with any
        row containing a missing value removed) and the fitted model.
    """
    asset_rows = train[train["Asset_ID"] == asset_id]

    # todo : try different features here
    #        also, scale the features
    table = get_features(asset_rows)
    table["y"] = asset_rows["Target"]
    # Drop every row where any feature or the target is missing.
    table = table.dropna(how="any")
    X = table.drop(columns="y")
    y = table["y"]

    # todo : try different models here
    model = LGBMRegressor(n_estimators=10)
    model.fit(X, y)
    return X, y, model


# Loop over all assets

In [36]:
# Per-asset training artefacts, each keyed by Asset_ID.
Xs, ys, models = {}, {}, {}

asset_pairs = zip(asset_d["Asset_ID"], asset_d["Asset_Name"])
for asset_id, asset_name in asset_pairs:
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y, model = Xy_model_asset(train, asset_id)
    Xs[asset_id] = X
    ys[asset_id] = y
    models[asset_id] = model



Training model for Bitcoin Cash     (ID=2 )
Training model for Binance Coin     (ID=0 )
Training model for Bitcoin          (ID=1 )
Training model for EOS.IO           (ID=5 )
Training model for Ethereum Classic (ID=7 )
Training model for Ethereum         (ID=6 )
Training model for Litecoin         (ID=9 )
Training model for Monero           (ID=11)
Training model for TRON             (ID=13)
Training model for Stellar          (ID=12)
Training model for Cardano          (ID=3 )
Training model for IOTA             (ID=8 )
Training model for Maker            (ID=10)
Training model for Dogecoin         (ID=4 )


In [37]:
# Sanity check: predict one training row with the model trained for that
# row's own asset (instead of hard-coding the model ID, which is only right
# while row 1 happens to belong to asset 0).
sample = train.iloc[[1]]  # double brackets keep a one-row DataFrame
sample_asset_id = sample["Asset_ID"].iloc[0]
# Passing a DataFrame keeps the same column names the model was fitted on.
x = get_features(sample)
y_pred = models[sample_asset_id].predict(x)
y_pred[0]

  X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)


9.477576309595709e-06

In [28]:
# we can only run this on kaggle
import gresearch_crypto

# Every test batch received from the API, kept for later inspection.
all_df_test = []

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for df_test, df_pred in iter_test:
    # Predict all rows of an asset with a single batched call instead of one
    # predict() call (and one full scan of df_pred) per row.
    pred_by_row_id = {}
    for asset_id, asset_rows in df_test.groupby("Asset_ID"):
        asset_pred = models[asset_id].predict(get_features(asset_rows))
        pred_by_row_id.update(zip(asset_rows["row_id"], asset_pred))

    # Write predictions back by row_id; rows of df_pred that received no
    # prediction keep whatever Target value they already had.
    df_pred["Target"] = df_pred["row_id"].map(pred_by_row_id).fillna(df_pred["Target"])

    all_df_test.append(df_test)

    # submit
    env.predict(df_pred)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,datetime
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,2018-01-01 00:01:00
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,2018-01-01 00:01:00
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,2018-01-01 00:01:00
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,2018-01-01 00:01:00
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264,2018-01-01 00:01:00
