## Crypto Prediction, XGB Regressor using polars dataframes library

Polars is an alternative to Pandas that is significantly faster. See H2O's benchmark [here](https://h2oai.github.io/db-benchmark/).

The predictions for this competition take a long time, and if there are too many features the submission times out. Using the polars library speeds up the dataframe generation in the submission loop, so more features can be used.

I also created a Kaggle dataset with the python wheel to install polars offline: https://www.kaggle.com/rluethy/polars-fast-dataframe-library

Acknowledgements:

https://github.com/pola-rs/polars

https://www.pola.rs/

### Install polars library

In [None]:
# Install polars from the wheel files in the attached Kaggle dataset directory.
# typing_extensions is installed first (presumably a polars dependency - confirm).
!pip install ../input/polars-fast-dataframe-library/typing_extensions-4.0.1-py3-none-any.whl
!pip install ../input/polars-fast-dataframe-library/polars-0.12.7-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl

In [None]:
import os
import sys
import datetime
import time
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from IPython.core.display import display
import traceback
import xgboost as xgb

import gresearch_crypto

### Read training data

In [None]:
# Locations of the competition training data and the asset metadata.
train_csv = '/kaggle/input/g-research-crypto-forecasting/train.csv'
asset_csv = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

# Load the full training set with polars and order it chronologically.
df = pl.read_csv(train_csv)
df = df.sort("timestamp")
print(df.shape)


def totimestamp(s):
    """Convert a "dd/mm/YYYY" date string to epoch seconds at midnight UTC.

    The date is interpreted as UTC so that the split point does not depend
    on the local timezone of the machine running the notebook.
    """
    parsed = datetime.datetime.strptime(s, "%d/%m/%Y")
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())


# Chronological split: everything before the cutoff is used for training,
# everything from the cutoff onwards is held out.  `>=` keeps the rows that
# sit exactly on the cutoff in the hold-out set instead of dropping them
# from both frames.
split_ts = totimestamp("12/06/2021")
df_test = df[df["timestamp"] >= split_ts]
df_train = df[df["timestamp"] < split_ts]
print(df_train.shape)
display(df_train.head())
display(df_train.tail())

# Asset metadata is small, so it is kept as a pandas frame.
df_asset_details = pd.read_csv(asset_csv)
display(df_asset_details)


### Get (lagged) features
https://www.kaggle.com/tomforbes/gresearch-submitting-lagged-features-via-api

In [None]:
# Two new features from the competition tutorial

def upper_shadow(df):
    """Return ``High`` minus the element-wise larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the element-wise smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def hlco_ratio(df):
    """Return the ``High - Low`` range divided by ``|Close - Open|``.

    There is no zero guard here: when ``Close`` equals ``Open`` the
    division is by zero.
    """
    candle_range = df['High'] - df['Low']
    body_size = np.abs(df['Close'] - df['Open'])
    return candle_range / body_size

def get_features(df, asset_id, train=True):
    '''
    Build the feature table of a single asset from a dataframe that holds
    the data of all assets.

    df - Full dataframe with all assets included
    asset_id - integer from 0-13 inclusive to represent a cryptocurrency asset
    train - True - you are training your model (the Target column is kept)
          - False - you are submitting your model via api (no Target column)

    Returns a tuple (df_feat, features): the feature dataframe (timestamp,
    Asset_ID, Target when training, and the feature columns) and the list
    of feature column names.
    '''
    base_columns = ['timestamp','Asset_ID','Open','High','Low','Close','Volume','VWAP']
    if train == True:
        base_columns = base_columns + ['Target']

    asset_rows = df[df['Asset_ID']==asset_id]
    df_feat = asset_rows[base_columns]

    close = df_feat['Close']
    vwap = df_feat['VWAP']
    volume = df_feat['Volume']

    # Create your features here, they can be lagged or not.
    # Rolling means of Close relative to the current Close, plus a rolling std.
    for window in (15, 30, 60):
        df_feat[f'sma{window}'] = close.rolling_mean(window)/close
    df_feat['std30'] = close.rolling_std(30)

    # Close relative to its own value `lag` rows earlier.
    for lag in (15, 30, 60):
        df_feat[f'return{lag}'] = close/close.shift(lag)

    # VWAP relative to its rolling mean and to its previous values.
    df_feat['vwap15'] = vwap/vwap.rolling_mean(15)
    for lag in (1, 2):
        df_feat[f'vw{lag}'] = vwap/vwap.shift(lag)

    # Volume relative to its rolling mean and to its previous values.
    df_feat['volume15'] = volume/volume.rolling_mean(15)
    for lag in (1, 2):
        df_feat[f'v{lag}'] = volume/volume.shift(lag)

    # Candle-shape features.
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    df_feat["high_div_low"] = np.log(df_feat["High"] / df_feat["Low"])
    df_feat['trade'] = df_feat['Close'] - df_feat['Open']
    volume_scaled = (('shadow1', 'trade'),
                     ('shadow3', 'Upper_Shadow'),
                     ('shadow5', 'Lower_Shadow'))
    for target_name, source_name in volume_scaled:
        df_feat[target_name] = df_feat[source_name] / df_feat['Volume']
    df_feat['mean1'] = (df_feat['shadow5'] + df_feat['shadow3']) / 2
    df_feat['mean2'] = (df_feat['shadow1'] + df_feat['Volume']) / 2
    df_feat['hlco_ratio'] = hlco_ratio(df_feat)

    # The raw price/volume columns are not model inputs.
    df_feat = df_feat.drop(['Open','High','Low','Close','Volume','VWAP'])
    non_features = ['Target','timestamp', 'Asset_ID']
    features = [name for name in df_feat.columns if name not in non_features]

    # Turn infinities and NaNs into nulls so that fill_null can replace them.
    for name in features:
        column = df_feat[name]
        column = column.set(column.is_infinite(), None)
        column = column.set(column.is_nan(), None)
        df_feat[name] = column

    df_feat = df_feat.fill_null("mean")

    return df_feat, features

def get_features_pandas(df, asset_id, train=True):
    '''
    Pandas implementation of the per-asset feature table: takes a dataframe
    with all asset data and returns the lagged features for a single asset.

    df - Full pandas dataframe with all assets included
    asset_id - integer from 0-13 inclusive to represent a cryptocurrency asset
    train - True - you are training your model (the Target column is kept)
          - False - you are submitting your model via api (no Target column)

    Returns a tuple (df_feat, features): the feature dataframe (timestamp,
    Asset_ID, Target when training, and the feature columns) and the list
    of feature column names.  The input dataframe is not modified.
    '''
    columns = ['timestamp','Asset_ID','Open','High','Low','Close','Volume','VWAP']
    if train:
        columns.append('Target')

    # Work on an explicit copy: adding columns to the result of a chained
    # selection raises SettingWithCopyWarning on pandas without copy-on-write.
    df_feat = df.loc[df['Asset_ID'] == asset_id, columns].copy()

    # Create your features here, they can be lagged or not
    df_feat['sma15'] = df_feat['Close'].rolling(15).mean()/df_feat['Close']
    df_feat['sma30'] = df_feat['Close'].rolling(30).mean()/df_feat['Close']
    df_feat['sma60'] = df_feat['Close'].rolling(60).mean()/df_feat['Close']
    df_feat['std30'] = df_feat['Close'].rolling(30).std()

    df_feat['return15'] = df_feat['Close']/df_feat['Close'].shift(15)
    df_feat['return30'] = df_feat['Close']/df_feat['Close'].shift(30)
    df_feat['return60'] = df_feat['Close']/df_feat['Close'].shift(60)

    df_feat['vwap15'] = df_feat['VWAP']/df_feat['VWAP'].rolling(15).mean()
    df_feat['vw1'] = df_feat['VWAP']/df_feat['VWAP'].shift(1)
    df_feat['vw2'] = df_feat['VWAP']/df_feat['VWAP'].shift(2)

    df_feat['volume15'] = df_feat['Volume']/df_feat['Volume'].rolling(15).mean()
    df_feat['v1'] = df_feat['Volume']/df_feat['Volume'].shift(1)
    df_feat['v2'] = df_feat['Volume']/df_feat['Volume'].shift(2)

    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    df_feat["high_div_low"] = np.log(df_feat["High"] / df_feat["Low"])
    df_feat['trade'] = df_feat['Close'] - df_feat['Open']
    df_feat['shadow1'] = df_feat['trade'] / df_feat['Volume']
    df_feat['shadow3'] = df_feat['Upper_Shadow'] / df_feat['Volume']
    df_feat['shadow5'] = df_feat['Lower_Shadow'] / df_feat['Volume']
    df_feat['mean1'] = (df_feat['shadow5'] + df_feat['shadow3']) / 2
    df_feat['mean2'] = (df_feat['shadow1'] + df_feat['Volume']) / 2
    df_feat['hlco_ratio'] = hlco_ratio(df_feat)

    # The raw price/volume columns are not model inputs.
    df_feat = df_feat.drop(['Open','High','Low','Close','Volume','VWAP'], axis=1)
    features = [f for f in df_feat.columns if f not in ['Target','timestamp', 'Asset_ID']]

    # Infinities (from divisions by zero) become NaN, then every NaN is
    # replaced by the mean of its column.
    df_feat = df_feat.replace([np.inf, -np.inf], np.nan)
    df_feat = df_feat.fillna(df_feat.mean())

    return df_feat, features


### Measure time creating features using polars

In [None]:
%%timeit -n 1 -r 10
# Time the polars feature builder on the first 14*250 rows of df_train,
# for asset 2 with train=False (no Target column).
x, features = get_features(df_train[:14*250], 2, False)


In [None]:
# Build the polars features for asset 2 over all of df_train (train=False,
# so no Target column) and inspect the feature names, shape and first rows.
x, features = get_features(df_train, 2, False)
print(features)
print(x.shape)
x.head()

### Measure time creating features using pandas

In [None]:
# Convert the polars training frame to pandas for the comparison benchmark.
pd_df = df_train.to_pandas()

In [None]:
%%timeit -n 1 -r 10
x, features = get_features_pandas(pd_df.loc[:14*250], 2, False)


In [None]:
# Build the pandas features for asset 2 over the whole frame (train=False,
# so no Target column) and inspect the feature names, shape and first rows.
x, features = get_features_pandas(pd_df, 2, False)
print(features)
print(x.shape)
x.head()

**The polars version is about 10 times faster in this example: ~2.5 ms for polars vs ~24 ms for pandas**

### Train xgboost models for each asset

In [None]:
def get_xgb_regr_for_asset(df_train, df_test, asset_id, asset_name, params, plot_imp=True):
    '''
    Fit an XGBoost regressor for a single asset and score it on held-out data.

    df_train - dataframe with all assets, used to fit the model
    df_test - dataframe with all assets, used to compute the hold-out score
    asset_id - id of the asset to model
    asset_name - name of the asset, used in the plot title
    params - keyword arguments passed to xgb.XGBRegressor
    plot_imp - True to show the feature importance plot of the fitted model

    Returns a tuple (model, p): the fitted regressor and the correlation
    coefficient between the hold-out targets and the predictions
    (0 when the coefficient is NaN).
    '''
    train_feat, features = get_features(df_train, asset_id, train=True)

    model = xgb.XGBRegressor(**params)
    model.fit(train_feat[features].to_numpy(), train_feat["Target"].to_numpy())
    # The model is fitted on plain numpy arrays, so the column names are
    # attached to the booster afterwards.
    model.get_booster().feature_names = features

    if plot_imp:
        # plot_importance creates its own axes when none are passed, so no
        # figure is opened beforehand (that would leave an empty figure
        # behind in the output).
        xgb.plot_importance(model)
        plt.title("Feature Importance for "+asset_name)
        plt.show()

    # Score on the hold-out data of the same asset.
    tst, _ = get_features(df_test, asset_id, train=True)
    pred = model.predict(tst[features].to_numpy())
    p = np.corrcoef(tst["Target"].to_numpy(), pred)[0][1]
    print(p)
    if np.isnan(p):
        p = 0

    return model, p

# Hyper-parameters shared by the regressors of all assets.
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.06,
    subsample=0.7,
    colsample_bytree=0.6,
    random_state=2020,
    tree_method="hist",
    objective="reg:pseudohubererror",
)

# Fitted model per asset, keyed by Asset_ID.
models = {}

training_start = time.time()
asset_start = time.time()

# Fit one model per asset and record its hold-out correlation in
# df_asset_details["test_corr"].
for idx, asset in df_asset_details.iterrows():
    asset_id = asset['Asset_ID']
    asset_name = asset['Asset_Name']
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})", end=" ")

    fitted_model, tst_corr = get_xgb_regr_for_asset(
        df_train, df_test, asset_id, asset_name, xgb_params
    )
    models[asset_id] = fitted_model
    df_asset_details.loc[idx,"test_corr"] = tst_corr

    print(f" time spent {time.time()-asset_start} ")
    asset_start = time.time()

print(f"Training time for all models {time.time()-training_start:.0f} ")
display(df_asset_details)


### Submission

In [None]:
start = time.time()

# define max_lookback - an integer > (greater than) the furthest look back in your lagged features
max_lookback = 60
# Number of most recent rows kept as history: max_lookback rows for each of
# the 14 assets plus a margin of 100 rows.
history_rows = max_lookback*14 + 100

# create dataframe to store data from the api to create lagged features
history = df_train[df_train.shape[0]-history_rows:]
history = history.drop("Target")
# The frames delivered by the api are concatenated to the history below, so
# the history gets a placeholder row_id column (-1) as its last column.
history.insert_at_idx(history.shape[1],
                      pl.Series(values=[-1] * history.shape[0],
                                dtype=pl.datatypes.Int32, name="row_id"))
print(history.shape)

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Counts the processed batches; stays 0 if the api yields nothing.
n_iterations = 0
for n_iterations, (df_test, df_pred) in enumerate(iter_test, start=1):
    # concatenate new api data to history dataframe
    df_test["Asset_ID"] = df_test["Asset_ID"].astype(np.int64)
    df_test["Count"] = df_test["Count"].astype(np.float64)
    history = pl.concat([history, pl.DataFrame(df_test)])

    for _, row in df_test.iterrows():
        asset_id = row['Asset_ID']
        # Fallback prediction, used when there is no model for the asset or
        # when feature generation / prediction fails.
        y_pred = 0
        model = models.get(asset_id)
        if model is not None:
            try:
                row_features, features = get_features(history, asset_id, train=False)
                # Only the most recent row of the asset is predicted.
                x_test = row_features[-1][features].to_numpy()
                y_pred = model.predict(x_test)[0]
            except Exception:
                # Keep the submission loop alive; the row gets the fallback.
                y_pred = 0
                traceback.print_exc()
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred

    # Keep only the most recent rows needed for the lagged features.
    history = history.sort('timestamp')
    history = history[history.shape[0]-history_rows:]

    env.predict(df_pred)

elapsed = time.time() - start
# max(..., 1) avoids a division by zero when no batch was processed.
print(f"Test time: {elapsed:.3f}, {elapsed/max(n_iterations, 1):.6f} per iteration")