# Features to increase your score

In this notebook I share the features that improved my model's score.

I also share all the features I considered before narrowing them down to just 8.




This notebook is based on Tom Forbes' notebook "GResearch - Submitting Lagged Features via API"
https://www.kaggle.com/tomforbes/gresearch-submitting-lagged-features-via-api

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import gresearch_crypto
import time
import datetime

# Input files of the competition dataset; both live in the same directory.
_INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'
TRAIN_CSV = _INPUT_DIR + '/train.csv'
ASSET_DETAILS_CSV = _INPUT_DIR + '/asset_details.csv'

In [None]:
# Load the training data and downcast only the value columns to float32 to
# save memory. 'timestamp' and 'Asset_ID' keep the dtype read from the CSV:
# epoch-second timestamps (~1.6e9) are not exactly representable in float32
# (24-bit significand, i.e. a spacing of 128 at that magnitude), so casting
# them would collapse neighbouring rows onto the same value and make the
# per-asset sort by timestamp ambiguous.
df_train = pd.read_csv(TRAIN_CSV)
_value_cols = df_train.columns.difference(['timestamp', 'Asset_ID'])
df_train = df_train.astype({col: np.float32 for col in _value_cols})
df_train.head()

In [None]:
# Load the per-asset metadata and order the rows by asset id.
_asset_details_raw = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = _asset_details_raw.sort_values(by="Asset_ID")
df_asset_details

In [None]:
# Feature engineering part
# calculate z-score
def zscore(x, window):
    """
    Return the rolling z-score of x.

    Every value is centred on the mean and scaled by the population standard
    deviation (ddof=0) of the trailing `window` observations. Partial windows
    are allowed (min_periods=1).

    x - pandas Series
    window - window length handed to Series.rolling
    """
    trailing = x.rolling(window=window, min_periods=1)
    centred = x - trailing.mean()
    return centred / trailing.std(ddof=0)

# calculate different KPI
def upper_shadow(df):
    """Per-row upper wick: 'High' minus the larger of 'Close' and 'Open'."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """Per-row lower wick: the smaller of 'Close' and 'Open' minus 'Low'."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def upper_shadow_15(df):
    """Rolling 15-row maximum of 'High' minus max('Close', 'Open') from 15 rows earlier."""
    rolling_high = df['High'].rolling(window=15, min_periods=1).max()
    lagged_body_top = np.maximum(df['Close'], df['Open']).shift(15)
    return rolling_high - lagged_body_top


def lower_shadow_15(df):
    """min('Close', 'Open') from 15 rows earlier minus the rolling 15-row minimum of 'Low'."""
    lagged_body_bottom = np.minimum(df['Close'], df['Open']).shift(15)
    rolling_low = df['Low'].rolling(window=15, min_periods=1).min()
    return lagged_body_bottom - rolling_low

                                                     
def upper_shadow_percent(df):
    """Per-row upper wick as a ratio: 'High' / max('Close', 'Open') - 1."""
    body_top = np.maximum(df['Close'], df['Open'])
    ratio = df['High'] / body_top
    return ratio - 1


def lower_shadow_percent(df):
    """Per-row lower wick as a ratio: min('Close', 'Open') / 'Low' - 1."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    ratio = body_bottom / df['Low']
    return ratio - 1
                                                     
def upper_shadow_15_perc(df):
    """Rolling 15-row maximum of 'High' divided by max('Close', 'Open') from 15 rows earlier, minus 1."""
    rolling_high = df['High'].rolling(window=15, min_periods=1).max()
    lagged_body_top = np.maximum(df['Close'], df['Open']).shift(15)
    return (rolling_high / lagged_body_top) - 1


def lower_shadow_15_perc(df):
    """min('Close', 'Open') from 15 rows earlier divided by the rolling 15-row minimum of 'Low', minus 1."""
    lagged_body_bottom = np.minimum(df['Close'], df['Open']).shift(15)
    rolling_low = df['Low'].rolling(window=15, min_periods=1).min()
    return (lagged_body_bottom / rolling_low) - 1


In [None]:
def get_features(df,
                 asset_id,
                 train=True,
                 valid_start="12/03/2021"):
    '''
    This function takes a dataframe with all asset data and returns the lagged features for a single asset.

    df - Full dataframe with all assets included
    asset_id - integer from 0-13 inclusive to represent a cryptocurrency asset
    train - True - you are training your model: the result keeps 'timestamp' and 'Target'
                   and gets a 'train_flg' column (1 = training row, 0 = validation row)
          - False - you are submitting your model via api: the result keeps 'row_id'
                    and is ordered by 'row_id'
    valid_start - first day of the validation window as "%d/%m/%Y", read as midnight UTC;
                  rows with timestamp >= this instant get train_flg = 0 (only used when train is True)

    Returns a float32 dataframe holding the identifier columns listed above plus the
    engineered features; the raw Close/Count/Open/High/Low/Volume columns are dropped.
    NaN and +/-inf values are replaced by 0.
    '''

    # A stable sort keeps rows whose key compares equal in their incoming order
    # (the default quicksort gives no such guarantee), so the lagged features
    # stay well defined even if timestamps collide, e.g. after a lossy cast.
    df = df[df['Asset_ID'] == asset_id]
    df = df.sort_values('timestamp', kind='mergesort')
    if train:
        df_feat = df[['timestamp','Asset_ID','Close','Count', 'Open', 'High', 'Low','Volume','Target']].copy()
        # define a train_flg column to split your data into train and validation.
        # The cutoff is built as an explicit UTC instant so it does not depend on
        # the timezone of the machine running the notebook (time.mktime would).
        cutoff = int(
            datetime.datetime.strptime(valid_start, "%d/%m/%Y")
            .replace(tzinfo=datetime.timezone.utc)
            .timestamp()
        )
        df_feat['train_flg'] = np.where(df_feat['timestamp'] >= cutoff, 0, 1)
    else:
        df = df.sort_values('row_id', kind='mergesort')
        df_feat = df[['Asset_ID','Close','Count', 'Open', 'High', 'Low','Volume','row_id']].copy()

    # Create your features here

    df_feat['Z-score_15_Close'] = zscore(df_feat['Close'], 15)
    # The two shadow features are z-scored over a 1440*30*12-row window.
    df_feat['lower_shadow_15'] = zscore(lower_shadow_15(df_feat), 1440*30*12)
    df_feat['upper_shadow_15'] = zscore(upper_shadow_15(df_feat), 1440*30*12)
    df_feat['Return15_%'] = (df_feat['Close'] / df_feat['Close'].shift(15)) - 1
    df_feat['Return60_%'] = (df_feat['Close'] / df_feat['Close'].shift(60)) - 1
    df_feat['Candle_body_%'] = (df_feat['Close'] / df_feat['Open']) - 1
    df_feat['ATR_15_%'] = ((df_feat['High'].rolling(window=15, min_periods=1).max())/ (df_feat['Low'].rolling(window=15, min_periods=1).min()) - 1)
    df_feat['Z-score_return15_60_%'] = zscore(df_feat['Return15_%'], 60)

    # Shifts and zero-variance windows leave NaN / inf behind; neutralise them.
    # NOTE(review): fillna(0) is applied to every column, so in training mode a
    # missing 'Target' also becomes 0 - confirm that is intended.
    df_feat = df_feat.fillna(0)
    df_feat = df_feat.replace([np.inf, -np.inf], value=0)

    # The string below is not executed; it is kept as a reference list of all
    # candidate features that were tried.
    '''
    All features created by me or found in other people's notebooks
    
    
    df_feat['Volume / Count'] = df_feat['Volume'] / df_feat['Count'] 
    #df_feat['Close - VWAP'] = df_feat['Close'] - df_feat['VWAP'] 
    df_feat['(Close / VWAP_15) - 1'] = (df_feat['Close'] / df_feat['VWAP'].rolling(window=15, min_periods=1).mean()) - 1
    df_feat['Close - VWAP_15'] = (df_feat['Close'] - df_feat['VWAP'].rolling(window=15, min_periods=1).mean())
    df_feat['sma15'] = df_feat['Close'] / df_feat['Close'].rolling(15).mean() -1
    df_feat['sma60'] = df_feat['Close'] / df_feat['Close'].rolling(60).mean() -1
    df_feat['sma240'] = df_feat['Close'] / df_feat['Close'].rolling(240).mean() -1
    
    df_feat['Bar_Range'] = df_feat['High'] - df_feat['Low']  
    df_feat['Bar_Range_%'] = (df_feat['High'] / df_feat['Low']) -1
    
    df_feat['lower_Shadow'] = lower_shadow(df_feat)
    df_feat['upper_Shadow'] = upper_shadow(df_feat)
    df_feat['lower_shadow_percent'] = lower_shadow_percent(df_feat)
    df_feat['upper_Shadow_percent'] = upper_shadow_percent(df_feat)
    df_feat['lower_shadow_15'] = lower_shadow_15(df_feat)
    df_feat['upper_shadow_15'] = upper_shadow_15(df_feat)
    df_feat['lower_shadow_15_perc'] = lower_shadow_15_perc(df_feat)
    df_feat['upper_shadow_15_perc'] = upper_shadow_15_perc(df_feat)
    
    df_feat['Return1'] = df_feat['Close'] - df_feat['Close'].shift(1) 
    df_feat['Return1_%'] = (df_feat['Close'] / df_feat['Close'].shift(1)) - 1
    df_feat['Return15'] = df_feat['Close'] - df_feat['Close'].shift(15)
    df_feat['Return15_%'] = (df_feat['Close'] / df_feat['Close'].shift(15)) - 1
    df_feat['Return60_%'] = (df_feat['Close'] / df_feat['Close'].shift(60)) - 1
    df_feat['Return240_%'] = (df_feat['Close'] / df_feat['Close'].shift(240)) - 1
    df_feat['Candle_body'] = df_feat['Close'] - df_feat['Open'] 
    df_feat['Candle_body_%'] = (df_feat['Close'] / df_feat['Open']) - 1
    df_feat['Candle_body15'] = df_feat['Close'] - df_feat['Open'].shift(14)
    df_feat['Candle_body15_%'] = (df_feat['Close'] / df_feat['Open'].shift(14)) -1
    
    df_feat['Z-score_15_Close'] = zscore(df_feat['Close'], 15)
    df_feat['Z-score_60_Volume'] = zscore(df_feat['Volume'], 60)
    
    df_feat['Mean_15_Close'] = df_feat['Close'].rolling(window=15, min_periods=1).mean()
    df_feat['Mean_60_Volume'] = df_feat['Volume'].rolling(window=60, min_periods=1).mean()
    df_feat['Mean_60_Count'] = df_feat['Count'].rolling(window=60, min_periods=1).mean()
    df_feat['Z-score_60_Count'] = zscore(df_feat['Count'], 60)
    df_feat['ATR_15'] = (df_feat['High'].rolling(window=15, min_periods=1).max() - df_feat['Low'].rolling(window=15, min_periods=1).min())
    df_feat['ATR_15_%'] = ((df_feat['High'].rolling(window=15, min_periods=1).max())/ (df_feat['Low'].rolling(window=15, min_periods=1).min()) - 1)
    
    df_feat['Z-score_return15_15'] = zscore(df_feat['Return15'], 15)
    df_feat['Z-score_return15_15_%'] = zscore(df_feat['Return15_%'], 15)
    df_feat['Z-score_return15_60'] = zscore(df_feat['Return15'], 60)
    df_feat['Z-score_return15_60_%'] = zscore(df_feat['Return15_%'], 60)
    df_feat['Z-score_return15_1440'] = zscore(df_feat['Return15'], 1440)
    df_feat['Z-score_return15_1440_%'] = zscore(df_feat['Return15_%'], 1440)
    df_feat['Z-score_return15_month'] = zscore(df_feat['Return15'], 1440*30)
    df_feat['Z-score_return15_month_%'] = zscore(df_feat['Return15_%'], 1440*30)
    
    df_feat['VWAP15_median'] = df_feat['VWAP'].rolling(window=15, min_periods=1).median()
    df_feat['VWAP60_median'] = df_feat['VWAP'].rolling(window=60, min_periods=1).median()
    df_feat['VWAP240_median'] = df_feat['VWAP'].rolling(window=240, min_periods=1).median()
    df_feat['VWAPday_median'] = df_feat['VWAP'].rolling(window=1440, min_periods=1).median()
    
    df_feat['Median_return15_15'] = df_feat['Return15'].rolling(window=15, min_periods=1).median()
    df_feat['Median_return15_15_%'] = df_feat['Return15_%'].rolling(window=15, min_periods=1).median()
    df_feat['Median_return15_60'] = df_feat['Return15'].rolling(window=60, min_periods=1).median()
    df_feat['Median_return15_60_%'] = df_feat['Return15_%'].rolling(window=60, min_periods=1).median()
    df_feat['Median_return15_1440'] = df_feat['Return15'].rolling(window=1440, min_periods=1).median()
    df_feat['Median_return15_1440_%'] = df_feat['Return15_%'].rolling(window=1440, min_periods=1).median()
    df_feat['Median_return15_month'] = df_feat['Return15'].rolling(window=1440*30, min_periods=1).median()
    df_feat['Median_return15_month_%'] = df_feat['Return15_%'].rolling(window=1440*30, min_periods=1).median()
    df_feat['Median_return15_year'] = df_feat['Return15'].rolling(window=1440*30*12, min_periods=1).median()
    df_feat['Median_return15_year_%'] = df_feat['Return15_%'].rolling(window=1440*30*12, min_periods=1).median()
    df_feat['Median_return15_3years_%'] = df_feat['Return15_%'].rolling(window=1440*30*12*3, min_periods=1).median()
    #df_feat['ROC'] =  ((df_feat['Close'] - df_feat['Close'].shift(15)) / df_feat['Close'].shift(15))*100
    #df_feat['MFI'] 
    
    #Seasonality features
    df_feat["hour"] = df_feat["datetime"].dt.hour
    df_feat["day of week"] = df_feat["datetime"].dt.dayofweek 
    df_feat["day"] = df_feat["datetime"].dt.day
    
    
    '''

    # Only identifiers and engineered features are returned; everything is cast
    # to float32 (this includes 'timestamp' / 'row_id' / 'Asset_ID').
    df_feat = df_feat.drop(columns=['Close','Count', 'Open', 'High', 'Low','Volume'])
    df_feat = df_feat.astype(np.float32)

    return df_feat

In [None]:
# create your feature dataframe for each asset and concatenate
# Build the per-asset frames first and concatenate them once: calling
# pd.concat inside the loop would re-copy all rows accumulated so far on
# every iteration.
feature_df = pd.concat(
    [get_features(df_train, asset_id, train=True) for asset_id in range(14)]
)

In [None]:
# assign weight column feature dataframe
# Left join on 'Asset_ID' so every feature row picks up its asset's 'Weight'.
_asset_weights = df_asset_details[['Asset_ID', 'Weight']]
feature_df = feature_df.merge(_asset_weights, on='Asset_ID', how='left')

In [None]:
# Preview the first rows of the feature table (shown as the cell output).
feature_df.head()

In [None]:
# Pairwise correlation between the selected features and 'Target'.
_corr_columns = [
    'Return15_%', 'Return60_%', 'Z-score_15_Close', 'ATR_15_%',
    'lower_shadow_15', 'upper_shadow_15',
    'Candle_body_%', 'Z-score_return15_60_%',
    'Target',
]
corr = feature_df[_corr_columns].corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0.
# An explicit format string renders the same 5 decimals and is accepted by
# both old and new pandas versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.5f}')

In [None]:
# define features for LGBM
# 'Asset_ID' is the only categorical input; it comes first in the feature
# list, followed by the engineered numeric features.
categoricals = ['Asset_ID']
_numeric_features = [
    'Return15_%',
    'Return60_%',
    'Z-score_15_Close',
    'ATR_15_%',
    'Z-score_return15_60_%',
    'Candle_body_%',
    'lower_shadow_15',
    'upper_shadow_15',
]
features = categoricals + _numeric_features

In [None]:
# define the evaluation metric
def weighted_correlation(a, train_data):
    """
    Custom evaluation function: weighted correlation between predictions and labels.

    a - predictions
    train_data - dataset object providing get_label() and an `add_w` attribute
                 that holds the per-row weights (attached by the caller)

    Returns the tuple (metric name, metric value, is_higher_better).
    """
    w = np.ravel(train_data.add_w.values.flatten())
    a = np.ravel(a)
    b = np.ravel(train_data.get_label())
    sum_w = np.sum(w)

    def weighted_mean(values):
        # Weighted average of `values` under the weights w.
        return np.sum(values * w) / sum_w

    mean_a = weighted_mean(a)
    mean_b = weighted_mean(b)
    var_a = weighted_mean(np.square(a - mean_a))
    var_b = weighted_mean(np.square(b - mean_b))

    # Covariance as E[ab] - E[a]E[b], then normalise by both standard deviations.
    cov = weighted_mean(a * b) - mean_a * mean_b
    corr = cov / np.sqrt(var_a * var_b)

    return 'eval_wcorr', corr, True

In [None]:
# define train and validation weights and datasets
#
#feature_df = reduce_memory_usage(feature_df)
#
# Split once by the train/validation flag and reuse the two slices below.
_train_rows = feature_df.query('train_flg == 1')
_valid_rows = feature_df.query('train_flg == 0')

weights_train = _train_rows[['Weight']]
weights_test = _valid_rows[['Weight']]

train_dataset = lgb.Dataset(
    _train_rows[features],
    _train_rows['Target'].values,
    feature_name=features,
    categorical_feature=categoricals,
)
val_dataset = lgb.Dataset(
    _valid_rows[features],
    _valid_rows['Target'].values,
    feature_name=features,
    categorical_feature=categoricals,
)

# The asset weights ride along as a plain attribute; weighted_correlation
# reads them back via `add_w` when scoring.
train_dataset.add_w = weights_train
val_dataset.add_w = weights_test

# The slices are no longer needed once the datasets and weights are built.
del _train_rows, _valid_rows


In [None]:
# Filled during training with the per-iteration validation scores.
evals_result = dict()

# LightGBM settings. 'metric' is 'None' because scoring is done by the custom
# evaluation function passed to lgb.train.
params = dict(
    n_estimators=1500,
    objective='regression',
    metric='None',
    boosting_type='gbdt',
    max_depth=-1,
    learning_rate=0.01,
    seed=46,
    verbose=-1,
)

In [None]:
# train LGBM2
# The early_stopping_rounds / verbose_eval / evals_result keyword arguments
# were removed from lgb.train in LightGBM 4.0. The callbacks below give the
# same behaviour (stop after 100 rounds without improvement, log every 10
# rounds, record scores into evals_result) and are accepted by older releases
# as well. log_evaluation was called print_evaluation before LightGBM 3.3.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(params=params,
                  train_set=train_dataset,
                  valid_sets=[val_dataset],
                  feval=weighted_correlation,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100),
                      _log_evaluation(period=10),
                      lgb.record_evaluation(evals_result),
                  ])

### Important!

In [None]:
# define max_lookback - an integer > (greater than) the furthest look back in your lagged features
# Must exceed the longest rolling window used in get_features (1440*30*12).
max_lookback = 12 * 31 * 1440

#### Now we will submit via api

- As mentioned by the host here https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/290412 - the api takes 10 minutes to complete when submitted on the full test data with a simple dummy prediction. 

- Therefore, any extra logic we include within the api loop will increase the time to completion significantly.

- I have not focused on optimisation of the logic within this loop yet - there are definitely significant improvements you can try for yourself. For example, using numpy arrays instead of pandas dataframes may help.

- For this version - the submission time is roughly 5 hours.

In [None]:
# Submission loop for the competition API. It is wrapped in a triple-quoted
# string, so nothing below is executed when the cell runs; remove the quotes
# to actually submit.
# NOTE(review): get_features() is re-run on the accumulated history for every
# row of every test batch, and the history is only trimmed once per batch.
# This looks like the main cost behind the long submission time mentioned
# above - confirm by profiling before optimising.
'''
start = time.time()

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# create dataframe to store data from the api to create lagged features
history = pd.DataFrame()
for i, (df_test, df_pred) in enumerate(iter_test):
    
    # concatenate new api data to history dataframe
    history = pd.concat([history, df_test[['timestamp','Asset_ID','Close','Count', 'Open', 'High', 'Low', 'Volume','row_id']]])
    for j , row in df_test.iterrows():
        # get features using history dataframe
        row_features = get_features(history, row['Asset_ID'], train=False)
        row = row_features.iloc[-1].fillna(0)
        y_pred = model.predict(row[features])[0]

        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
    
    # we only want to keep the necessary recent part of our history dataframe, which will depend on your
    # max_lookback value (your furthest lookback in creating lagged features).
    history = history.sort_values(by='row_id')
    history = history.iloc[-(max_lookback*14+100):]
    
    # Send submissions
    env.predict(df_pred)
stop = time.time()
print(stop-start)
'''