# Submitting Lagged Features via API

In this notebook we submit an LGBM model with lagged features via the API.

The API works by providing a single row for each Asset - one timestamp at a time - to prevent using future data in predictions.

In order to utilise lagged features in our model, we must store the outputs from the API so we can calculate features using past data.

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import gresearch_crypto
import time
import datetime
import gc
import traceback
import datatable as dt
import gresearch_crypto
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, mean_absolute_error


# Input files; both are read with pd.read_csv further down in this notebook.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

# Compute device label for model training.
DEVICE = 'GPU'
# CV PARAMS
# NOTE(review): the four values below are not referenced by any cell visible
# here; presumably left over from a grouped time-series CV splitter — confirm
# before relying on them.
FOLDS = 5
GROUP_GAP = 130
MAX_TEST_GROUP_SIZE = 180
MAX_TRAIN_GROUP_SIZE = 280

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest fitting dtype.

    Columns whose dtype name is ``object``, ``category`` or one of the two
    ``datetime64[ns...]`` variants are left untouched.  Columns whose dtype
    name starts with ``int`` are cast to the first of int8/int16/int32/int64
    whose range strictly contains the column's min and max.  Every other
    column is cast to float16, float32 or (as a fallback) float64 using the
    same strict range test.

    The frame is modified in place and also returned.  Memory usage before
    and after, plus the percentage saved, is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    untouched_types = ['object', 'category', 'datetime64[ns, UTC]', 'datetime64[ns]']
    # Candidates are ordered from narrowest to widest; the first fit wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype.name
        if col_type in untouched_types:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # The original cast int8-sized columns to int16; use int8 here.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-byte frame cannot raise ZeroDivisionError.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Load the training CSV into memory; get_features() later filters this
# frame by Asset_ID.
df_train = pd.read_csv(TRAIN_CSV)
# df_train.head()

In [None]:
# Load the per-asset metadata, ordered by Asset_ID. Its 'Asset_ID' and
# 'Weight' columns are merged onto the feature frames further down.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
# df_asset_details

In [None]:
def get_features(df, 
                 asset_id, 
                 train=True):
    '''
    Build the feature frame for a single asset from a frame holding all assets.

    df - Full dataframe with all assets included. Must contain the columns
         timestamp, Asset_ID, Count, Open, High, Low, Close, Volume, plus
         Target when train is truthy and row_id when it is not.
    asset_id - value matched against the Asset_ID column
    train - True - you are training your model; the result keeps Target and
                   gains a train_flg column (1 = train, 0 = validation)
          - False - you are submitting your model via api; the result keeps
                    row_id and is ordered by it

    Returns a new dataframe with the selected raw columns plus the derived
    features. Every NaN in the result (including the warm-up rows of the
    rolling windows) is replaced by 0.
    '''

    df = df[df['Asset_ID'] == asset_id]
    df = df.sort_values('timestamp')
    if train:
        # Rows at or after 12 March 2021 00:00 form the validation window.
        # The cutoff is built explicitly in UTC so the split does not move
        # with the machine's local timezone (time.mktime would use local time).
        # Assumes `timestamp` holds Unix epoch seconds — TODO confirm.
        cutoff = datetime.datetime.strptime("12/03/2021", "%d/%m/%Y")
        valid_start = int(cutoff.replace(tzinfo=datetime.timezone.utc).timestamp())
        df_feat = df[['timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low',
                      'Close', 'Volume', 'Target']].copy()
        df_feat['train_flg'] = np.where(df_feat['timestamp'] >= valid_start, 0, 1)
    else:
        df = df.sort_values('row_id')
        df_feat = df[['Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
                      'Volume', 'row_id']].copy()

    # Create your features here, they can be lagged or not
    close = df_feat['Close']
    # Rolling mean of Close relative to the current Close, per window length.
    for window in (15, 60, 240):
        df_feat['sma{}'.format(window)] = close.rolling(window).mean() / close - 1

    # Candle-shape features computed from the current row only.
    df_feat['upper_shadow'] = df_feat['High'] - np.maximum(df_feat['Close'], df_feat['Open'])
    df_feat['lower_shadow'] = np.minimum(df_feat['Close'], df_feat['Open']) - df_feat['Low']
    df_feat['Close/Open'] = df_feat['Close'] / df_feat['Open']
    # The 1e-6 keeps the denominator away from zero when Close == Open.
    df_feat['hlco_ratio'] = (df_feat['High'] - df_feat['Low']) / (df_feat['Close'] - df_feat['Open'] + 1e-6)
    df_feat['spread'] = df_feat['High'] - df_feat['Low']
    df_feat['mean_trade'] = df_feat['Volume'] / df_feat['Count']
    df_feat = df_feat.fillna(0)

    return df_feat

In [None]:
# define features for LGBM
# The engineered columns produced by get_features() are currently disabled;
# uncomment the list below (and drop the raw-column list) to train on them.
# features = ['Asset_ID','sma15','sma60','sma240', 'upper_shadow', 'lower_shadow'
#            , 'Close/Open', 'hlco_ratio', 'spread', 'mean_trade'
#            ]
# Active feature set: the asset id plus the raw candle columns.
features = ['Asset_ID','Count','Open', 'High', 'Low', 'Close','Volume']
# Columns passed to LightGBM as categorical features.
categoricals = ['Asset_ID']

In [None]:
# define the evaluation metric
def weighted_correlation(a, train_data):
    """Custom LightGBM eval function: weighted Pearson correlation.

    Parameters
    ----------
    a : array-like
        Predictions handed in by LightGBM.
    train_data : object
        Must expose ``get_label()`` and an ``add_w`` attribute whose
        ``.values.flatten()`` yields one weight per label (this notebook
        attaches ``add_w`` to the ``lgb.Dataset`` by hand).

    Returns
    -------
    tuple
        ``('eval_wcorr', correlation, True)`` — the trailing ``True`` marks
        the metric as higher-is-better.
    """
    w = np.ravel(train_data.add_w.values.flatten())
    preds = np.ravel(a)
    labels = np.ravel(train_data.get_label())

    total_w = np.sum(w)

    # Weighted first moments.
    pred_mean = np.sum(preds * w) / total_w
    label_mean = np.sum(labels * w) / total_w

    # Weighted second central moments.
    pred_var = np.sum(w * np.square(preds - pred_mean)) / total_w
    label_var = np.sum(w * np.square(labels - label_mean)) / total_w

    # Weighted covariance via E[ab] - E[a]E[b].
    covariance = np.sum(preds * labels * w) / total_w - pred_mean * label_mean

    return 'eval_wcorr', covariance / np.sqrt(pred_var * label_var), True

In [None]:
import optuna
from lightgbm import LGBMRegressor # But do not call lightgbm! This is a must! This trick is explained above!
from joblib import parallel_backend

In [None]:
def objective(trial, asset_id, categoricals, cv_fold_func = np.average):
    """Optuna objective: negative validation MAE of an LGBMRegressor for one asset.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the LightGBM hyper-parameters.
    asset_id : int
        Asset whose rows are extracted from the global ``df_train``.
    categoricals : list of str
        Columns converted to pandas ``category`` dtype before fitting.
    cv_fold_func : callable, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        ``-MAE`` on the single hold-out split (rows with ``train_flg == 0``).
        The sign is flipped because the study is created with
        ``direction="maximize"``.
    """
    # Local import retained from the original cell.
    # NOTE(review): presumably needed so parallel workers resolve the name — confirm.
    from lightgbm import LGBMRegressor

    # Optuna suggest params
    param_lgb = {
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # Follow the notebook-level DEVICE constant instead of a second literal.
        'device': DEVICE.lower(),
    }

    # Build the per-asset frame; this is recomputed on every trial.
    feature_df = get_features(df_train, asset_id, train=True)
    feature_df = pd.merge(feature_df, df_asset_details[['Asset_ID','Weight']], how='left', on=['Asset_ID'])
    feature_df = reduce_mem_usage(feature_df)
    for c in categoricals:
        feature_df[c] = pd.Series(feature_df[c], dtype = 'category')

    # Single time-based hold-out: split once instead of re-querying four times.
    train_part = feature_df.query('train_flg == 1')
    val_part = feature_df.query('train_flg == 0')
    x_train, y_train = train_part[features], train_part['Target'].values
    x_val, y_val = val_part[features], val_part['Target'].values

    clf = LGBMRegressor(**param_lgb)
    clf.fit(x_train, y_train)
    preds = clf.predict(x_val)
    mae = mean_absolute_error(y_val, preds)

    return -1.0 * mae

In [None]:
# Optuna search budget: forwarded as n_jobs / n_trials to study.optimize().
N_JOBS = 2
N_TRIALS = 5

In [None]:
from optuna.visualization import plot_param_importances
def get_best_params(objective, N_TRIALS, N_JOBS, asset_id, categoricals):
    """Run an Optuna study for one asset and return the best trial's params.

    Parameters
    ----------
    objective : callable
        Called as ``objective(trial, asset_id, categoricals)``; its return
        value is maximized.
    N_TRIALS : int
        Number of trials passed to ``study.optimize``.
    N_JOBS : int
        Number of parallel jobs passed to ``study.optimize``.
    asset_id, categoricals
        Forwarded unchanged to ``objective``.

    Returns
    -------
    dict
        ``params`` of the best trial. A short summary is printed as well.
    """
    def run_trial(trial):
        # Bind the per-asset arguments so Optuna only has to supply the trial.
        return objective(trial, asset_id, categoricals)

    with parallel_backend('multiprocessing'):
        study = optuna.create_study(direction="maximize")
        study.optimize(run_trial, n_trials=N_TRIALS, n_jobs=N_JOBS)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    best = study.best_trial

    print(f"  Value: {best.value}")

    print("  Params: ")
    for name, setting in best.params.items():
        print(f"    {name}: {setting}")

    # Optional diagnostics (disabled):
    #     display(optuna.visualization.plot_optimization_history(study))
    #     display(optuna.visualization.plot_slice(study))
    #     display(optuna.visualization.plot_parallel_coordinate(study))
    #     display(optuna.visualization.plot_param_importances (study))
    return best.params

In [None]:
# Tune each asset independently; the result maps asset id -> dict of the
# best hyper-parameters returned by get_best_params().
# Asset ids are hard-coded as 0..13.
best_params = {}
for asset_id in range(14):
    best_params[asset_id] = get_best_params(objective, N_TRIALS, N_JOBS, asset_id, categoricals)

In [None]:
def get_final_model(asset_id, best_params):
    """Train the final LightGBM booster for one asset.

    Parameters
    ----------
    asset_id : int
        Asset whose rows are extracted from the global ``df_train``.
    best_params : dict
        Passed unchanged as ``params`` to ``lgb.train``.

    Returns
    -------
    The booster returned by ``lgb.train``, fitted on rows with
    ``train_flg == 1`` and validated (with early stopping) on rows with
    ``train_flg == 0``.
    """
    merged = pd.merge(
        get_features(df_train, asset_id, train=True),
        df_asset_details[['Asset_ID', 'Weight']],
        how='left',
        on=['Asset_ID'],
    )
    feature_df = reduce_mem_usage(merged)

    # Split once into the train and validation partitions.
    train_part = feature_df.query('train_flg == 1')
    val_part = feature_df.query('train_flg == 0')

    train_dataset = lgb.Dataset(
        train_part[features],
        train_part['Target'].values,
        feature_name=features,
        categorical_feature=categoricals,
    )
    val_dataset = lgb.Dataset(
        val_part[features],
        val_part['Target'].values,
        feature_name=features,
        categorical_feature=categoricals,
    )

    # weighted_correlation() reads the weights from this ad-hoc attribute.
    train_dataset.add_w = train_part[['Weight']]
    val_dataset.add_w = val_part[['Weight']]

    evals_result = {}

    # NOTE(review): early_stopping_rounds / verbose_eval / evals_result are
    # keyword arguments of lgb.train in the LightGBM release this notebook
    # targets — confirm against the installed version.
    booster = lgb.train(
        params=best_params,
        train_set=train_dataset,
        valid_sets=[val_dataset],
        early_stopping_rounds=100,
        verbose_eval=10,
        feval=weighted_correlation,
        evals_result=evals_result,
    )
    return booster

In [None]:
# Train one final model per asset using that asset's tuned parameters;
# the result maps asset id -> trained model.
models = {}
for asset_id in range(14):
    model = get_final_model(asset_id, best_params[asset_id])
    models[asset_id] = model

### Important!

In [None]:
# define max_lookback - an integer > (greater than) the furthest look back in your lagged features
# (the longest rolling window in get_features() is 240 rows). The submission
# loop uses this value to decide how many history rows to keep.
max_lookback = 250

#### Now we will submit via api

- As mentioned by the host here https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/290412 - the api takes 10 minutes to complete when submitted on the full test data with a simple dummy prediction. 

- Therefore, any extra logic we include within the api loop will increase the time to completion significantly.

- I have not focused on optimisation of the logic within this loop yet - there are definitely significant improvements you can try for yourself. For example, using numpy arrays instead of pandas dataframes may help.

- For this version - the submission time is roughly 5 hours.

In [None]:
# Submission loop: for every batch served by the API, append it to a rolling
# history, rebuild each asset's features from that history, predict the
# newest row and hand the filled prediction frame back to the API.
start = time.time()

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Columns retained from each API batch; get_features(train=False) needs them.
history_columns = ['timestamp','Asset_ID', 'Count','Open', 'High', 'Low', 'Close', 'Volume', 'row_id']

# create dataframe to store data from the api to create lagged features
history = pd.DataFrame()
for df_test, df_pred in iter_test:

    # concatenate new api data to history dataframe
    history = pd.concat([history, df_test[history_columns]])
    for asset_id in df_test['Asset_ID']:
        model = models[asset_id]
        # get features using history dataframe
        asset_features = get_features(history, asset_id, train=False)
        # Keep the newest row as a one-row DataFrame: Booster.predict documents
        # 2-D inputs (DataFrame / ndarray), not a 1-D Series.
        latest = asset_features.iloc[[-1]].fillna(0)
        y_pred = model.predict(latest[features])[0]

        df_pred.loc[df_pred['row_id'] == latest['row_id'].iloc[0], 'Target'] = y_pred

    # we only want to keep the necessary recent part of our history dataframe, which will depend on your
    # max_lookback value (your furthest lookback in creating lagged features).
    # 14 is the number of assets; the extra 100 rows are slack.
    history = history.sort_values(by='row_id')
    history = history.iloc[-(max_lookback*14+100):]

    # Send submissions
    env.predict(df_pred)
stop = time.time()
print(stop-start)