# G-Research Crypto - Starter XGB Pipeline
![](https://storage.googleapis.com/kaggle-competitions/kaggle/30894/logos/header.png)


### Just a simple pipeline going from zero to a valid submission




# Import and load dfs

References: [Tutorial to the G-Research Crypto Competition](https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition)

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import gresearch_crypto
import xgboost as xgb
import traceback
import datetime
import matplotlib.pyplot as plt
from scipy.stats import pearsonr


# Input file locations: the training candles read into `df`, and the
# per-asset metadata table read into `df_asset_details`.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of a dataframe to reduce memory usage.

    Columns whose dtype name is 'object', 'category' or
    'datetime64[ns, UTC]' are left untouched.  Columns whose dtype name
    starts with 'int' are cast to the narrowest of int8/int16/int32/int64
    whose range strictly contains the column's min and max.  Every other
    column is cast to float16 or float32 when its range strictly fits, and
    to float64 otherwise.

    The dataframe is modified in place and also returned.  Memory usage
    before and after is printed.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype.name
        if col_type in ['object', 'category', 'datetime64[ns, UTC]']:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    # The int8 case previously cast to int16 by mistake.
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing fits (values on the int64 limits) the column is
            # left as it is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN, since every comparison
                # against NaN is False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Switch read by the model-building code to decide whether to request the
# GPU tree method.
DEVICE = 'GPU'

df = pd.read_csv(TRAIN_CSV)
# DataFrame.replace returns a new frame; the result has to be assigned back,
# otherwise the +/-inf values stay in place and survive the dropna below.
df = df.replace([np.inf, -np.inf], np.nan)

#df_train.fillna(-999, inplace=True)
# df = df[df['Target'].notna()]
# df.interpolate(method='linear', inplace=True)
df = df.dropna(how="any")

# sorting data into groups of days
df['date'] = pd.to_datetime(df['timestamp'], unit = 's')
df = df.sort_values('date')
# One integer label per calendar day, numbered in order of first appearance
# (the frame is sorted by date, so labels increase with time).
groups = pd.factorize(df['date'].dt.day.astype(str) + '_' + df['date'].dt.month.astype(str) + '_' + df['date'].dt.year.astype(str))[0]

# reduce memory usage
df.drop(columns = 'date', inplace = True)
# Target is set aside so reduce_mem_usage does not downcast it.
target = df['Target'].copy()
df.drop(columns = 'Target', inplace = True)
df = reduce_mem_usage(df)
df['Target'] = target
df['groups'] = groups

# getting rid of data overlap
df_train = df[df['timestamp'] < 1623542400]
df_train.info()

In [None]:
# Load the asset metadata table and order it by Asset_ID; the bare name on
# the last line displays the table as the cell output.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = df_asset_details.sort_values("Asset_ID")
df_asset_details

## Utility functions to train a model for one asset

In [None]:

from pandas import DataFrame
from pandas import concat
 
def series_to_supervised(data, n_in=1, n_out=1, dropnan=False, interpolate = False):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Observations as a list, NumPy array, Series or DataFrame
            (anything the DataFrame constructor accepts). One-dimensional
            input is treated as a single variable.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
        interpolate: When dropnan is False, linearly interpolate NaN values
            instead of filling them. Ignored when dropnan is True.
    Returns:
        Pandas DataFrame of series framed for supervised learning, with
        columns named 'var<j>(t-<i>)', 'var<j>(t)' and 'var<j>(t+<i>)'.
        When neither dropnan nor interpolate is set, NaN values are
        replaced with -999.
    """
    df = DataFrame(data)
    # Count variables on the constructed frame rather than on `data`, so
    # 1-D arrays, Series and lists of rows all get the right column names.
    n_vars = df.shape[1]
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # handle the NaN rows introduced by shifting
    if dropnan:
        agg.dropna(inplace=True)
    elif interpolate:
        agg.interpolate(method='linear', inplace=True)
    else:
        agg.fillna(-999,inplace=True)
    return agg
 

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return High minus the larger of Close and Open."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open, minus Low."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# A utility function to build features from the original df.
# It works for rows too, so we can reuse it.
def get_features(df, lag = 1, shuffle = False, less_features = False, diff = True):
    """Build the model feature frame from the raw candle columns.

    Args:
        df: Data holding the 'Open', 'High', 'Low', 'Close', 'Volume',
            'Count' and 'VWAP' columns ('VWAP', 'Open', 'High' and 'Low'
            are not used as base features when less_features is truthy,
            but 'Open', 'High' and 'Low' are still needed for the shadows).
        lag: Number of lagged copies of the base features added through
            series_to_supervised; 0 disables lagging.
        shuffle: Rows are shuffled only when this is literally True (an
            identity check, so a truthy non-bool such as 1 does not
            trigger shuffling).
        less_features: When truthy, only 'Close', 'Volume' and 'Count' are
            used as base features.
        diff: When truthy, base features are replaced by their first
            differences.

    Returns:
        DataFrame with the (optionally differenced and lagged) base
        features plus 'Upper_Shadow' and 'Lower_Shadow' columns.
    """
    if less_features:
        base_columns = ['Close', 'Volume', 'Count']
    else:
        base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[base_columns].copy()

    if diff:
        df_feat = df_feat.diff()
    # Fill the -999 sentinel AFTER differencing: filling first would make
    # diff() treat the sentinel as a real price/volume, and would leave the
    # NaN that diff() puts in the first row unfilled when lag == 0.
    df_feat.fillna(-999,inplace=True)
    # add lagged observations
    if lag > 0:
        df_feat = series_to_supervised(df_feat, n_in = lag)
    # Shadows are computed on the raw (undifferenced) input.
    df_feat['Upper_Shadow'] = upper_shadow(df)
    df_feat['Lower_Shadow'] = lower_shadow(df)
    if shuffle is True:
        df_feat = df_feat.sample(frac=1)
    return df_feat

# Building the model

In [None]:
# Build pipeline here, imputer might not be necessary as all nans were dropped in the beginning.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn import set_config
set_config(display='diagram') 
def XGB_pipeline():
    """Build an unfitted pipeline: StandardScaler followed by XGBRegressor.

    The GPU tree method is requested only when the module-level DEVICE
    switch equals 'GPU', matching how `objective` builds its model; with
    any other value XGBoost's default tree method is used.

    Returns:
        sklearn Pipeline with steps 'scaler' and 'linear' (the step name
        'linear' is historical; the estimator is the XGBoost regressor).
    """
    #parameters from here: https://www.kaggle.com/sarvesh278/xgboost-trading-classifier
    #note: original is a classifier
    model_params = {
        'n_estimators': 508,
        'missing': -999,
        'max_depth': 8,
        'learning_rate': 0.06992689459063349,
        'subsample': 0.8012753784867586,
        'colsample_bytree': 0.8419498887494685,
        'random_state': 1111,
    }
    if DEVICE == 'GPU':
        model_params['tree_method'] = 'gpu_hist'

    # NOTE(review): the scaler runs before the regressor, so a -999 sentinel
    # in the input is rescaled before XGBoost sees it and may no longer
    # match `missing=-999` -- confirm whether that is intended.
    XGB_model = xgb.XGBRegressor(**model_params)

    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('linear', XGB_model)
    ])

    return pipe

In [None]:
# df = df_train[df_train["Asset_ID"] == 10]
# df.isna().sum()

In [None]:
# df = df_train[df_train["Asset_ID"] == 0]
# #impute y
# df['Target'] = df.Target.interpolate(method='slinear')
# df.isna().sum()

In [None]:
# # Check the model interface
# x = get_features(df_train.iloc[1])
# #y_pred = models[0].predict([x])
# #y_pred[0]
# y_pred = models[0].predict(pd.DataFrame([x]))
# y_pred[0]

# Cross Validation

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    Groups are ordered by first appearance in ``groups``. Each split uses a
    block of consecutive groups as the test set and the groups that precede
    it (minus the gap) as the train set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    group_gap : int, default=None
        Number of groups left out between the end of the train set and the
        start of the test set. ``None`` is treated as 0. The first
        ``group_gap`` samples of every test set are dropped as well.
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    verbose : bool, default=False
        Stored on the instance; currently not used by ``split``.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    @staticmethod
    def _collect_indices(group_dict, group_labels):
        """Return the sorted sample indices belonging to ``group_labels``."""
        indices = [idx for label in group_labels for idx in group_dict[label]]
        indices.sort()
        return indices

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : list of int
            The training set indices for that split. Empty when the gap
            leaves no group before the test block.
        test : list of int
            The testing set indices for that split.
        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer distinct groups
            than ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Positional access below must not depend on a pandas index.
        groups = np.asarray(groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        # None (the constructor default) means "no gap".
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # Distinct groups in order of first appearance.
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # group label -> list of sample positions, in ascending order
        group_dict = {}
        for idx in range(n_samples):
            group_dict.setdefault(groups[idx], []).append(idx)

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            # Clamp at 0: a negative slice end would wrap around and pull
            # groups from the end of the data (i.e. the future) into train.
            train_stop = max(0, group_test_start - group_gap)
            train_start = int(max(0, train_stop - max_train_group_size))

            train_array = self._collect_indices(
                group_dict, unique_groups[train_start:train_stop])
            test_array = self._collect_indices(
                group_dict,
                unique_groups[group_test_start:
                              group_test_start + group_test_size])

            # Drop the first `group_gap` samples of the test block.
            test_array = test_array[group_gap:]

            yield [int(i) for i in train_array], [int(i) for i in test_array]

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# this is code slightly modified from the sklearn docs here:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    One row is drawn per split produced by ``cv.split`` (test samples
    marked 1, train samples 0, unused samples left as NaN), followed by a
    row coloured by ``y`` and a row coloured by ``group``.

    Args:
        cv: Splitter exposing ``split(X=..., y=..., groups=...)``.
        X: Samples passed to the splitter; only its length is used for
            plotting.
        y: Per-sample values used to colour the 'target' row.
        group: Per-sample group labels, passed to the splitter and used to
            colour the 'day' row.
        ax: Matplotlib axes to draw on.
        n_splits: Number of split rows to label on the y axis.
        lw: Line width of the markers.

    Returns:
        The axes that were passed in.
    """
    cmap_cv = plt.cm.coolwarm

    # pyplot.get_cmap replaces the deprecated matplotlib.cm.get_cmap.
    jet = plt.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))

    # Keeps the summary rows well-defined even if the splitter yields nothing.
    ii = -1
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# fig, ax = plt.subplots()

# feature_names = [i for i in df_train.columns if i not in ['Target', 'date', 'timestamp', 'VWAP', 'Asset_ID', 'groups']]
# cv = PurgedGroupTimeSeriesSplit(
#     n_splits=3,
# #     max_train_group_size=150,
#     group_gap=20,
# #     max_test_group_size=60
# )

# plot_cv_indices(
#     cv,    
#     df_train.loc[df_train["Asset_ID"] == 1][feature_names].values,
#     df_train.loc[df_train["Asset_ID"] == 1]['Target'].values > np.nanmean(df_train.loc[df_train["Asset_ID"] == 1]['Target'].values),
#     df_train.loc[df_train["Asset_ID"] == 1]['groups'].values,
#     ax,
#     5,
#     lw=20
# );

In [None]:
import optuna

# get_features_params = {'lag':1,'less_features': True, 'shuffle':False, 'diff':True}
# Columns that are never model inputs. The comma between 'timestamp' and
# 'Asset_ID' matters: without it Python joins the two literals into
# 'timestampAsset_ID' and neither column is excluded.
feature_names = [i for i in df_train.columns if i not in ['Target', 'date', 'timestamp', 'Asset_ID', 'groups']]
# Testing with Asset_ID == 0 for now
asset_df = df_train[df_train['Asset_ID'] == 0]
y_labels = asset_df['Target'].values
# X_train = asset_df[feature_names].values
# X_train = get_features(asset_df[feature_names], **get_features_params).values
# Day labels of the selected asset's rows; rebinds the module-level `groups`.
groups = asset_df['groups'].values

cv = PurgedGroupTimeSeriesSplit(
    n_splits=3,
    max_train_group_size=160,
    group_gap=5,
    max_test_group_size=60
)

def objective(trial, cv=cv, cv_fold_func=np.average):
    """Optuna objective: cross-validated Pearson correlation for one trial.

    Feature-engineering options and XGBoost hyper-parameters are sampled
    from `trial`, a StandardScaler + XGBRegressor pipeline is fitted on
    every fold produced by `cv`, and the per-fold correlations between the
    predictions and `y_labels` are combined with `cv_fold_func`.

    Args:
        trial: Optuna trial used to sample parameters.
        cv: Splitter exposing ``split(X, y, groups=...)``; defaults to the
            module-level `cv` at definition time.
        cv_fold_func: Callable reducing the list of per-fold correlations
            to a single score.

    Returns:
        The combined correlation score (the study maximizes it).
    """
    # Optuna suggest params for feature engineering
    feature_params = {
        'lag': trial.suggest_int('lag', 0, 20),
        'less_features': bool(trial.suggest_int('less_features', 0, 1)),
        # Never shuffle here: rows of X_train are matched to y_labels and
        # groups by position, so shuffling the features would misalign
        # them. (The old 0/1 suggestion never took effect anyway, because
        # get_features only shuffles on a literal True.)
        'shuffle': False,
        'diff': bool(trial.suggest_int('diff', 0, 1)),
        }
    X_train = get_features(asset_df[feature_names], **feature_params).values
    # Optuna suggest params for model
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
#         'gamma': trial.suggest_int('gamma', 0, 1),
        'missing': -999,
        }

    if DEVICE == 'GPU':
        params['tree_method'] = 'gpu_hist'

    # setup the pipeline
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('xgb', xgb.XGBRegressor(**params))
    ])

    # fit on every fold and collect MAE and correlation
    maes = []
    correlations = []
    for train_idx, valid_idx in cv.split(X_train, y_labels, groups=groups):
        pipe.fit(X_train[train_idx, :], y_labels[train_idx])
        preds = pipe.predict(X_train[valid_idx, :])
        maes.append(mean_absolute_error(y_labels[valid_idx], preds))
        correlations.append(pearsonr(y_labels[valid_idx], preds)[0])

    print(f'Trial done: mae values on folds: {maes}, correlation: {correlations}')
#     return -1.0 * cv_fold_func(maes)
    return cv_fold_func(correlations)


In [None]:
%%time

# Set to False to skip the hyper-parameter search; best_params is then empty.
FIT_XGB = True

n_trials = 60

if not FIT_XGB:
    best_params = {}
else:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial
    best_params = trial.params

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in best_params.items():
        print("    {}: {}".format(key, value))

In [None]:
# maes = []
# correlations = []
# for i, (train_idx, valid_idx) in enumerate(cv.split(
#     X_train,
#     y_labels,
#     groups=groups)):

#     train_data = X_train[train_idx, :], y_labels[train_idx]
#     valid_data = X_train[valid_idx, :], y_labels[valid_idx]

# #         display(X_train[valid_idx, :])
#     pipe = XGB_pipeline()
#     pipe.fit(X_train[train_idx, :], y_labels[train_idx])
#     preds = pipe.predict(X_train[valid_idx, :])
#     mae = mean_absolute_error(y_labels[valid_idx], preds)
#     correlation = pearsonr(y_labels[valid_idx], preds)[0]
# #         print(preds)
#     maes.append(mae)
#     correlations.append(correlation)
# correlations

# Training

In [None]:
def get_Xy_and_model_for_asset(df_train, asset_id, lag = 1, shuffle = False, less_features = False, diff = True):
    """Fit a fresh XGB_pipeline on the rows of one asset and return it.

    Rows of `df_train` whose 'Asset_ID' equals `asset_id` are turned into
    features with get_features (lag, shuffle, less_features and diff are
    forwarded unchanged) and the 'Target' column is used as the label.
    Despite the name, only the fitted model is returned.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    features = get_features(asset_rows, lag = lag, shuffle = shuffle,
                            less_features = less_features, diff = diff)
    # Column assignment aligns on the index, so each target stays with its
    # own row even if get_features reordered the rows.
    features['y'] = asset_rows['Target']

    y = features["y"]
    X = features.drop("y", axis=1)

    # TODO: Try different models here!
    model = XGB_pipeline()
    model.fit(X, y)
    return model

## Loop over all assets

In [None]:
# training models, change params with the 'params' variable.
# Feature options forwarded to get_features for every asset.
params = {'lag':0,'less_features': True, 'shuffle':True, 'diff':False}
# Asset_ID -> fitted model, or None when training that asset failed.
models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    try:
        models[asset_id] = get_Xy_and_model_for_asset(df_train, asset_id, **params)
    except KeyboardInterrupt:
        # Manual stop: keep the models trained so far.
        break
    except Exception:
        # Best effort: report the failure and carry on with the next asset.
        # `Exception` (not a bare except) lets SystemExit and friends through.
        traceback.print_exc()
        models[asset_id] = None

# Testing

In [None]:
# testing the model, prints out correlation every 1000 iterations.

# Row order must be preserved at prediction time.
params['shuffle'] = False
from tqdm import tqdm

y_test = df[df['timestamp'] > 1623542400].Target
X_test = df[df['timestamp'] > 1623542400].iloc[:-1]

# Asset_ID -> frame holding the most recent rows seen for that asset.
dict_tmp = {}
correlations = []
# One prediction per visited test row, in row order (0 when no prediction
# could be made), so pred lines up with the head of y_test.
pred = []
counter = 1
for j , row in tqdm(X_test.iterrows()):
    counter += 1
    asset_id = row['Asset_ID']
    # .get: an asset is absent from `models` if training was interrupted.
    model = models.get(asset_id)
    if model is not None:
        try:
            # putting test data in the correct format, not optimized
            row_frame = row.to_frame().T
            if asset_id in dict_tmp:
                # pd.concat replaces DataFrame.append (removed in pandas 2.0);
                # only the 50 most recent rows are kept.
                dict_tmp[asset_id] = pd.concat([dict_tmp[asset_id], row_frame]).tail(50)
            else:
                dict_tmp[asset_id] = row_frame
            x_test = get_features(dict_tmp[asset_id],**params).tail(1)
            if len(x_test) < 1 :
                pred.append(0)
                continue
            y_pred = model.predict(x_test)[0]
            pred.append(y_pred)
        except KeyboardInterrupt:
            break
        except Exception:
            pred.append(0)
            traceback.print_exc()
    else:
        print('no model found')
        pred.append(0)
    if counter%1000 == 0:
        # Compute the running correlation once and reuse both outputs.
        corr, p_value = pearsonr(y_test[:len(pred)], pred)
        correlations.append(corr)
        print(f'Correlation: {corr}, p-value = {p_value}')
    if counter%20000 == 0:
        break


In [None]:
# Inspect the last feature row built by the test loop and the first ten
# predictions (the bare expression is shown as the cell output).
print(x_test)
pred[:10]

In [None]:
# This assumes that we have minute-by-minute data for each asset

In [None]:
## these numbers are all an artifact of the fact that predictions are all very close to 0.

# fixed lag, new XGBoost params:
    #params = {'lag':5,'less_features': False, 'shuffle':True} test_size = 10000, 0.0051152419695546245 (peak = 0.068)
    #params = {'lag':0,'less_features': True, 'shuffle':True, 'diff': False} test_size = 10000, 0.0051361107196150665 (peak = 0.06)
    #params = {'lag':0,'less_features': True, 'shuffle':True } test_size = 10000, 0.004907722808215038 (peak = 0.06)
    #params = {'lag':0,'less_features': True, 'shuffle':False } test_size = 10000, 0.004716585631229205 (peak = 0.06)

# previous code didn't run correctly, get_features(training) returns all nans except for the two shadow features.

# with scaling:
    # params = {'lag':10,'less_features': True, 'shuffle':False }, test_size = 10000, 0.03236088315425213
    
#with diff, no scaling & model = xgb.XGBRegressor(
#         n_estimators=500 (i think),
#         missing=-999,
#         random_state=1111,
#         tree_method='gpu_hist'  
#         ):
    # params = {'lag':5,'less_features': True, 'shuffle':False }, test_size = 10000, <0
    # params = {'lag':8,'less_features': True, 'shuffle':False }, test_size = 10000, <0
    # params = {'lag':10,'less_features': True, 'shuffle':False }, test_size = 10000, 0.059430976177124534
    # params = {'lag':10,'less_features': True, 'shuffle':True }, test_size = 10000, 0.007976245597496146
    # params = {'lag':20,'less_features': True, 'shuffle':True }, test_size = 10000, <0

    # with lagged features, lag = 3, test_size = 6000, shuffled, correlation for all assets: -0.025720808709807898

#with n_estimators = 1000:

    # params = {'lag':10,'less_features': True, 'shuffle':False }, test_size = 10000, 0.038023285612531356 (but high variance)

# all shit
    # with lagged features, lag = 3, test_size = 4000, correlation for all assets: -0.025720808709807898
    # with lagged features, lag = 3, test_size = 5000, correlation for all assets: 0.010013542572384845
    # with lagged features, lag = 3, test_size = 6000, correlation for all assets: 0.005208812545830533
    # with lagged features, lag = 1, test_size = 6000, correlation for asset_ID = 1: 0.012588759721880852
    # with lagged features,lag = 1, test_size = 6000, correlation for asset_ID = 0: -0.006493023259016858
    # with lagged features, lag = 3, test_size = 6000, correlation for asset_ID = 0: 0.025999920635118752
    # without lagged features, test_size = 6000, correlation for asset_ID = 1: <0
    # without lagged features, test_size = 6000, correlation for asset_ID = 0: <0

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it and you don't know how to.


In [None]:
# env = gresearch_crypto.make_env()
# iter_test = env.iter_test()

# for i, (df_test, df_pred) in enumerate(iter_test):
#     for j , row in df_test.iterrows():
        
#         if models[row['Asset_ID']] is not None:
#             try:
#                 model = models[row['Asset_ID']]
#                 x_test = get_features(row)
#                 y_pred = model.predict(pd.DataFrame([x_test]))[0]
#                 df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
#             except:
#                 df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
#                 traceback.print_exc()
#         else: 
#             df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
        
#     env.predict(df_pred)


