In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype name is ``object``, ``category`` or
    ``datetime64[ns, UTC]`` are left untouched.  Columns whose dtype name
    starts with ``int`` are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max.  Every other column is
    cast to float32 when its min/max lie strictly inside the float32 range,
    otherwise to float64.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate integer widths, narrowest first.
    int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype.name

        if col_type in ['object', 'category', 'datetime64[ns, UTC]']:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Fix: values fitting in int8 used to be cast to int16.
            for int_type in int_types:
                limits = np.iinfo(int_type)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(int_type)
                    break
            # If no width matched (value equal to an int64 bound) the column
            # is left as it was.
        else:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
import pandas as pd
import numpy as np
import gresearch_crypto
import traceback
import datetime
import matplotlib.pyplot as plt

# Set to True to downcast numeric columns right after loading.
REDUCE_MEMORY = False

TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

df = pd.read_csv(TRAIN_CSV)
if REDUCE_MEMORY:
    df = reduce_mem_usage(df)
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")

# Weights ordered by Asset_ID; used as a positional lookup table, which
# assumes Asset_ID values are 0..N-1 with no gaps — TODO confirm against
# asset_details.csv.
asset_to_weight = df_asset_details.Weight.values
# One vectorised lookup instead of a Python-level lambda call per row.
df["Weight"] = asset_to_weight[df["Asset_ID"].values]

In [None]:
def clean(df):
    """Remove, in place, every row of ``df`` holding a NaN or +/-inf value.

    Infinite values are first turned into NaN, then any row with at least
    one NaN is dropped.  ``df`` itself is modified; nothing is returned.
    """
    non_finite = [np.inf, -np.inf]
    df.replace(to_replace=non_finite, value=np.nan, inplace=True)
    df.dropna(axis=0, how="any", inplace=True)

def test_train_split(df):
    X_train = df[df['timestamp'] <= 1623542400].drop('Target', axis=1)
    y_train = df[df['timestamp'] <= 1623542400].Target
    X_test = df[df['timestamp'] > 1623542400].iloc[:-1].drop('Target', axis=1)
    y_test = df[df['timestamp'] > 1623542400].iloc[:-1].Target
    return X_train, y_train, X_test, y_test

# Drop rows containing NaN/inf (mutates df in place), then split by timestamp.
clean(df)
X_train, y_train, X_test, y_test = test_train_split(df)

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Groups are ordered by first appearance in ``groups``.
    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    group_gap : int, default=None
        Number of groups discarded between the end of each train window and
        the start of its test window.  ``None`` is treated as 0.  The same
        number is also used to drop that many leading *samples* (not groups)
        from each test fold.
    verbose : bool, default=False
        Stored on the instance; ``split`` does not currently use it.
    train_gap : int, default=0
        Number of extra groups skipped between consecutive test windows.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False,
                 train_gap = 0 # in days
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose
        self.train_gap = train_gap

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The training set indices for that split, sorted ascending.
        test : list of int
            The testing set indices for that split, sorted ascending.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than folds, or
            if the requested test windows and ``train_gap`` do not fit into
            the available groups.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        train_gap = self.train_gap
        n_splits = self.n_splits
        # The constructor default is None; arithmetic below needs a number.
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # Map every sample to the position of its group in first-appearance
        # order, without a Python-level loop over samples.
        group_labels = np.asarray(groups)
        sorted_labels, first_seen, inverse = np.unique(
            group_labels, return_index=True, return_inverse=True)
        n_groups = len(sorted_labels)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        appearance_order = np.argsort(first_seen)
        position_of_label = np.empty(n_groups, dtype=np.intp)
        position_of_label[appearance_order] = np.arange(n_groups)
        sample_group_pos = position_of_label[inverse.ravel()]

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        first_test_start = (n_groups - n_splits * group_test_size
                            - train_gap * n_splits)
        if first_test_start < 0:
            # A negative start would silently wrap around to the end of the
            # group list and produce meaningless folds.
            raise ValueError(
                ("n_splits={0} test windows of {1} groups with train_gap={2}"
                 " do not fit into {3} groups").format(
                     n_splits, group_test_size, train_gap, n_groups))
        group_test_starts = range(first_test_start,
                                  n_groups, group_test_size + train_gap) # altered

        for group_test_start in group_test_starts:
            # Clamp at 0: a negative slice end used to wrap around and pull
            # groups from the end of the series into the train set.
            train_stop = max(0, group_test_start - group_gap)
            train_start = max(0, train_stop - max_train_group_size)
            train_array = np.flatnonzero(
                (sample_group_pos >= train_start)
                & (sample_group_pos < train_stop))

            test_stop = group_test_start + group_test_size
            test_array = np.flatnonzero(
                (sample_group_pos >= group_test_start)
                & (sample_group_pos < test_stop))

            # Kept from the original: drop the first `group_gap` samples of
            # the test fold.
            test_array = test_array[group_gap:]

            yield train_array.tolist(), test_array.tolist()

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# this is code slightly modified from the sklearn docs here:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    Parameters
    ----------
    cv : cross-validator
        Object whose ``split(X=, y=, groups=)`` yields (train, test) indices.
    X, y, group : array-like
        Samples, targets and group labels passed to ``cv.split``; ``y`` and
        ``group`` are also used to colour the two bottom rows.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of CV rows to label on the y axis.
    lw : int, default=10
        Line width of the markers.

    Returns
    -------
    The same ``ax``.
    """

    cmap_cv = plt.cm.coolwarm

    # plt.cm.get_cmap was deprecated and removed in matplotlib 3.9;
    # plt.get_cmap takes the same (name, lut) arguments.
    jet = plt.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    np.random.shuffle(seq)   # inplace; randomises the group colours
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# Lag-feature helper; process() below calls time_lag to build the (t-1) columns.
from pandas import DataFrame
from pandas import concat
 
def time_lag(data, n_in=1, n_out=1, dropnan=True, interpolate = False):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or a 2-D array/DataFrame.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
        interpolate: Boolean whether or not to back-fill remaining NaN values
            (each NaN takes the next valid value below it in its column).
            Applied after ``dropnan``.
    Returns:
        Pandas DataFrame of series framed for supervised learning, with
        columns named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    if interpolate:
        # fillna(method='bfill') is deprecated in recent pandas; bfill() is
        # the direct equivalent.
        agg.bfill(inplace=True)
    return agg
 

In [None]:
def process(df):
    """Add the model's engineered columns to ``df`` and return the widened frame.

    Writes a constant ``Unity`` column and ``norm_Open`` / ``norm_VWAP``
    (each divided by ``Close``) into ``df`` itself, then returns ``df``
    concatenated column-wise with the one-step lag of VWAP, Volume, Open and
    Close produced by ``time_lag`` (NaNs back-filled).
    """
    df["Unity"] = 1

    reference = "Close"
    for name in ('Open', 'VWAP'):
        df["norm_" + name] = df[name] / df[reference]

    lagged = time_lag(
        df[["VWAP", "Volume", "Open", "Close"]],
        n_in=1,
        n_out=0,
        dropnan=False,
        interpolate=True,
    )
    return pd.concat([df, lagged], axis=1)

# Build the engineered features for both splits, optionally downcasting each.
X_train = process(X_train)
if REDUCE_MEMORY:
    X_train = reduce_mem_usage(X_train)
X_test = process(X_test)
if REDUCE_MEMORY:
    # Fix: this branch used to shrink X_train a second time and skip X_test.
    X_test = reduce_mem_usage(X_test)

In [None]:
#X_test["Unity"] = 1

In [None]:
# NOT USED
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``, row by row."""
    body_top = np.maximum(df['Close'], df['Open'])
    shadow = df['High'] - body_top
    return shadow

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``, row by row."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    shadow = body_bottom - df['Low']
    return shadow

def generate_features(df, lag = 1, shuffle = False):
    """Build a feature frame from the OHLCV columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Count, Open, High, Low, Close, Volume and VWAP.
    lag : int, default=1
        When > 0, the seven base columns are replaced by their ``time_lag``
        framing with ``n_in=lag`` (rows with NaN are dropped by ``time_lag``).
    shuffle : bool, default=False
        When exactly ``True``, the rows are returned in random order.

    Returns
    -------
    pandas.DataFrame
        The (optionally lagged) features plus ``Upper_Shadow`` and
        ``Lower_Shadow`` computed from ``df``.
    """
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()

    #df_feat.fillna(-999,inplace=True)
    if lag > 0:
        # Fix: this used to call series_to_supervised, which is not defined
        # anywhere in this notebook; time_lag is the helper that exists here.
        df_feat = time_lag(df_feat, n_in = lag)
    df_feat['Upper_Shadow'] = upper_shadow(df)
    df_feat['Lower_Shadow'] = lower_shadow(df)
    if shuffle is True:
        df_feat = df_feat.sample(frac=1)
    return df_feat

In [None]:
from sklearn.preprocessing import StandardScaler


class BestModel:
    """Ordinary least-squares linear model fitted via the normal equations.

    No intercept is added; include a constant column in the features if one
    is wanted.
    """

    def __init__(self):
        # Coefficient vector; None until fit() has been called.
        self.beta = None
        # Created but not used by fit/predict (the scaling calls are disabled).
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train):
        """Estimate ``beta`` from ``X_train`` (DataFrame) and ``y_train`` (Series).

        Raises ``numpy.linalg.LinAlgError`` when ``X.T @ X`` is singular.
        """
        #self.scaler.fit(X_train)
        #X = self.scaler.transform(X_train)
        X = X_train.values
        mat = X.T@X
        # Solve the normal equations directly rather than forming the explicit
        # inverse: same solution, better numerical accuracy, and the same
        # LinAlgError on a singular matrix.
        self.beta = np.linalg.solve(mat, X.T@y_train.values)

    def predict(self, X_test):
        """Return ``X_test.values @ beta`` as a NumPy array."""
        #X = self.scaler.transform(X_test)
        X = X_test.values
        return X@self.beta

In [None]:
# Columns fed to each per-asset model. The 'varN(t-1)' names are the lagged
# VWAP, Volume, Open and Close columns that process() appends via time_lag.
features = ["Count", "norm_Open", "Open", "Close", "Volume", "norm_VWAP", "VWAP", "Unity", 'var1(t-1)', 'var2(t-1)', 'var3(t-1)',
       'var4(t-1)']
# Show every available column so the list above can be checked against it.
print(X_train.columns)

In [None]:
# Display the processed training features.
X_train

In [None]:
# Number of training targets.
y_train.shape

In [None]:
# One independent linear model per asset; list position == Asset_ID.
models = [BestModel() for _ in range(len(df_asset_details))]

# Set to True to fit on only the last 1,000,000 training rows.
LESS_SAMPLES = False

if LESS_SAMPLES:
    X_train_ = X_train.copy().reset_index(drop = True).tail(1000000)
    y_train_ = y_train.copy().reset_index(drop = True).tail(1000000)
else:
    X_train_, y_train_ = X_train, y_train

# The two branches used to carry an identical copy of this loop; only the
# training subset differs, so it is selected above and the loop runs once.
y_pred = pd.Series(data=np.full_like(y_test.values, np.nan), index=y_test.index)
for asset_ID, model in enumerate(models):
    is_asset_train = X_train_.Asset_ID == asset_ID
    is_asset_test = X_test.Asset_ID == asset_ID
    X_asset_train = X_train_[is_asset_train]
    y_asset_train = y_train_[is_asset_train]
    X_asset_test = X_test[is_asset_test]

    model.fit(X_asset_train[features], y_asset_train)
    y_pred[is_asset_test] = model.predict(X_asset_test[features])
    print(f"Trained model for asset {asset_ID}")

In [None]:
def percent_same_direction(a,b):
    """Fraction of positions where ``a`` and ``b`` have the same non-zero sign.

    Parameters
    ----------
    a, b : sequence of numbers (list, NumPy array, ...)
        Must have equal length.

    Returns
    -------
    float
        Matches divided by ``len(a)``.  A position where either value is
        zero or NaN never counts as a match.  Returns NaN for empty input.
    """
    assert len(a) == len(b)
    if len(a) == 0:
        # Previously a ZeroDivisionError; the fraction is undefined here.
        return float('nan')
    sign_a = np.sign(np.asarray(a, dtype=float))
    sign_b = np.sign(np.asarray(b, dtype=float))
    # Excluding zeros mirrors the old x/abs(x) form, where 0/0 gave NaN for
    # NumPy inputs (never equal) and raised ZeroDivisionError for Python ones.
    same = (sign_a == sign_b) & (sign_a != 0)
    return int(np.count_nonzero(same)) / len(a)

In [None]:
def corr(a, b, w):
    """Weighted Pearson correlation of ``a`` and ``b`` under weights ``w``."""
    def weighted_cov(x, y):
        # Weighted covariance around the weighted means.
        x_mean = np.average(x, weights=w)
        y_mean = np.average(y, weights=w)
        return np.sum(w * (x - x_mean) * (y - y_mean)) / np.sum(w)

    covariance = weighted_cov(a, b)
    variance_product = weighted_cov(a, a) * weighted_cov(b, b)
    return covariance / np.sqrt(variance_product)


In [None]:
# NOTE(review): alt_weight is not referenced again in the visible cells — looks like a leftover.
alt_weight = np.ones_like(y_pred)
# Weighted correlation between test predictions and targets, using each row's Weight column.
R = corr(y_pred, y_test.values, X_test.Weight)
print(f"{R:.5f}")
# Scores noted from earlier runs:
# 0.01641 shenanigans
# 0.01523 remove two
#0.01464 all

In [None]:
from scipy.stats import pearsonr
# Unweighted Pearson correlation between predictions and targets.
print(pearsonr(y_pred, y_test))
# Fraction of test rows where prediction and target have the same sign.
print(percent_same_direction(y_pred.values, y_test.values))

In [None]:
# First 1000 test rows: prediction vs. actual target. Labels and a legend are
# added because the two unlabelled lines could not be told apart.
plt.figure(figsize = (15,5))
plt.plot(y_pred[:1000], label='prediction')
plt.plot(y_test[:1000], label='target')
plt.xlabel('row index')
plt.ylabel('Target')
plt.legend()


In [None]:
# Each coefficient of the model at index 0, as a share of the summed
# absolute coefficients.
first_beta = models[0].beta
beta_abs_total = np.sum(np.abs(first_beta))
for i, coefficient in enumerate(first_beta):
    print(f"{features[i]}: {coefficient/beta_abs_total:.8f}")

In [None]:
import matplotlib.pyplot as plt

# Absolute value of each coefficient of the model at index 0, in `features` order.
plt.plot(np.abs(models[0].beta))
# Anchor the y axis at zero so magnitudes are comparable by eye.
plt.ylim(bottom=0)

## CV

In [None]:
from sklearn.metrics import roc_auc_score, mean_absolute_error

# get_features_params = {'lag':1,'less_features': True, 'shuffle':False, 'diff':True}
features = ["Count", "norm_Open", "Open", "Close", "Volume", "norm_VWAP", "VWAP", "Unity", 'var1(t-1)', 'var2(t-1)', 'var3(t-1)',
       'var4(t-1)']

# NOTE: run this cell once per kernel session, right after the feature cell.
# It replaces X_train's index, so a second run can no longer match rows to
# y_train by label.
X_train['date'] = pd.to_datetime(X_train['timestamp'], unit = 's')
# Stable sort: rows sharing a timestamp (one per asset) keep their order.
X_train = X_train.sort_values('date', kind='mergesort')
# One group per calendar day, numbered in order of first appearance
# (same grouping as the former "day_month_year" string key).
groups = pd.factorize(X_train['date'].dt.normalize())[0]
X_train['groups'] = groups
# Fix: the targets must follow the same row permutation as the sorted
# features. Previously y_train was only re-indexed, so any reordering by
# the sort silently paired features with the wrong targets.
y_labels = y_train.loc[X_train.index].reset_index(drop = True)
X_train = X_train.reset_index(drop = True)

cv = PurgedGroupTimeSeriesSplit(
    n_splits=5,
    max_train_group_size=500,
    group_gap=20,
    max_test_group_size=80,
    train_gap = 120
)

# Per-fold metrics, filled in by the CV loop.
maes = []
correlations = []
percents_same_direction = []


In [None]:
for i, (train_idx, valid_idx) in enumerate(cv.split(
    X_train,
    y_labels,
    groups=groups)):

    # (features, targets) pairs for this fold.
    train_data = X_train.loc[train_idx, :], y_labels.loc[train_idx]
    valid_data = X_train.loc[valid_idx, :], y_labels.loc[valid_idx]

    # Fix: use a fold-local name. Rebinding the global `models` here meant
    # later cells silently used the last fold's models (fitted on a subset,
    # possibly with unfitted entries) instead of the full-data fit.
    fold_models = [BestModel() for _ in range(len(df_asset_details))]

    y_pred = pd.Series(data=np.full_like(valid_data[1].values, np.nan), index=valid_data[1].index)
    for asset_ID, model in enumerate(fold_models):
        is_asset_train = train_data[0].Asset_ID == asset_ID
        is_asset_val = valid_data[0].Asset_ID == asset_ID
        X_asset_train = train_data[0][is_asset_train]
        y_asset_train = train_data[1][is_asset_train]
        X_asset_val = valid_data[0][is_asset_val]
        try:
            model.fit(X_asset_train[features], y_asset_train)
            y_pred[is_asset_val] = model.predict(X_asset_val[features])
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt and made the loop impossible to stop.
            # A failed asset keeps NaN predictions for this fold.
            traceback.print_exc()
            continue
        print(f"Trained model for asset {asset_ID}")

    mae = mean_absolute_error(valid_data[1], y_pred)
    correlation = corr(valid_data[1], y_pred, valid_data[0]["Weight"])
    same_direction = percent_same_direction(y_pred.values, valid_data[1].values)

    maes.append(mae)
    correlations.append(correlation)
    percents_same_direction.append(same_direction)

print(f'Trial done: mae values on folds: {maes}, correlation: {correlations}')

In [None]:
# Display the most recently computed predictions.
y_pred

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it but don't know how.


In [None]:
# Submission loop driven by the competition API.
# NOTE(review): presumably each iteration yields one batch of test rows plus
# the prediction frame to fill in — confirm against the API introduction.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for i, (df_test, df_pred) in enumerate(iter_test):
    # Start with no predictions; each asset's rows are filled in below.
    df_pred['Target'] = np.nan
    
    # Same feature engineering as for the training data.
    df_test = process(df_test)
    # NOTE(review): `models` is whatever list the most recently executed cell
    # bound to that name — confirm it is the full-data fit, not a CV fold.
    for asset_ID, model in enumerate(models):
        X_asset_test = df_test[df_test.Asset_ID == asset_ID]
        # Row mask comes from df_test; assumes df_test and df_pred share the
        # same index — TODO confirm.
        df_pred.loc[df_test.Asset_ID == asset_ID, 'Target'] = model.predict(X_asset_test[features])
    # Fill any rows still NaN using the 'nearest' interpolation method.
    df_pred['Target'] = df_pred['Target'].interpolate('nearest')
    env.predict(df_pred)

In [None]:
# Completion marker printed by the last cell.
print("oh yes!")