# GroupTimeSeriesCV - Catboost GPU

### Load data from parquet dataset

We will use the [dataset](https://www.kaggle.com/robikscube/ubiquant-parquet) from @robikscube to have a lighter dataframe to deal with. The parquet dataset was created during a live-stream session on his Twitch channel, in case you are curious about how it was built.

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)


# Load the parquet dataset into a pandas DataFrame.
df = pd.read_parquet("../input/ubiquant-parquet/train_low_mem.parquet")

## Time aware CV split

We use GroupTimeSeriesSplit in order to respect the chronology of the data. Unlike sklearn's TimeSeriesSplit, which splits on individual rows, this implementation splits on whole groups, so a time_id is never shared between the training and test sets of a fold. Each fold therefore gets a chronologically ordered, non-overlapping set of time_ids.

In [None]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.

    Groups are ordered by their first appearance in ``groups``. In the kth
    split, every group that comes before the kth test fold forms the
    training set and the next ``n_groups // (n_splits + 1)`` groups form the
    test set. Test folds are taken from the end of the group sequence, so
    any groups left over by the integer division go into the first training
    set. The same group never appears in both the train and the test side
    of a split, and successive training sets are supersets of those that
    come before them (unless truncated by ``max_train_size``).

    In each split, test indices must be higher than before, and thus
    shuffling in cross validator is inappropriate.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set. When set, only the last
        ``max_train_size`` training samples of a split are kept.

    Examples
    --------
    >>> import numpy as np
    >>> groups = np.array(['a'] * 6 + ['b'] * 5 + ['c'] * 4 + ['d'] * 3)
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] TEST: [15, 16, 17]
    """

    @_deprecate_positional_args
    def __init__(self, n_splits=5, *, max_train_size=None):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set. Labels are read positionally; the index of a
            pandas Series is ignored.

        Yields
        ------
        train : list of int
            The sorted positional training set indices for that split.
        test : list of int
            The sorted positional testing set indices for that split.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer distinct groups than
            ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Work on a plain array so that lookups are positional even when a
        # pandas Series with a non-default index is passed in.
        groups = np.asarray(groups)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        _, first_seen, inverse = np.unique(
            groups, return_index=True, return_inverse=True
        )
        n_groups = first_seen.size
        if n_folds > n_groups:
            raise ValueError(
                (
                    "Cannot have number of folds={0} greater than"
                    " the number of groups={1}"
                ).format(n_folds, n_groups)
            )

        # np.unique returns groups in sorted order; re-rank them by first
        # appearance so that the fold order follows the order of the data.
        appearance_order = np.argsort(first_seen)
        rank_of_group = np.empty(n_groups, dtype=np.intp)
        rank_of_group[appearance_order] = np.arange(n_groups)
        # For every sample, the appearance rank of the group it belongs to.
        sample_rank = rank_of_group[inverse.reshape(-1)]

        group_test_size = n_groups // n_folds
        group_test_starts = range(
            n_groups - n_splits * group_test_size, n_groups, group_test_size
        )
        for group_test_start in group_test_starts:
            group_test_stop = group_test_start + group_test_size
            # flatnonzero yields sorted, unique integer positions.
            train_array = np.flatnonzero(sample_rank < group_test_start)
            if self.max_train_size and self.max_train_size < train_array.size:
                # Keep only the most recent max_train_size training samples.
                train_array = train_array[-self.max_train_size:]
            test_array = np.flatnonzero(
                (sample_rank >= group_test_start) & (sample_rank < group_test_stop)
            )
            yield train_array.tolist(), test_array.tolist()

### Training function

We will use CatBoost with GPU as accelerator as a baseline model. We compute and store the competition metric as well for every fold.

In [None]:
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.metrics import mean_squared_error

def train_model(df, train, test, gpu=False):
    """Fit a CatBoost regressor on one CV fold and score it on the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the "row_id", "time_id" and "target" columns; all
        remaining columns are used as features.
    train : sequence of int
        Positional row indices of the training samples.
    test : sequence of int
        Positional row indices of the held-out samples. They are also passed
        to CatBoost as the evaluation set used to pick the best iteration.
    gpu : bool, default=False
        Train on GPU when True, on CPU otherwise.

    Returns
    -------
    model : CatBoostRegressor
        The fitted model.
    mse : float
        Mean squared error on the test rows.
    corr : float
        Mean over time_ids of the per-time_id Pearson correlation between
        target and prediction.
    """
    non_feature_cols = ["row_id", "time_id", "target"]
    model = CatBoostRegressor(task_type="GPU" if gpu else "CPU")

    # The splitter yields positions, so select rows with iloc; materialise
    # each subset only once.
    train_rows = df.iloc[train]
    test_rows = df.iloc[test]

    X = train_rows.drop(columns=non_feature_cols)
    y = train_rows["target"]

    X_test = test_rows.drop(columns=non_feature_cols)
    y_test = test_rows["target"]

    model.fit(X, y, verbose=False, eval_set=(X_test, y_test), use_best_model=True)

    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)

    # Only the columns needed for the metric are kept, to avoid copying
    # every feature column of the test fold.
    scored = test_rows[["time_id", "target"]].assign(prediction=preds)
    corr = (
        scored.groupby("time_id")[["target", "prediction"]]
        .apply(lambda g: stats.pearsonr(g["target"], g["prediction"])[0])
        .mean()
    )  # Competition metric
    return model, mse, corr

### Training N-CV

Set the desired number of splits yourself.

In [None]:
import numpy as np
import gc

N_SPLITS = 5
# Per-fold bookkeeping: time_id range of the training set and test metrics.
dict_results = {"trainmin": [], "trainmax": [], "mse_test": [], "corr": []}
models = []
time_ids = df["time_id"]
splitter = GroupTimeSeriesSplit(n_splits=N_SPLITS)
for train, test in splitter.split(df, groups=time_ids):
    model, mse, corr = train_model(df, train, test, gpu=True)
    # Index only the time_id column (positionally) instead of copying the
    # whole training frame just to read its min/max.
    train_time_ids = time_ids.iloc[train]
    dict_results["trainmin"].append(train_time_ids.min())
    dict_results["trainmax"].append(train_time_ids.max())
    dict_results["mse_test"].append(mse)
    dict_results["corr"].append(corr)
    models.append(model)
    # Free memory held by the previous fold before starting the next one.
    gc.collect()

### Results

Just display the results.

In [None]:
# One row per CV fold, one column per key collected in dict_results.
df_results = pd.DataFrame.from_dict(dict_results, orient="columns")
display(df_results)

### Inference

Now let's use the API to predict on online data. You can set your own weights.

In [None]:
import ubiquant

WEIGHTS = [0., 0., 0., 0., 1.]  # Just use the last one for now
# Validate with explicit errors (asserts are stripped under -O) and a
# tolerant float comparison, since fractional weights rarely sum to exactly 1.
if len(WEIGHTS) != N_SPLITS or len(WEIGHTS) != len(models):
    raise ValueError("WEIGHTS must contain one weight per trained model (N_SPLITS)")
if not np.isclose(sum(WEIGHTS), 1.0):
    raise ValueError("WEIGHTS must sum to 1")

env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # NOTE(review): training also dropped "time_id" and "target"; confirm the
    # columns of test_df match the features the models were trained on.
    X = test_df.drop("row_id", axis=1)
    pred = 0
    for model, weight in zip(models, WEIGHTS):
        if weight == 0:
            # A zero-weight model contributes nothing; skip its prediction.
            continue
        pred += model.predict(X) * weight
    sample_prediction_df["target"] = pred
    env.predict(sample_prediction_df)