# Ubiquant Market Prediction with CatBoost
## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import tensorflow as tf
from sklearn import metrics
import math
import json
import numpy as np
from scipy.special import comb
from scipy import stats
from itertools import combinations
import matplotlib.pyplot as plt

In [None]:
class CombinatorialPurgedGroupKFold():
    """Combinatorial purged cross-validation over ordered groups.

    The distinct group labels, taken in order of first appearance, are cut
    into ``n_splits`` contiguous blocks (the last block absorbs any
    remainder). Every combination of ``n_test_splits`` blocks is used once as
    the test set. Groups located within ``purge`` positions before a test
    block, or within ``purge + int(n_groups * pctEmbargo)`` positions after
    it, are excluded from the training set.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        """Store the splitter configuration.

        Args:
            n_splits: Number of contiguous blocks the ordered groups are cut into.
            n_test_splits: Number of blocks combined into each test set.
            purge: Number of groups dropped on each side of a test block.
            pctEmbargo: Fraction of the group count additionally dropped
                after each test block.
            **kwargs: Accepted and ignored.
        """
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    def split(self, X, y = None, groups = None):
        """Yield ``(train_idx, test_idx)`` lists of positional row indices.

        Args:
            X: Sized container of samples; only its length is used.
            y: Unused.
            groups: Group label of every row, in row order.

        Yields:
            One ``(train_idx, test_idx)`` pair per combination of test blocks.

        Raises:
            ValueError: If ``groups`` is None, its length differs from ``X``,
                there are more folds or splits than groups, or the embargo
                size is negative.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")

        # A plain array makes every lookup positional, whatever index a
        # pandas Series passed as `groups` may carry.
        groups = np.asarray(groups)
        if len(groups) != len(X):
            raise ValueError(
                ("'groups' has length {0} but X has length {1}"
                 ).format(len(groups), len(X)))

        # Distinct labels ordered by first appearance rather than by value.
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)
        group_labels = unique_groups.tolist()

        # Label -> positional row indices belonging to that group.
        group_dict = {}
        for idx, label in enumerate(groups.tolist()):
            group_dict.setdefault(label, []).append(idx)

        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        group_test_size = n_groups // self.n_splits
        if group_test_size == 0:
            raise ValueError(
                ("Cannot have number of splits={0} greater than"
                 " the number of groups={1}").format(self.n_splits,
                                                     n_groups))

        # Each block is a half-open range of positions in `group_labels`.
        split_bounds = []
        for split in range(self.n_splits):
            start = split * group_test_size
            if split == self.n_splits - 1:
                stop = n_groups
            else:
                stop = start + group_test_size
            split_bounds.append((start, stop))

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_pos = set()
            banned_pos = set()
            for split in test_splits:
                start, stop = split_bounds[split]
                test_pos.update(range(start, stop))
                # Purge and embargo are measured in group positions, not
                # label values, so gaps or offsets in the labels do not
                # matter. Clamping avoids a negative start wrapping around.
                banned_pos.update(range(max(0, start - self.purge), start))
                banned_pos.update(
                    range(stop, min(n_groups, stop + self.purge + mbrg)))

            excluded = test_pos | banned_pos
            train_idx = []
            test_idx = []
            for pos, label in enumerate(group_labels):
                if pos in test_pos:
                    test_idx += group_dict[label]
                elif pos not in excluded:
                    train_idx += group_dict[label]
            yield train_idx, test_idx

In [None]:
class Config:
    """Notebook-wide switches read by the later cells."""

    # When True the modeling cell trains and saves one model per fold;
    # when False it loads previously saved fold models and evaluates them.
    is_training: bool = False


config = Config()

### Utilities

In [None]:
def evaluate(model, x_val, y_val, time_id_val):
    """Score ``model`` on a validation slice, print the metrics and return them.

    Args:
        model: Object whose ``predict(x_val)`` result supports ``reshape(-1)``.
        x_val: Validation features handed to ``model.predict``.
        y_val: True targets aligned with ``x_val``.
        time_id_val: Time identifier of each validation row, used to group
            rows for the per-period Pearson correlation.

    Returns:
        dict with keys ``r2``, ``mse``, ``mae``, ``mape``, ``rmse`` and
        ``pearson`` (mean over time ids of the per-time-id correlation).
    """
    predictions = model.predict(x_val).reshape(-1)

    def _period_correlation(frame):
        # pearsonr returns a (statistic, p-value) pair; keep the statistic.
        return stats.pearsonr(frame.y, frame.y_pred)[0]

    scores = {
        "r2": metrics.r2_score(y_val, predictions),
        "mse": metrics.mean_squared_error(y_val, predictions),
        "mae": metrics.mean_absolute_error(y_val, predictions),
        "mape": tf.keras.metrics.mean_absolute_percentage_error(y_val, predictions).numpy(),
    }
    scores["rmse"] = np.sqrt(scores["mse"])

    report = (
        ("R2 Score:", "r2"),
        ("MSE:", "mse"),
        ("MAE:", "mae"),
        ("MAPE", "mape"),
        ("RMSE:", "rmse"),
    )
    for label, key in report:
        print(label, scores[key])

    table = pd.DataFrame(
        {"y": list(y_val), "time_id": list(time_id_val), "y_pred": list(predictions)}
    )
    per_period = table.groupby("time_id").apply(_period_correlation)
    pearson = per_period.mean()
    print(f"Mean Pearson Score: {pearson}")

    scores["pearson"] = pearson
    return scores

def inference(models, df, columns=None):
    """Average the predictions of several models on the same rows.

    Args:
        models: Iterable of objects exposing ``predict``.
        df: Table that can be indexed with a list of column names.
        columns: Column names passed to every model. When omitted, the
            module-level ``feature_columns`` is used.

    Returns:
        Element-wise mean of the per-model predictions
        (``np.mean(..., axis=0)``).

    Raises:
        ValueError: If ``models`` is empty. Averaging nothing would
            otherwise yield NaN and be submitted silently.
    """
    models = list(models)
    if not models:
        raise ValueError("inference() needs at least one model")
    if columns is None:
        # Looked up at call time so the cell defining it may run later.
        columns = feature_columns
    # Slice once; every model sees the same feature view.
    features = df[columns]
    return np.mean([model.predict(features) for model in models], axis=0)


## Import Dataset

In [None]:
%%time
# Names of the 300 feature columns, f_0 through f_299; read by inference().
feature_columns = [f'f_{i}' for i in range(300)]
# Load the training table from a pickle file.
# NOTE(review): the dataset name suggests half-precision dtypes — not verified here.
X = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Preview the first rows (shown as the cell output when run in a notebook).
X.head()

In [None]:
# Detach the identifier and label columns from X; pop() removes each column
# from X and returns it.
# NOTE(review): X is presumably left with only the f_* feature columns —
# confirm against the pickle's schema.
investment_id = X.pop("investment_id")
# time_id is the CV group key and the grouping key for Pearson scoring below.
time_id = X.pop("time_id")
# Regression label.
target = X.pop("target")

## Modeling

In [None]:
%%time
# Either train one CatBoost model per CV fold and save it, or reload
# previously saved fold models and score them; which path runs is decided
# by config.is_training. The resulting models are collected in `models`.
from catboost import CatBoostRegressor
import time
n_splits = 6
# A single test block per fold, so the splitter yields 6-choose-1 = 6 folds.
n_test_splits = 1
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
models = []
if config.is_training:
    # Rows are grouped by time_id so a time period never straddles train and validation.
    for fold, (train_indices, valid_indices) in enumerate(kfold.split(X, groups=time_id)):
        begin = time.time()
        print("=" * 100)
        print(f"Fold {fold}")
        print("=" * 100)
        X_train = X.iloc[train_indices]
        X_val = X.iloc[valid_indices]
        y_train = target.iloc[train_indices]
        y_val = target.iloc[valid_indices]
        # NOTE(review): 'verbose': 1000 presumably sets the logging period in
        # iterations — confirm against the CatBoost docs.
        params = {
            'task_type' : 'GPU',
            'verbose' : 1000,
        }
        model = CatBoostRegressor(**params)
        # NOTE(review): the validation slice is passed as eval_set; if CatBoost
        # uses it to pick the best iteration, scores on this same slice are
        # optimistic — confirm.
        model.fit(X_train, y_train, eval_set=(X_val, y_val))
        # Saved under the fold number; the loading branch below reads the same file name.
        model.save_model(f"model_{fold}.model")
        models.append(model)
        print(f"Elapsed time: {time.time() - begin}")
        # Drop the fold slices before the next fold is materialised.
        del X_train
        del y_train
        del X_val
        del y_val
        gc.collect()
else:
    # Re-run the same split so each loaded model is scored on its own validation fold.
    for fold, (train_indices, valid_indices) in enumerate(kfold.split(X, groups=time_id)):
        X_val = X.iloc[valid_indices]
        y_val = target.iloc[valid_indices]
        time_id_val = time_id.iloc[valid_indices]
        params = {
            'task_type' : 'GPU',
            'verbose' : 1000,
        }
        model = CatBoostRegressor(**params)
        # Load the fold's model from the attached output dataset.
        model.load_model(f"../input/ubiquant-market-prediction-with-catboost-output/model_{fold}.model")
        evaluate(model, X_val, y_val, time_id_val)
        models.append(model)

## Model Evaluation

## Submission

In [None]:
# NOTE(review): `ubiquant` is presumably the competition-provided submission
# module available in the Kaggle runtime — confirm.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
# Each iteration yields a batch of test rows and a prediction template to fill in.
for (test_df, sample_prediction_df) in iter_test:
    # Write the ensemble prediction from inference() into the template's target column.
    sample_prediction_df['target'] = inference(models, test_df)
    # Hand the filled template back to the environment.
    env.predict(sample_prediction_df) 