# Overview
This notebook uses [pycaret](https://pycaret.gitbook.io/docs/) to create models that can be used for other pycaret processing and prediction.  Only models that use the gpu are used here.  The purpose of this notebook is to get an end-to-end pycaret train to submission flow and to learn about the capabilities of pycaret.  
* Subset of training data is used: train.shape = (750000, 23)  
* Multiple notebooks are used to create a full pycaret workflow due to memory.  
* A few config flags are used to enable a sample of training data - beginning part or end part or all training data.  
* Top features were generated using a private notebook running [fastai](https://github.com/fastai/fastai) and [fastinference's](https://muellerzr.github.io/fastinference/) ShapInterpretation.  
* Dataset used: https://www.kaggle.com/datasets/robikscube/ubiquant-parquet  
* Dataset created for no internet option: https://www.kaggle.com/datasets/jmiloser/pycaret-239  

In [None]:
from datetime import datetime
from pytz import timezone

# Stamp this run with the current US/Eastern wall-clock time. The same
# yymmdd-HH-MM:SS: layout prefixes the changelog entries in the Objective cell.
tz = timezone("US/Eastern")
run_started = datetime.now(tz)
print(run_started.strftime('%y%m%d-%H-%M:%S:'))

# Objective
**prior [notebook](https://www.kaggle.com/code/jmiloser/ubiquant-pycaret-optuna)**  
220402-20-06:44: initial pycaret try on subset of data, save models for later blend and tune  
220402-20-46:38: use above version (v5) output files to blend and tune  

**new strategy with ignore_features=['time_id'] and pred/sub option**  
220403-14-16:19: sub logic, disable internet, remove 'time_id' from setup(), score: 0.0689  
220403-17-41:51: (v2) re-create: create_models(), with ignore_features=['time_id'] in setup()   
220403-18-28:42: (v3) create tuned model from blend of models for later pred and sub  
220403-19-54:23: (v4) pred and sub using tuned model from v3 with a subset of training data: train.shape = (750000, 23)  
220406-08-18:11: (v5) create_models() with custom cv object - CombinatorialPurgedGroupKFold() from [@lonnieqin's](https://www.kaggle.com/lonnieqin) [notebook](https://www.kaggle.com/code/lonnieqin/ump-tf-record-combinatorialpurgedgroupkfold/notebook)  
220406-09-14:04: (v6) using custom cv object - create tuned model from blend of models for later pred and sub  
220407-00-42:08: (v7) submission from v6 blended and tuned model. 

In [None]:
# Offline install: scikit-learn 0.23.2 from the wheel bundled in the pycaret-239 dataset
# (the notebook is meant to run with internet disabled).
!pip install ../input/pycaret-239/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
import sklearn                                                                                                                                           
# Echo the scikit-learn version that this kernel actually imported.
sklearn.__version__

In [None]:
import sys

# Make the pycaret and pyod source trees shipped in the offline pycaret-239
# dataset importable. They are appended (not prepended), in this order.
for vendored_src in (
    '../input/pycaret-239/pycaret-master/pycaret-master',
    '../input/pycaret-239/pyod-master/pyod-master',
):
    sys.path.append(vendored_src)

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
import numpy as np
import gc
#from fastai.tabular.all import *
from sklearn.model_selection import (
    TimeSeriesSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    GroupShuffleSplit,
    GroupKFold,
    StratifiedShuffleSplit,
    #StratifiedGroupKFold,
)
#from model_selection import GroupTimeSeriesSplit
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
from scipy.special import comb
from itertools import combinations

import pycaret
from pycaret.regression import *
pycaret.__version__

In [None]:
class CFG:
    """Notebook-wide configuration: locations, sampling strategy and workflow flags.

    Used as a plain namespace (never instantiated); every setting is a class
    attribute evaluated once when this cell runs.
    """
    device = 'kaggle'  # 'mac', 'wsl' or 'colab'; any other value is treated as kaggle
    ignore_warnings = True  # when True, all Python warnings are silenced below

    # Per-device locations:
    #   data_path   - root folder of the input datasets
    #   output_path - where models created by this notebook are saved
    #   input_path  - where previously saved models are loaded from
    # input_path used to exist only for kaggle, so every load_model() call
    # raised AttributeError on the other devices. Locally, models are now read
    # back from the folder this notebook saves them to.
    if device == 'mac':
        data_path = '/Users/USERNAME/jam/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        input_path = output_path
    elif device == 'wsl':
        data_path = '/mnt/d/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        input_path = output_path
    elif device == 'colab':
        data_path = '/content/drive/MyDrive/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        input_path = output_path
    else: # kaggle
        data_path = '../input/'
        output_path = '/kaggle/working/'
        input_path = '../input/ubiquant-pycaret-blend-optuna-sub/'

    protocol = 4  # NOTE(review): not referenced in this notebook; presumably a pickle protocol - confirm
    seed = 1972  # passed to seed_everything() and to setup(session_id=...)
    folds = 5  # number of CV blocks / folds
    sample = 1500000 #2800000 #None  (falsy value = keep all rows)
    train_part = 'end' # use part of training data [all, begin, end]
    use_cleaned_data = False  # True: load the outlier-free, memory-reduced pickle instead of the parquet
    # Name stem for the experiment and for saved/loaded model files. The
    # spelling is left untouched because it is part of those file names.
    file_name = f'ubiquant_pycaret_blend_oputna_sub_{train_part}_{sample}'

    # Workflow switches
    retrain = False  # train and save the base models instead of loading them
    wandb = False    # install/login wandb and log the experiment there
    blend = False    # blend the base models
    tune = False     # tune the blender and save the result
    pred = True      # load the tuned blender and run the submission loop

# Optionally silence every Python warning for the rest of the notebook.
if CFG.ignore_warnings:
    import warnings
    warnings.filterwarnings('ignore')

# Lift pandas' display limits on column count, total width and cell width.
# The row limit ('display.max_rows') is intentionally left at its default.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
if CFG.wandb:
    !pip install -qqq wandb
    import wandb
    wandb.login()
    log_experiment = 'wandb'
else:
    log_experiment = False

In [None]:
# Show the name stem used for the experiment name and for saved/loaded model files.
CFG.file_name

In [None]:
def seed_everything(seed):
    """Seed the random number generators this notebook can influence.

    Always seeds Python's ``random`` module, sets the ``PYTHONHASHSEED``
    environment variable and seeds NumPy's global RNG. PyTorch (CPU, CUDA and
    the cuDNN deterministic flag) is seeded only when ``torch`` has already
    been imported by something in this process, so a heavy dependency is never
    pulled in just for seeding.

    The earlier version looked up ``random`` and ``torch`` as global names and
    swallowed the resulting ``NameError``. Neither name is imported explicitly
    by this notebook, so Python's RNG was silently left unseeded.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.

    Returns
    -------
    None
    """
    # Local imports keep this cell self-contained on a fresh kernel.
    import random
    import sys

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # Only touch torch if it is already loaded; never import it here.
    torch = sys.modules.get('torch')
    if torch is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
# Apply the configured seed before any sampling or model fitting further down.
seed_everything(CFG.seed)

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]

if CFG.device == 'mac':
    train_raw = pd.read_parquet(CFG.data_path + 'ubiquant-parquet/train_low_mem.parquet')
else:
    if CFG.use_cleaned_data:
        train_raw = pd.read_pickle(CFG.data_path + 'wo_outliers_mem_reduced/train_wo_outliers_mem_reduced.pkl')
        train_raw.reset_index(drop=True, inplace=True)
        # ?? set investment and time_id as int ???
        print('... using cleaned training data (wo outliers, with mem_reduction) ...')
    else:
        #train_raw = pd.read_pickle(CFG.data_path + 'ubiquant-market-prediction-half-precision-pickle/train.pkl')
        train_raw = pd.read_parquet(CFG.data_path + 'ubiquant-parquet/train_low_mem.parquet')
        print('... using train_low_mem.parquet ...')

In [None]:
# Identifier and target columns that must survive the column reduction.
KEEP_FEAT = ['row_id', 'time_id', 'investment_id', 'target']

# Selected feature columns (19 entries, despite the "_50" suffix in the name).
TOP_FEAT_50 = [
    f'f_{feat_no}'
    for feat_no in (19, 21, 29, 65, 76, 118, 130, 178, 179, 221,
                    223, 231, 232, 244, 250, 257, 281, 283, 297)
]
TOP_FEAT = KEEP_FEAT + TOP_FEAT_50

# Keep exactly these columns, in this order, and replace the full
# 300-feature list built when the data was loaded.
train_raw = train_raw.reindex(columns=TOP_FEAT)
features = list(TOP_FEAT_50)

In [None]:
# sort before any sample strategy
# (the sampling cell further down slices rows by position, so they must be in time_id order first)
train_raw.sort_values('time_id', inplace=True)

In [None]:
# Shape before any row sampling is applied.
train_raw.shape

In [None]:
# Step 1 - optional row cap: keep only the last CFG.sample rows.
if CFG.sample:
    train_raw = train_raw.iloc[-CFG.sample:].copy()
    print(f'... using {CFG.sample} sample of data  ...')

# Step 2 - optional halving of whatever step 1 left over.
if CFG.train_part == 'all':
    train = train_raw.copy()
    print('... using all data ...')
else:
    half = int(len(train_raw) / 2)
    if CFG.train_part == 'begin':
        train = train_raw.iloc[:half].copy()
        print('... using begin half part of data ...')
    else:
        # Last `half` rows. Caveat: with fewer than two rows half == 0 and
        # iloc[-0:] selects the whole frame.
        train = train_raw.iloc[-half:].copy()
        print('... using end half part of data ...')

train.sort_values('time_id', inplace=True)
train.reset_index(drop=True, inplace=True)

# Change types (the cleaned data delivers float16 for these two columns).
# NOTE(review): Series.apply with a NumPy scalar type may come back as int64
# rather than int16 - confirm the resulting dtype before relying on it.
for id_col in ('time_id', 'investment_id'):
    train[id_col] = train[id_col].apply(np.int16)

# Free the (possibly much larger) source frame.
del train_raw
gc.collect()

In [None]:
# Shape of the training frame after row sampling / halving.
train.shape

In [None]:
# Keep handles on the id columns (a Series for investment_id, a NumPy array for time_id).
investment_id = train['investment_id']
time_id = train['time_id'].values
# Remove row_id from the training frame in place; the popped column is discarded.
_ = train.pop("row_id")

# CV object

In [None]:
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
from scipy.special import comb
from itertools import combinations

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Draw one horizontal band per CV split, plus a "class" and a "group" band.

    Parameters
    ----------
    cv : cross-validator
        Object whose ``split(X=..., y=..., groups=...)`` yields
        ``(train_indices, test_indices)`` pairs.
    X : sized
        Data to split; only its length and whatever ``cv.split`` needs are used.
    y : array-like or None
        Colours of the "class" band; also forwarded to ``cv.split``.
    group : array-like
        Colours of the "group" band; also forwarded to ``cv.split`` as ``groups``.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of split rows used for the tick labels and the y-limits.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    matplotlib Axes
        The same ``ax``, for chaining.
    """
    n_samples = len(X)
    sample_axis = range(n_samples)

    # One row per split: 1 marks test samples, 0 marks train samples, and
    # samples in neither set stay NaN. Train is written last, so it wins on overlap.
    for split_no, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        membership = np.full(n_samples, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0
        ax.scatter(
            sample_axis,
            [split_no + 0.5] * n_samples,
            c=membership,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Two extra rows after the last split: the classes first, then the groups.
    for row_offset, colours in ((1.5, y), (2.5, group)):
        ax.scatter(
            sample_axis,
            [split_no + row_offset] * n_samples,
            c=colours,
            marker="_",
            lw=lw,
            cmap=cmap_data,
        )

    # Axis cosmetics: the y axis is inverted so split 0 is drawn at the top.
    row_labels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=row_labels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, n_samples],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# Source: https://www.kaggle.com/code/lonnieqin/ump-tf-record-combinatorialpurgedgroupkfold/notebook
# with get_n_splits() added for blend_models()
class CombinatorialPurgedGroupKFold():
    """Combinatorial, purged and embargoed K-fold over groups.

    The distinct groups, in order of first appearance, are cut into
    ``n_splits`` contiguous blocks (the last block absorbs any remainder).
    Every combination of ``n_test_splits`` blocks is used once as the test
    set. Groups right before a test block (``purge`` groups) and right after
    it (``purge`` plus ``int(n_groups * pctEmbargo)`` groups) are excluded
    from the training set of that fold.

    Parameters
    ----------
    n_splits : int, default 6
        Number of contiguous group blocks.
    n_test_splits : int, default 2
        Number of blocks forming the test set of each fold.
    purge : int, default 1
        Number of groups dropped from training on each side of a test block.
    pctEmbargo : float, default 0.01
        Fraction of all groups additionally dropped after each test block.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    # added 04-06-2022 for PyCaret blend()
    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        This is the number of folds ``split`` yields, i.e. the number of ways
        to choose ``n_test_splits`` blocks out of ``n_splits``. It equals
        ``n_splits`` only when ``n_test_splits`` is 1.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.
        y : object
            Always ignored, exists for compatibility.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return comb(self.n_splits, self.n_test_splits, exact = True)

    def split(self, X, y = None, groups = None):
        """Yield ``(train_indices, test_indices)`` for every test-block combination.

        Parameters
        ----------
        X : sized
            Data to split; only ``len(X)`` is used.
        y : object
            Ignored, exists for compatibility.
        groups : array-like
            Group label of every sample. Labels may be arbitrary hashable
            values; all purge/embargo arithmetic is done on group positions.

        Yields
        ------
        train_idx, test_idx : list of int
            Positional sample indices of the fold.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are more folds than groups, or if
            the embargo size is negative.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")

        # Plain ndarray so groups[idx] is positional even for a pandas Series
        # carrying a non-default index.
        groups = np.asarray(groups)

        # Distinct groups in order of first appearance.
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)

        # group label -> sample indices, in sample order
        group_dict = {}
        for idx in range(len(X)):
            group_dict.setdefault(groups[idx], []).append(idx)

        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        # Half-open [start, stop) position range of every block.
        group_test_size = n_groups // self.n_splits
        bounds = []
        for split in range(self.n_splits):
            start = split * group_test_size
            stop = n_groups if split == self.n_splits - 1 else start + group_test_size
            bounds.append((start, stop))

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            in_test = np.zeros(n_groups, dtype = bool)
            banned = np.zeros(n_groups, dtype = bool)
            for split in test_splits:
                start, stop = bounds[split]
                in_test[start:stop] = True
                # Purge before the block; clamped at 0 so that a negative
                # start cannot wrap around and silently ban nothing.
                banned[max(0, start - self.purge):start] = True
                # Purge plus embargo after the block.
                banned[stop:stop + self.purge + mbrg] = True

            train_idx = []
            test_idx = []
            for pos, group in enumerate(unique_groups):
                if in_test[pos]:
                    test_idx += group_dict[group]
                elif not banned[pos]:
                    train_idx += group_dict[group]
            yield train_idx, test_idx

In [None]:
# Placeholder target; the real values are left commented out.
y = None # y = train['target'].values
# Encode every row's time_id as an integer code; the array of unique values is discarded.
groups, _ = pd.factorize(train['time_id'])
# Echo the number of rows and the codes.
len(groups), groups

In [None]:
# Custom cross-validator handed to setup() as fold_strategy:
# CFG.folds blocks, one test block per fold, 1 purged group, 5% embargo.
cv = CombinatorialPurgedGroupKFold(
    n_splits=CFG.folds,
    n_test_splits=1,
    purge=1,
    pctEmbargo=0.05,
)

# The split visualisation is only drawn on (re)training runs.
if CFG.retrain:
    fig, ax = plt.subplots()
    plot_cv_indices(cv=cv, X=train, y=None, group=groups, ax=ax, n_splits=CFG.folds)

Modifications to setup():  
* data_split_shuffle = False
* data_split_stratify = False
* fold_strategy= cv
* ignore_features=None

In [None]:
%%time
clf1 = setup(data= train, test_data = None, # hold out test set
             target= 'target', normalize = True, remove_outliers = True, 
             remove_multicollinearity = True, polynomial_features = False,
             trigonometry_features = False, feature_selection = False, 
             feature_interaction = False, feature_ratio = False, 
             data_split_shuffle = False, data_split_stratify = False,
             fold_strategy= cv, fold = CFG.folds, session_id= CFG.seed,
             log_experiment = log_experiment, experiment_name = CFG.file_name,
             use_gpu = True, silent= True, profile = False, ignore_features=None)

In [None]:
# Print the name of every model in the internal model table together with its 'GPU Enabled' column.
print(models(internal=True)[['Name', 'GPU Enabled']])

In [None]:
# Only on retraining runs: cross-validate the candidate models and keep the
# three best by MSE, within the given time budget.
if CFG.retrain:
    candidate_ids = ['lr', 'lasso', 'ridge', 'en', 'svm', 'knn', 'rf', 'xgboost', 'lightgbm', 'catboost']
    top = compare_models(
        include=candidate_ids,
        n_select=3,
        fold=CFG.folds,
        turbo=True,
        cross_validation=True,
        sort='MSE',
        budget_time=20,
    )

In [None]:
# On retraining runs, show the selected models, then drop the reference and collect garbage.
if CFG.retrain:
    display(top)
    del top
    gc.collect()

### Re-calc groups with setup() training data

In [None]:
# Rebuild the group codes from the training frame that setup() holds.
# NOTE(review): presumably its rows differ from `train` after preprocessing - confirm.
_X = get_config('X_train')
groups, _ = pd.factorize(_X['time_id'])
# Echo the number of rows and the codes.
len(groups), groups

In [None]:
%%time
if CFG.pred:
    pass
elif CFG.retrain:
    lr = create_model('lr', fold=CFG.folds, groups=groups) # add groups param for custom cv object
    save_model(lr, f'{CFG.output_path}{CFG.file_name}_lr', model_only=True)
    del lr
    gc.collect()
else:
    lr = load_model(f'{CFG.input_path}{CFG.file_name}_lr')

In [None]:
%%time
if CFG.pred:
    pass
elif CFG.retrain:
    lightgbm = create_model('lightgbm', fold=CFG.folds, groups=groups)
    save_model(lightgbm, f'{CFG.output_path}{CFG.file_name}_lightgbm', model_only=True)
    del lightgbm
    gc.collect()
else:
    lightgbm = load_model(f'{CFG.input_path}{CFG.file_name}_lightgbm')

In [None]:
%%time
if CFG.pred:
    pass
elif CFG.retrain:
    lasso = create_model('lasso', fold=CFG.folds, groups=groups)
    save_model(lasso, f'{CFG.output_path}{CFG.file_name}_lasso', model_only=True)
    del lasso
    gc.collect()
else:
    lasso = load_model(f'{CFG.input_path}{CFG.file_name}_lasso')

In [None]:
%%time
if CFG.pred:
    pass
elif CFG.retrain:
    svm = create_model('svm', fold=CFG.folds, groups=groups)
    save_model(svm, f'{CFG.output_path}{CFG.file_name}_svm', model_only=True)
    del svm
    gc.collect()
else:
    svm = load_model(f'{CFG.input_path}{CFG.file_name}_svm')

In [None]:
%%time
if CFG.pred:
    pass
elif CFG.retrain:
    knn = create_model('knn', fold=CFG.folds, groups=groups) # best rmsle
    save_model(knn, f'{CFG.output_path}{CFG.file_name}_knn', model_only=True)
    del knn
    gc.collect()
else:
    knn = load_model(f'{CFG.input_path}{CFG.file_name}_knn')

In [None]:
%%time
if CFG.blend:
    blender = blend_models(estimator_list=[lr, lightgbm, lasso, svm, knn], fold=CFG.folds, optimize='R2', groups=groups)
    blender

In [None]:
%%time
# Tune the blended model (only when CFG.tune is set) and save the result
# under CFG.output_path for the later prediction/submission run.
# Relies on `blender` existing, i.e. on the blend step having run in this session.
if CFG.tune:
    tuned = tune_model(blender, fold=CFG.folds, optimize='MSE', n_iter=10, search_library='optuna', 
                       search_algorithm='random', early_stopping=True, groups=groups)

    # Left disabled by the author: finalizing on all data before saving.
    #finalized = finalize_model(tuned) # for using all data [assuming test_data = test_holdout is used in setup()]
    #save_model(finalized, f'{CFG.output_path}{CFG.file_name}_finalized')
    save_model(tuned, f'{CFG.output_path}{CFG.file_name}_blender_tuned')

# Predictions

In [None]:
# Prediction / submission run: load the tuned blender saved by an earlier run
# and feed every test batch from the competition API through it.
if CFG.pred:
    tuned = load_model(f'{CFG.input_path}{CFG.file_name}_blender_tuned')
    # Rebinds the global column lists: same columns as in training, minus the target.
    # NOTE(review): row_id is kept here although it was popped from the training
    # frame before setup() - confirm predict_model tolerates the extra column.
    KEEP_FEAT = ['row_id', 'time_id', 'investment_id'] # remove 'target'
    TOP_FEAT = KEEP_FEAT + TOP_FEAT_50
    
    # Competition-provided module; presumably only importable inside the Kaggle environment.
    import ubiquant
    env = ubiquant.make_env()
    iter_test = env.iter_test()
    
    # Each iteration yields a test batch and the prediction frame to fill for it.
    for (test_df, sample_prediction_df) in iter_test:
        #display(test_df)
        # Keep exactly the selected columns, in order; a column missing from the batch comes back as NaN.
        test_df = test_df.reindex(columns=TOP_FEAT)
        test_df.fillna(0, inplace=True) # fix due to using time_id in setup(), better to use external cv strat and/or ignore_feature='time_id'
        #display(test_df)
        preds = predict_model(tuned, data=test_df)
        # The 'Label' column of the predict_model output is written out as the target.
        sample_prediction_df['target'] = preds['Label'].values
        #display(sample_prediction_df)
        # Hand the filled frame back to the environment before the next batch is requested.
        env.predict(sample_prediction_df)