# Overview
This notebook uses [pycaret](https://pycaret.gitbook.io/docs/) to create models that can be used for other pycaret processing and prediction.  Only models that use the gpu are used here.  The purpose of this notebook is to get an end-to-end pycaret train to submission flow and to learn about the capabilities of pycaret.  
* Multiple notebooks are used to create a full pycaret workflow due to memory failures.  
* A few config flags control which portion of the training data is used: a sample from the beginning part, the end part, or the full training data.  
* Top features were generated using a private notebook running [fastai](https://github.com/fastai/fastai) and [fastinference's](https://muellerzr.github.io/fastinference/) ShapInterpretation.  
* Dataset used: https://www.kaggle.com/datasets/robikscube/ubiquant-parquet

In [None]:
from datetime import datetime
from pytz import timezone
# Print the current US/Eastern time; the format matches the timestamps
# used for the run notes in the Objective section.
tz = timezone("US/Eastern")
now_eastern = datetime.now(tz)
print(now_eastern.strftime('%y%m%d-%H-%M:%S:'))

# Objective
220402-20-06:44: initial pycaret try on subset of data, save models for later blend and tune  
220402-20-46:38: use above version (v5) output files to blend and tune  

In [None]:
#pycaret needs v0.23.2
!pip install -qqq scikit-learn=='0.23.2'
!pip install -qqq pycaret
!pip install -qqq optuna

In [None]:
import sklearn                                                                                                                                           
sklearn.__version__

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
import numpy as np
import gc
#from fastai.tabular.all import *
from sklearn.model_selection import (
    TimeSeriesSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    GroupShuffleSplit,
    GroupKFold,
    StratifiedShuffleSplit,
    #StratifiedGroupKFold,
)
#from model_selection import GroupTimeSeriesSplit
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
from scipy.special import comb
from itertools import combinations

import pycaret
from pycaret.regression import *
pycaret.__version__

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (e.g. ``CFG.seed``).

    ``device`` selects the path set; any value other than 'mac', 'wsl' or
    'colab' falls through to the Kaggle paths. ``input_path`` (where
    previously saved models are loaded from) is defined for every device so
    the ``retrain = False`` code path also works outside Kaggle.
    """

    # Execution environment: 'mac', 'wsl', 'colab'; anything else means Kaggle.
    device = 'kaggle'
    # When True, Python warnings are suppressed right after this class.
    ignore_warnings = True
    if device == 'mac':
        data_path = '/Users/USERNAME/jam/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        # Locally, saved models are read back from where they were written;
        # adjust if they live elsewhere.
        input_path = output_path
    elif device == 'wsl':
        data_path = '/mnt/d/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        input_path = output_path
    elif device == 'colab':
        data_path = '/content/drive/MyDrive/data/ubiquant/'
        output_path = data_path + 'output/pycaret/'
        input_path = output_path
    else:  # kaggle
        data_path = '../input/'
        output_path = '/kaggle/working/'
        # Models saved by an earlier notebook version, attached as an input dataset.
        input_path = '../input/ubiquant-pycaret-optuna/'

    # Not referenced in the visible cells; presumably a pickle protocol -- TODO confirm.
    protocol = 4
    # Used for seed_everything() and as the pycaret session_id.
    seed = 1972
    # Number of cross-validation folds passed to pycaret.
    folds = 5
    # Number of trailing rows (after sorting by time_id) to keep; falsy keeps all.
    sample = 1500000  # 2800000 #None
    train_part = 'end'  # use part of training data [all, begin, end]
    # True loads the outlier-free pickle instead of the parquet file (non-mac only).
    use_cleaned_data = False
    # Stem shared by the experiment name and every saved/loaded model file.
    file_name = f'ubiquant_pycaret_{train_part}_{sample}'

    # True: train and save the base models; False: load them from input_path.
    retrain = False
    # True enables Weights & Biases experiment logging.
    wandb = False
    # True blends the base models.
    blend = True
    # True tunes the blended model and saves the result.
    tune = True

# Optionally silence all Python warnings for cleaner notebook output.
if CFG.ignore_warnings:
    import warnings
    warnings.filterwarnings(action='ignore')

# Remove the pandas display limits on column count, total width and cell
# width ('display.max_rows' keeps its default).
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Optional Weights & Biases logging. `log_experiment` is later passed to
# pycaret's setup(): 'wandb' when enabled, False otherwise.
if CFG.wandb:
    # Install on demand (notebook shell command), then authenticate.
    !pip install -qqq wandb
    import wandb
    wandb.login()
    log_experiment = 'wandb'
else:
    log_experiment = False

In [None]:
CFG.file_name

In [None]:
def seed_everything(seed):
    """Seed the random number generators this notebook may rely on.

    Seeds Python's ``random`` module and NumPy, and sets ``PYTHONHASHSEED``.
    PyTorch (CPU, CUDA and the cuDNN deterministic flag) is seeded only when
    ``torch`` has already been imported in this process, so calling this
    function never triggers a torch import.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None. Prints a list with one note per generator that was skipped
        (an empty list when nothing was skipped).
    """
    # Local imports: the notebook does not import `random` at the top, which
    # previously left Python's own RNG unseeded.
    import random
    import sys

    res = []
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    torch = sys.modules.get('torch')
    if torch is None:
        res.append('torch not imported; skipped torch seeding')
    else:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
    print(res)
# Apply the configured seed once, before any data sampling or model training.
seed_everything(CFG.seed)

In [None]:
%%time
# Column names of the full raw feature set: f_0 .. f_299.
n_features = 300
features = [f'f_{idx}' for idx in range(n_features)]

# The parquet file is the default source; the outlier-free pickle is only
# read off-mac when CFG.use_cleaned_data is set.
parquet_file = CFG.data_path + 'ubiquant-parquet/train_low_mem.parquet'
running_on_mac = CFG.device == 'mac'

if CFG.use_cleaned_data and not running_on_mac:
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    train_raw = pd.read_pickle(CFG.data_path + 'wo_outliers_mem_reduced/train_wo_outliers_mem_reduced.pkl')
    train_raw.reset_index(drop=True, inplace=True)
    # ?? set investment and time_id as int ???
    print('... using cleaned training data (wo outliers, with mem_reduction) ...')
else:
    train_raw = pd.read_parquet(parquet_file)
    if not running_on_mac:
        print('... using train_low_mem.parquet ...')

In [None]:
# Id/target columns that are always kept next to the selected features.
KEEP_FEAT = ['row_id', 'time_id', 'investment_id', 'target']
# Selected top features (19 columns, despite the "_50" suffix in the name).
TOP_FEAT_50 = [
    'f_19', 'f_21', 'f_29', 'f_65', 'f_76', 'f_118', 'f_130', 'f_178', 'f_179', 'f_221',
    'f_223', 'f_231', 'f_232', 'f_244', 'f_250', 'f_257', 'f_281', 'f_283', 'f_297',
]
TOP_FEAT = [*KEEP_FEAT, *TOP_FEAT_50]
# reindex keeps exactly these columns in this order; a column missing from
# the source would come back as all-NaN instead of raising.
train_raw = train_raw.reindex(columns=TOP_FEAT)
features = list(TOP_FEAT_50)

In [None]:
# sort before any sample strategy: the sampling below slices rows by
# position (iloc), which only selects by time when rows are in time_id order
train_raw.sort_values('time_id', inplace=True)

In [None]:
train_raw.shape

In [None]:
# Optionally keep only the last CFG.sample rows (rows are already sorted by
# time_id, so the tail holds the latest time_ids).
if CFG.sample:
    train_raw = train_raw.iloc[-CFG.sample:].copy()
    print(f'... using {CFG.sample} sample of data  ...')

# Pick the requested part of the (possibly sampled) data.
# NOTE(review): with CFG.sample set, 'begin' means the first half of the
# sampled tail, not the beginning of the full data set -- confirm intended.
if CFG.train_part != 'all':
    half = len(train_raw) // 2
    if CFG.train_part == 'begin':
        train = train_raw.iloc[:half].copy()
        print('... using begin half part of data ...')
    else:
        # Slice from an explicit start position: a negative-offset slice
        # (iloc[-half:]) returns the whole frame when half == 0.
        train = train_raw.iloc[len(train_raw) - half:].copy()
        print('... using end half part of data ...')
else:
    train = train_raw.copy()
    print('... using all data ...')

train.sort_values('time_id', inplace=True)
train.reset_index(drop=True, inplace=True)

# change types (cleaned data produces float16 for below); astype is the
# vectorized equivalent of applying np.int16 element by element
train['time_id'] = train['time_id'].astype(np.int16)
train['investment_id'] = train['investment_id'].astype(np.int16)

# The raw frame is no longer needed; release it before modelling.
del train_raw
gc.collect()

In [None]:
train.shape

In [None]:
# Keep standalone copies of the id columns (both columns also stay in `train`).
time_id = train['time_id'].values
investment_id = train['investment_id']
# Remove row_id from the frame before it is handed to setup().
_ = train.pop('row_id')

In [None]:
%%time
# Initialise the PyCaret regression experiment on `train`.
# NOTE(review): the train/test split is shuffled and stratified on time_id
# while cross-validation uses the 'timeseries' strategy -- confirm that the
# shuffled split is intended for time-ordered data.
clf1 = setup(
    data=train,  # test_data = X_valid, # hold out test set, not used in this nb
    target='target',
    # preprocessing switches
    normalize=True,
    remove_outliers=True,
    remove_multicollinearity=True,
    polynomial_features=False,
    trigonometry_features=False,
    feature_selection=False,
    feature_interaction=False,
    feature_ratio=False,
    # train/test split and cross-validation
    data_split_shuffle=True,
    data_split_stratify=['time_id'],
    fold_strategy='timeseries',
    fold=CFG.folds,
    session_id=CFG.seed,
    # experiment logging
    log_experiment=log_experiment,
    experiment_name=CFG.file_name,
    # runtime
    use_gpu=True,
    silent=True,
    profile=False,
)

In [None]:
# Show each available estimator's name together with its 'GPU Enabled' flag.
print(models(internal=True)[['Name', 'GPU Enabled']])

In [None]:
# top = compare_models(include=['lr','lasso','ridge','en','svm','knn','rf','xgboost','lightgbm','catboost'], n_select=3,
#                      fold=CFG.folds, turbo=True, cross_validation=True, 
#                      sort='MSE', budget_time=20)

In [None]:
# del top
# gc.collect()

In [None]:
%%time
# LightGBM base model: load a previously saved one, or train and save it.
if not CFG.retrain:
    lightgbm = load_model(f'{CFG.input_path}{CFG.file_name}_lightgbm')
else:
    lightgbm = create_model('lightgbm', fold=CFG.folds)  # best mse,rmse,r2
    save_model(lightgbm, f'{CFG.output_path}{CFG.file_name}_lightgbm', model_only=True)
    # Only the saved file is needed from a retrain run; free the memory.
    del lightgbm
    gc.collect()

In [None]:
%%time
# Lasso base model: load a previously saved one, or train and save it.
if not CFG.retrain:
    lasso = load_model(f'{CFG.input_path}{CFG.file_name}_lasso')
else:
    lasso = create_model('lasso', fold=CFG.folds)  # best mape
    save_model(lasso, f'{CFG.output_path}{CFG.file_name}_lasso', model_only=True)
    # Only the saved file is needed from a retrain run; free the memory.
    del lasso
    gc.collect()

In [None]:
%%time
# SVM base model: load a previously saved one, or train and save it.
if not CFG.retrain:
    svm = load_model(f'{CFG.input_path}{CFG.file_name}_svm')
else:
    svm = create_model('svm', fold=CFG.folds)  # best mae
    save_model(svm, f'{CFG.output_path}{CFG.file_name}_svm', model_only=True)
    # Only the saved file is needed from a retrain run; free the memory.
    del svm
    gc.collect()

In [None]:
%%time
# KNN base model: load a previously saved one, or train and save it.
if not CFG.retrain:
    knn = load_model(f'{CFG.input_path}{CFG.file_name}_knn')
else:
    knn = create_model('knn', fold=CFG.folds)  # best rmsle
    save_model(knn, f'{CFG.output_path}{CFG.file_name}_knn', model_only=True)
    # Only the saved file is needed from a retrain run; free the memory.
    del knn
    gc.collect()

In [None]:
%%time
# Blend the four base models, optimising for R2.
# NOTE: the base models must still be in memory. With CFG.retrain = True they
# are deleted right after saving, so blending needs a run with retrain = False.
if CFG.blend:
    blender = blend_models(estimator_list=[lightgbm, lasso, svm, knn], fold=CFG.folds, optimize='R2')
    # A bare `blender` expression nested inside `if` is never echoed by the
    # notebook, so print the blended estimator explicitly.
    print(blender)

In [None]:
%%time
# Tune the blended model and save the result.
# NOTE: `blender` only exists when CFG.blend was True in the blend cell.
if CFG.tune:
    tuned = tune_model(
        blender,
        fold=CFG.folds,
        optimize='MSE',
        n_iter=10,
        search_library='optuna',
        search_algorithm='random',
        early_stopping=True,
    )

    # Alternative kept for reference: refit on all data before saving
    # [assuming test_data = test_holdout is used in setup()]
    #finalized = finalize_model(tuned)
    #save_model(finalized, f'{CFG.output_path}{CFG.file_name}_finalized')
    save_model(tuned, f'{CFG.output_path}{CFG.file_name}_blender_tuned')