This notebook contains the code for training a model on Google Colaboratory and uploading the outputs as a Kaggle Dataset.  
In Google Colaboratory, you can use more memory than in a Kaggle Notebook by enabling the high-memory runtime setting.  

This pipeline borrows heavily from [this notebook](https://www.kaggle.com/mst8823/19th-place-best-single-model-resbilstm) created by [@mst8823](https://www.kaggle.com/mst8823).


## Setting

The directory structure is assumed to be as follows.

```
MyDrive  
├UbiquantMarketPredictionDrive  
│ └[Author Name]
│    └Notebook  
│       └[This Notebook]
└kaggle.json
```

And when you run everything, the directory structure will look like this, for example.

```
MyDrive  
├UbiquantMarketPredictionDrive  
│ └colum2131
│    ├Notebook  
│    │   └UMP-Exp001-ColabTraining.ipynb
│    ├Input
│    │   ├ubiquant
│    │   ├ubiquant-market-prediction.zip
│    │   ├train.csv
│    │   ├example_test.csv
│    │   └example_sample_submission.csv
│    ├Output
│    │   └UMP-Exp001-ColabTraining
│    │      ├preds
│    │      ├model
│    │      └fig
│    ├Dataset
│    │   └ubiquant-parquet
│    └Submission
└kaggle.json
```
Before running this code, you also need to edit `Config` so that it matches your own environment.

## Inference

* [[Ubiquant] Inference: Google Colaboratory Training](https://www.kaggle.com/columbia2131/ubiquant-inference-google-colaboratory-training) - LB Score: 0.132

## Reference

* [19th Place Best Single Model [ResBiLSTM]](https://www.kaggle.com/mst8823/19th-place-best-single-model-resbilstm) created by [@mst8823](https://www.kaggle.com/mst8823)
* [⏫ Fast Data Loading and Low Mem with Parquet Files](https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files) created by [@robikscube](https://www.kaggle.com/robikscube)
* [Ubiquant Competition Data in Parquet Format](https://www.kaggle.com/robikscube/ubiquant-parquet) created by [@robikscube](https://www.kaggle.com/robikscube)

In [None]:
# IPython shell escape: run `nvidia-smi` to show the GPU attached to this runtime.
!nvidia-smi

In [None]:
class Config:
    """Experiment settings; edit these values to match your own environment."""

    # --- experiment identity ---
    author = "colum2131"  # Your name
    competition = "ubiquant-market-prediction"
    name = "UMP-Exp001-ColabTraining"  # The name of the Dataset
    upload_from_colab = True  # If True, the model uploads to the Kaggle Dataset

    # --- locations on Google Drive ---
    colab_dir = "/content/drive/Shareddrives/UbiquantMarketPredictionDrive"  # Your own directory
    drive_path = f"{colab_dir}/{author}"
    api_path = "/content/drive/MyDrive/kaggle.json"  # Your own api-path

    # --- extra Kaggle datasets to fetch ---
    dataset_path = ["robikscube/ubiquant-parquet"]  # The dataset you want to download

    # --- cross-validation / training ---
    n_fold = 5
    trn_fold = list(range(5))  # folds that are actually trained
    seed = 42
    max_epochs = 100

In [None]:
import os
import gc
import sys
import json
import pickle
import shutil
import random
import joblib
import requests
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from tqdm.auto import tqdm

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
)
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    GroupKFold
)
from sklearn.linear_model import (
    Ridge,
)
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    roc_auc_score,
    mean_absolute_error,
    mean_squared_error,
)

import lightgbm as lgbm
import torch
import tensorflow as tf



def seed_everything(seed=42):
    """Seed the Python, NumPy, TensorFlow and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and turns on cuDNN's deterministic flag.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every framework gets the same seed value.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

def MAE(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``."""
    score = mean_absolute_error(y_true, y_pred)
    return score

def MSE(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    score = mean_squared_error(y_true, y_pred)
    return score

def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly rather than via ``squared=False``:
    that keyword was deprecated in scikit-learn 1.4 and dropped in later
    releases, so the explicit form works on both old and new versions.

    NOTE(review): for multi-output targets the old ``squared=False`` path
    averaged per-output RMSEs, whereas this takes the root of the averaged
    MSE. The two agree for the 1-D targets used in this notebook.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def get_kfold(train, n_splits, seed):
    """Assign every row of ``train`` to a validation fold with shuffled K-fold.

    Returns:
        A Series sorted by index. The index holds the row positions yielded
        by ``KFold.split`` and each value is the fold in which that row is
        used for validation.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    per_fold = [
        pd.Series(fold_no, index=valid_pos)
        for fold_no, (_, valid_pos) in enumerate(splitter.split(train))
    ]
    return pd.concat(per_fold).sort_index()

def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign every row of ``train`` to a validation fold with stratified K-fold.

    Stratification uses ``train[target_col]``.

    Returns:
        A Series sorted by index. The index holds the row positions yielded
        by ``StratifiedKFold.split`` and each value is the fold in which that
        row is used for validation.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    per_fold = [
        pd.Series(fold_no, index=valid_pos)
        for fold_no, (_, valid_pos) in enumerate(
            splitter.split(train, train[target_col])
        )
    ]
    return pd.concat(per_fold).sort_index()

def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign every row of ``train`` to a validation fold with group K-fold.

    Rows are grouped by ``train[group_col]``; ``train[target_col]`` is passed
    to the splitter as ``y``.

    Returns:
        A Series sorted by index. The index holds the row positions yielded
        by ``GroupKFold.split`` and each value is the fold in which that row
        is used for validation.
    """
    splitter = GroupKFold(n_splits=n_splits)
    split_iter = splitter.split(train, train[target_col], train[group_col])
    per_fold = [
        pd.Series(fold_no, index=valid_pos)
        for fold_no, (_, valid_pos) in enumerate(split_iter)
    ]
    return pd.concat(per_fold).sort_index()

In [None]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    if cfg.COLAB:
        print('This environment is Google Colab')
        
        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet tensorflow-addons

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.competition -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.competition+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.dataset_path:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    
    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.competition}'
        cfg.EXP = cfg.name
        cfg.OUTPUT_EXP = cfg.name
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)

    seed_everything(cfg.seed)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    """Create a new Kaggle Dataset from the contents of ``upload_dir``.

    Writes ``dataset-metadata.json`` (id, CC0-1.0 licence, title) into
    ``upload_dir`` and then uploads the folder through the Kaggle API, with
    sub-directories packed as tar archives.

    Args:
        dataset_name: Used both as the dataset title and, together with the
            ``KAGGLE_USERNAME`` environment variable, as the dataset id.
        upload_dir: Directory whose contents are uploaded.

    Raises:
        KeyError: If ``KAGGLE_USERNAME`` is not set in the environment.
    """
    # Imported here so the function does not depend on another cell having
    # imported KaggleApi into the global namespace first.
    from kaggle.api.kaggle_api_extended import KaggleApi

    dataset_metadata = {
        'id': f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [None]:
def fit_lightgbm(cfg, X, y, params, folds, add_suffix=''):
    """Train one LightGBM model per fold and return out-of-fold predictions.

    For every fold in ``cfg.trn_fold`` the rows whose fold id equals that
    fold are held out for validation and the rest are used for training.
    Each fitted model is pickled to ``cfg.EXP_MODEL`` and its validation
    predictions are saved to ``cfg.EXP_PREDS``; the assembled out-of-fold
    vector is saved there as well.

    Args:
        cfg: Config object providing ``trn_fold``, ``max_epochs``,
            ``EXP_MODEL`` and ``EXP_PREDS``.
        X: Feature table, indexable by a boolean mask over its rows.
        y: Target values aligned row-for-row with ``X``.
        params: Parameter dict passed to ``lgbm.train``.
        folds: Fold id for each row, in the same row order as ``X`` / ``y``.
        add_suffix: Text appended to the saved file names.

    Returns:
        ``np.ndarray`` of out-of-fold predictions with ``len(y)`` entries.
        Rows belonging to folds not listed in ``cfg.trn_fold`` stay at 0.
    """
    oof_pred = np.zeros(len(y), dtype=np.float64)
    # Work with positional boolean masks so that writing into ``oof_pred``
    # does not rely on X carrying a 0..n-1 index.
    fold_ids = np.asarray(folds)

    for fold in cfg.trn_fold:
        valid_mask = fold_ids == fold
        train_mask = ~valid_mask
        x_train, y_train = X[train_mask], y[train_mask]
        x_valid, y_valid = X[valid_mask], y[valid_mask]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_valid = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        # NOTE(review): ``verbose_eval`` / ``early_stopping_rounds`` are no
        # longer accepted by ``lgbm.train`` in LightGBM >= 4.0 (callbacks are
        # used instead) -- confirm the installed version before upgrading.
        model = lgbm.train(
            params=params,
            train_set=lgbm_train,
            valid_sets=[lgbm_train, lgbm_valid],
            num_boost_round=cfg.max_epochs,
            verbose_eval=100,
            early_stopping_rounds=100,
        )

        # save model (paths come from the ``cfg`` argument, not a global)
        tmp_path = os.path.join(cfg.EXP_MODEL, f'lgbm_fold{fold}{add_suffix}.pkl')
        with open(tmp_path, 'wb') as f:
            pickle.dump(model, f)

        # save oof-pred
        pred_i = model.predict(x_valid, num_iteration=model.best_iteration)
        oof_pred[valid_mask] = pred_i
        tmp_path = os.path.join(cfg.EXP_PREDS, f'lgbm_fold{fold}{add_suffix}.npy')
        np.save(tmp_path, pred_i)

        score = round(RMSE(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}')

    # save oof-pred
    tmp_path = os.path.join(cfg.EXP_PREDS, f'lgbm_foldall{add_suffix}.npy')
    np.save(tmp_path, oof_pred)

    score = round(RMSE(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred


def pred_lightgbm(cfg, X, add_suffix=''):
    """Average the predictions of every saved LightGBM model for ``X``.

    Models are the pickles in ``cfg.EXP_MODEL`` whose names match
    ``lgbm*{add_suffix}.pkl``.

    Args:
        cfg: Config object providing ``EXP_MODEL``.
        X: Features passed unchanged to each model's ``predict``.
        add_suffix: Suffix used when the models were saved.

    Returns:
        Element-wise mean of the per-model predictions.

    Raises:
        FileNotFoundError: If no model file matches, instead of silently
            returning NaN from the mean of an empty array.
    """
    pattern = os.path.join(cfg.EXP_MODEL, f'lgbm*{add_suffix}.pkl')
    # Sorted so that models are always loaded in the same order.
    model_paths = sorted(glob(pattern))
    if not model_paths:
        raise FileNotFoundError(f'No LightGBM model files match {pattern!r}')

    preds = []
    for model_path in model_paths:
        # NOTE: pickle executes arbitrary code on load; only load model files
        # produced by this pipeline.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        preds.append(model.predict(X))
    return np.mean(np.array(preds), axis=0)

In [None]:
# =========================
# SetUp
# =========================
# setup() fills in the path attributes on Config and returns the same object.
Config = setup(Config)

# 2nd import
# Deferred until after setup(), which pip-installs tensorflow-addons on Colab.
import tensorflow_addons as tfa

In [None]:
# =========================
# Pre-Processing
# =========================
# Load the parquet copy of the training data from the downloaded dataset.
train = pd.read_parquet(
    os.path.join(Config.DATASET, 'ubiquant-parquet/train_low_mem.parquet')
)

# Feature column names: f_0 ... f_299.
feature_cols = [f'f_{i}' for i in range(300)]

# Group folds by time_id, so all rows sharing a time_id land in one fold.
folds = get_groupkfold(train, 'target', 'time_id', Config.n_fold)

# Keep only the columns that are needed after the full frame is dropped.
train_time, train_investment = train['time_id'], train['investment_id']
train_y = train['target']
train_X = train[['investment_id', *feature_cols]]

# Drop the full frame and trigger a garbage-collection pass.
del train
gc.collect()

In [None]:
# =========================
# Training & Upload
# =========================
# LightGBM settings for the baseline regression model (RMSE objective and metric).
lgbm_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'boosting_type': 'gbdt', 
    'learning_rate': 0.1, 
    'num_leaves': 31, 
}
# Trains one model per fold in Config.trn_fold; files are saved with the '_base' suffix.
oof_base_pred = fit_lightgbm(Config, train_X, train_y, lgbm_params, folds, '_base')


# upload output folder to kaggle dataset
if Config.upload_from_colab:
    # dataset_create_new() looks up KaggleApi as a global name, so it is imported here first.
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)