# TabNet Baseline

- TabNet is widely used in tabular-data competitions. This notebook is just a basic first attempt, adapted from a LightGBM baseline. I hope you like it, and I would welcome any suggestions.
- 大家好，这是我第一次上传我的notebook，TabNet是表格类数据比赛中常用的手法之一。这个笔记本是根据别人开源的LightGBM稍微更改了一点的笔记本。希望大家多给一些意见。
- 今回は初めて自分のノートブックをアップロードしました。TabNetは表データのコンペでよく使われている手法です。このノートブックはLightGBMのベースラインからTabNetが使えるように少し変更したものです。皆さんからアドバイスを頂けたら嬉しいです。

- [Paper](https://arxiv.org/abs/1908.07442v5)

# What I want to try next.
- Version 1 :Baseline
- Version 2 :Feature filtering (based on importance, etc.), change the validation method
- Version 3 :Optuna tuning parameters
- Version 4 :Ensemble with LightGBM

## INSTALL

In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl
!pip -q install ../input/talib-binary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer

## Import

## config

In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split


import warnings
# Silence all warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option in newer pandas releases and raises OptionError there.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed the Python, NumPy and (when installed) PyTorch random generators.

    Args:
        seed: Value handed to every generator.

    ``PYTHONHASHSEED`` is exported as well. The hash seed of the running
    interpreter is fixed at start-up, so that variable only influences
    Python processes launched afterwards.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Imported lazily so the helper keeps working where torch is unavailable.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed)
    
def _smallest_int_dtype(lo, hi, candidates):
    """Return the first dtype in *candidates* whose range contains [lo, hi], else None."""
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of *df* in place to reduce memory usage.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type that holds their min and max (bounds inclusive);
    * float columns are cast to float16 when their range fits, otherwise
      float32, otherwise float64. Only the value *range* is checked, so the
      narrower float types can lose precision;
    * every other dtype (bool, datetime, categorical, other extension
      dtypes) is left untouched.

    Memory usage before and after is printed.

    Args:
        df: DataFrame to optimise; its columns are reassigned in place.

    Returns:
        The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast; extension dtypes
        # (e.g. category) do not support the min/max comparison below.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            if df[col].empty:
                continue
            # Python ints avoid fixed-width overflow when comparing bounds.
            lo, hi = int(df[col].min()), int(df[col].max())
            candidates = unsigned_types if kind == 'u' else signed_types
            target = _smallest_int_dtype(lo, hi, candidates)
            if target is not None:
                df[col] = df[col].astype(target)
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN / infinite columns.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Run-wide settings gathered in a single namespace object.
args = Namespace(
    INFER=True,
    debug=False,
    seed=21,
    folds=5,
    workers=4,
    min_time_id=None,
    holdout=True,
    num_bins=16,
    data_path=Path("../input/ubiquant-parquet/"),
)
seed_everything(args.seed)

# Debug runs set a lower bound on time_id; the training frame is filtered
# with it when it is loaded.
if args.debug:
    args.min_time_id = 1100

In [None]:
%%time
# Load the training table and sanity-check it before any further use.
train = pd.read_parquet(args.data_path / "train_low_mem.parquet")
assert not train.isnull().any().any(), "null exists."
# row_id must be exactly "<time_id>_<investment_id>" for every row.
assert (
    train.row_id.str.extract(r"(?P<time_id>\d+)_(?P<investment_id>\d+)")
    .astype(train.time_id.dtype)
    .equals(train[["time_id", "investment_id"]])
), "row_id!=time_id_investment_id"

# Optionally keep only rows at or after the configured time_id.
if args.min_time_id is not None:
    train = train[train.time_id >= args.min_time_id].reset_index(drop=True)
    gc.collect()


# StratifiedKFold by time_span

In [None]:
# Columns treated as categorical, and their positions in the training frame.
cat_features = ["investment_id"]
cat_idxs = [
    position
    for position, column in enumerate(train.columns.tolist())
    if column in cat_features
]
# The frame was only needed for its column layout.
del train

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from sklearn.metrics import mean_squared_error


In [None]:
# Number of fold archives built and fold models averaged in the cells below.
# NOTE(review): this differs from args.folds (5) — confirm which is intended.
folds = 1

In [None]:
import os
import zipfile
 
def zipDir(dirpath, outFullName):
    """Compress every file under *dirpath* into a new zip archive.

    Args:
        dirpath: Directory whose files (including those in nested
            sub-directories) are added to the archive.
        outFullName: Path of the zip file to create; an existing file is
            overwritten.

    Archive member names are the file paths relative to *dirpath*.
    """
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED) as archive:
        for path, dirnames, filenames in os.walk(dirpath):
            for filename in filenames:
                full_path = os.path.join(path, filename)
                # relpath strips only the leading dirpath prefix; a plain
                # str.replace would also remove later occurrences of the
                # same text inside nested directory names.
                archive.write(full_path, os.path.relpath(full_path, dirpath))
    

# Build one archive per fold in the working directory.
# NOTE(review): every fold zips the same input directory — confirm this is
# intended if folds is ever raised above 1.
for fold in range(folds):
    input_path = '../input/tabnet-res-fold1/'
    output_path = f"./fold{fold}.zip"
    zipDir(dirpath=input_path, outFullName=output_path)


# Infer

In [None]:
import copy

import ubiquant

# Competition API objects: the environment and its iterator over test batches.
env = ubiquant.make_env()
iter_test = env.iter_test()

# Constructor arguments for the TabNet regressor.
# NOTE(review): load_model may restore its own hyper-parameters from the
# archive and override these — confirm against the pytorch-tabnet docs.
tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_emb_dim=1,
    n_d=16,
    n_a=16,
    n_steps=2,
    gamma=1.4690246460970766,
    n_independent=9,
    n_shared=4,
    lambda_sparse=0,
    optimizer_fn=Adam,
    optimizer_params=dict(lr=0.024907164557092944),
    mask_type="entmax",
    scheduler_params=dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
    scheduler_fn=CosineAnnealingWarmRestarts,
    seed=42,
    verbose=10,
)

# Load one model per fold. Each fold reads its own archive (fold{fold}.zip);
# previously every iteration re-loaded fold0.zip, so an ensemble of several
# folds would have averaged identical models.
clf = TabNetRegressor(**tabnet_params)
models = []
for fold in range(folds):
    clf.load_model(f"./fold{fold}.zip")
    # Deep copy so the next load_model call does not overwrite this fold.
    model = copy.deepcopy(clf)
    models.append(model)

# Predict each test batch with every fold model and submit the mean.
for (test_df, sample_prediction_df) in iter_test:
    # NOTE(review): the original listed "row_id" twice; if a second column
    # was meant to be dropped as well, add it here.
    test_df.drop(columns=["row_id"], inplace=True)
    final_pred = [fold_model.predict(test_df.values) for fold_model in models]
    sample_prediction_df['target'] = np.mean(np.stack(final_pred), axis=0)
    env.predict(sample_prediction_df)