# Tabnet starter

* As a learning exercise, I try out **TabNet** for this competition, based on [this notebook](https://www.kaggle.com/code/wangqihanginthesky/baseline-tabnet).

* The neural network model in this notebook does not seem to be well trained, but feature engineering may be able to improve its accuracy.

* Note: I am a beginner in machine learning, so if I have missed or misunderstood something, please let me know in the comments.



In [None]:
# Install pytorch-tabnet and the TA-Lib binary from wheel files stored under ../input
# (local file paths, so presumably no package index / internet access is required).
# NOTE(review): no `talib` import appears in the cells visible here - confirm the second wheel is still needed.

!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl
!pip -q install ../input/talib-binary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
from sklearn.preprocessing import StandardScaler
import gc
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed: int = 42) -> None:
    """Seed the global Python and NumPy random generators and export PYTHONHASHSEED.

    Args:
        seed (int): value handed to every seeding call and written (as a string)
            to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both generators take the same integer seed.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once at notebook start-up with the default value.
seed_everything()

# config

In [None]:
class GCF:
    """Global configuration constants for this notebook.

    NOTE(review): none of these values is referenced by the cells visible in
    this notebook; the training call passes its own literals - confirm whether
    they are meant to be wired in.
    """

    # Random seed
    SEED = 0

    # Training-loop settings
    N_EPOCHS = 30
    BATCH_SIZE = 30_000

    # Early-stopping settings
    EARLY_STOPPING_PATIENCE = 10
    EARLY_STOPPING_MIN_DELTA = 0.001

# Data Load

In [None]:
# Read the competition's stock price table into a DataFrame.
stock_prices_csv = "../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv"
train_df = pd.read_csv(stock_prices_csv)
# Bare expression so the notebook renders the loaded table.
train_df

# preprocess

In [None]:
# Convert the date strings to sortable YYYYMMDD integers (e.g. 20201222)
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Date'] = train_df['Date'].dt.strftime('%Y%m%d').astype(int)

# Drop the columns that are not used as features, target or split key
train_df = train_df.drop(['RowId', 'SecuritiesCode', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'], axis=1)

# Drop every row that still contains a missing value
train_df = train_df.dropna()

# Standardize the feature columns.
# The scaler is fitted on the training period only, so that the mean/variance
# of the validation period does not leak into the training features; the
# fitted statistics are then applied to every row.
# The cut-off must match the date used by the train/valid split below.
use_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
scaler_fit_end_date = 20201222
fit_rows = train_df['Date'] <= scaler_fit_end_date
ss = StandardScaler()
ss.fit(train_df.loc[fit_rows, use_cols])
train_df[use_cols] = ss.transform(train_df[use_cols])

# Bare expression so the notebook renders the preprocessed table.
train_df

# train & valid split

In [None]:
# Chronological hold-out split on the integer YYYYMMDD 'Date' column:
# rows dated on or before 2020-12-22 are used for training, later rows for validation.
# Author's note: about 80% train (2017-01-04 - 2020-12-22) / 20% valid (2020-12-23 - 2021-12-03).
in_train_period = train_df['Date'] <= 20201222
train = train_df.loc[in_train_period, :]
valid = train_df.loc[~in_train_period, :]

In [None]:
# Report which percentage of all rows ended up in each side of the split.
# Both figures are percentages, so both are labelled "*_rate".
total_rows = len(train_df)
print('train_rate:', len(train) / total_rows * 100, '%')
print('valid_rate:', len(valid) / total_rows * 100, '%')

In [None]:
# Model inputs are the columns listed in use_cols; the regression target is 'Target'.
# (Alternative left by the author: use every column except 'Date' and 'Target' as features.)
X_train, y_train = train[use_cols], train['Target']
X_valid, y_valid = valid[use_cols], valid['Target']

# model

In [None]:
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from sklearn.metrics import mean_squared_error

# TabNet hyperparameters; see https://github.com/dreamquark-ai/tabnet for the meaning of each option.
# No cat_idxs entry is passed (the author left it disabled).
tabnet_params = {
    "cat_emb_dim": 1,
    "n_d": 16,
    "n_a": 16,
    "n_steps": 2,
    "gamma": 1.4690246460970766,
    "n_independent": 9,
    "n_shared": 4,
    "lambda_sparse": 0,
    "optimizer_fn": Adam,
    "optimizer_params": {"lr": 0.024907164557092944},
    "mask_type": "entmax",
    "scheduler_params": {
        "T_0": 200,
        "T_mult": 1,
        "eta_min": 1e-4,
        "last_epoch": -1,
        "verbose": False,
    },
    "scheduler_fn": CosineAnnealingWarmRestarts,
    "seed": 42,
    "verbose": 10,
}

# Build the regressor from the settings above.
clf = TabNetRegressor(**tabnet_params)

# Targets are reshaped into a single column (n_rows, 1) before being passed to fit.
train_features = X_train.values
train_targets = y_train.values.reshape(-1, 1)
valid_features = X_valid.values
valid_targets = y_valid.values.reshape(-1, 1)

# Train, using the validation period as the evaluation set.
clf.fit(
    train_features, train_targets,
    eval_set=[(valid_features, valid_targets)],
    max_epochs=100,
    patience=10,
    batch_size=20 * 1024,
    virtual_batch_size=20 * 128,
    num_workers=4,
    drop_last=False,
)

# Predict the target for the validation period.
valid_pred = clf.predict(X_valid.values)

# evaluate
Reference: [train-demo notebook](https://www.kaggle.com/code/smeitoma/train-demo/notebook)

In [None]:
# Collect, for every validation row, its date, the model prediction and the
# actual target (y_valid is aligned on the index shared with `valid`).
result = valid[['Date']].assign(predict=valid_pred, Target=y_valid)

def set_rank(df):
    """Order rows by descending prediction and number them from 0.

    Args:
        df (pd.DataFrame): frame containing a "predict" column
    Returns:
        (pd.DataFrame): sorted copy of df with a "Rank" column, where 0 marks
            the row with the highest prediction
    """
    ranked = df.sort_values(by="predict", ascending=False)
    # Consecutive integers 0 .. n-1 in the sorted order
    ranked.loc[:, "Rank"] = np.arange(ranked.shape[0])
    return ranked

# Order by date, then by descending prediction, and rank the rows of each date separately.
ordered = result.sort_values(by=["Date", "predict"], ascending=[True, False])
result = ordered.groupby("Date").apply(set_rank)

# Remove the 'Date' column.
# NOTE(review): presumably 'Date' is also an index level after the groupby/apply, so that the
# later groupby('Date') still works without the column - confirm for the pandas version in use.
result = result.drop(columns='Date')

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Compute the Sharpe ratio of the daily spread returns.

    Args:
        df (pd.DataFrame): predicted results; grouped by 'Date' and read through
            its 'Rank' and 'Target' columns
        portfolio_size (int): # of equities taken from each end of the ranking
        toprank_weight_ratio (float): weight of the most highly ranked stock relative to
            the least highly ranked one inside a portfolio
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """Spread return of a single day.

        Args:
            df (pd.DataFrame): predicted results of one date
            portfolio_size (int): # of equities taken from each end of the ranking
            toprank_weight_ratio (float): weight of the first pick relative to the last pick
        Returns:
            (float): weighted return of the top-ranked picks minus that of the bottom-ranked picks
        """
        ranks = df['Rank']
        # Ranks are expected to start at 0 and end at n - 1
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decreasing weights: toprank_weight_ratio for the first pick, 1 for the last
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # First portfolio_size targets in the requested rank order, weighted and normalised
            picked = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (picked * weights).sum() / weights.mean()

        # Best-ranked leg minus worst-ranked leg
        return _weighted_leg(True) - _weighted_leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
# Compute the Sharpe ratio of the daily spread return for the ranked validation
# predictions, taking 200 equities from each end of the ranking per date.
# Left as a bare expression so the notebook displays the returned value.
calc_spread_return_sharpe(result, portfolio_size=200)