# A quick comparison of XGBoost with TabNet

This notebook is based on https://www.kaggle.com/danspace/rossmann-store-sales-xgboost. I wanted to compare XGBoost with TabNet, a neural network architecture designed for tabular data. I tried a few configurations, but was not able to come close to the out-of-the-box performance of XGBoost. If anyone finds better hyper-parameters, I would be glad to hear about them. I also tried feature scaling, but it had no effect.

In [None]:
!pip install pytorch_tabnet

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


from pathlib import Path as P

import torch

from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.model_selection import train_test_split


# Directory holding the competition CSV files
PATH = P('/kaggle/input/rossmann-store-sales')

# Store metadata (missing values replaced by 0) and the daily sales rows;
# column 2 of train.csv is parsed as a date
store = pd.read_csv(PATH / 'store.csv').fillna(0)
train = pd.read_csv(PATH / 'train.csv', parse_dates=[2])

# Attach the store metadata to every sales row
train = train.merge(store, on='Store')

# Newest rows first, so the hold-out set can be sliced off the top
# (idea from Gert https://www.kaggle.com/c/rossmann-store-sales/discussion/18024)
train = train.sort_values('Date', ascending=False)
train_total = train.copy()

# Hold out the first 6*7*1115 rows as the validation set
# NOTE(review): presumably 6 weeks x 7 days x 1115 stores -- confirm
split_index = 6 * 7 * 1115
valid, train = train.iloc[:split_index], train.iloc[split_index:]


def _open_with_sales(frame):
    """Keep only the rows where Open is not 0 and Sales is positive."""
    return frame[frame['Open'].ne(0) & frame['Sales'].gt(0)]


valid = _open_with_sales(valid)
train = _open_with_sales(train)
train_total = _open_with_sales(train_total)

def process(data, isTest = False):
    """Build the model feature table from merged sales/store rows.

    The input frame is copied first, so the caller's frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows containing the columns read below: a datetime ``Date`` column,
        ``StoreType``, ``Assortment``, ``StateHoliday``,
        ``CompetitionOpenSinceMonth/Year``, ``Promo2SinceWeek/Year``,
        ``PromoInterval`` and the pass-through feature columns (plus
        ``Sales`` unless ``isTest`` is true).
    isTest : bool, default False
        When true, ``Sales`` is left out of the returned columns.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the selected feature columns, followed by
        ``Sales`` when ``isTest`` is false.
    """
    # Work on a copy: callers pass filtered slices of larger frames, and
    # writing into those either triggers SettingWithCopyWarning or is
    # silently lost under pandas copy-on-write.
    data = data.copy()

    # label encode some features; values missing from the mapping are kept
    # unchanged. Assigning the result back (instead of a chained
    # `data.Col.replace(..., inplace=True)`) guarantees the frame is updated.
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].map(lambda value: mappings.get(value, value))

    # extract some features from date column
    dates = data['Date'].dt
    data['Month'] = dates.month
    data['Year'] = dates.year
    data['Day'] = dates.day
    # `Series.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is
    # its replacement. It yields a nullable UInt32 column, so cast to int64
    # to keep later arithmetic and `.values` in plain NumPy dtypes.
    data['WeekOfYear'] = dates.isocalendar().week.astype('int64')

    # calculate competitor open time in months; non-positive (and NaN)
    # results become 0
    competition_open = 12 * (data['Year'] - data['CompetitionOpenSinceYear']) + \
        (data['Month'] - data['CompetitionOpenSinceMonth'])
    data['CompetitionOpen'] = competition_open.where(competition_open > 0, 0)

    # calculate promo2 open time in months (weeks / 4); non-positive (and
    # NaN) results become 0
    promo_open = 12 * (data['Year'] - data['Promo2SinceYear']) + \
        (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)

    # Indicate whether the month is in promo interval
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    month_names = data['Month'].map(month2str)
    # A plain zip over the two columns replaces the row-wise
    # DataFrame.apply(axis=1): same result, far less overhead, and it also
    # works on an empty frame. Non-string intervals (e.g. the 0 filler)
    # never count as a promo month.
    data['IsPromoMonth'] = [
        int(isinstance(interval, str) and month in interval)
        for month, interval in zip(month_names, data['PromoInterval'])
    ]

    # select the features we need
    features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
                'StoreType', 'Assortment', 'CompetitionDistance',
                'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
                'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
                'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
    if not isTest:
        features.append('Sales')

    return data[features]

def _features_and_log_target(frame):
    """Split a processed frame into its features and log1p-transformed Sales."""
    return frame.drop(columns=['Sales']), np.log1p(frame[['Sales']])


# Build the feature tables and restore index order
train = process(train).sort_index()
valid = process(valid).sort_index()
train_total = process(train_total).sort_index()

# split x and y (the target is log1p(Sales), kept as a one-column frame)
X_train, y_train = _features_and_log_target(train)
X_valid, y_valid = _features_and_log_target(valid)
X_train_total, y_train_total = _features_and_log_target(train_total)

def rmspe(y, yhat):
    """Return the root mean squared percentage error of ``yhat`` against ``y``.

    Each error is measured relative to ``y`` (``yhat / y - 1``), so ``y``
    must not contain zeros.
    """
    relative_error = yhat / y - 1
    return np.sqrt(np.mean(relative_error ** 2))

from pytorch_tabnet.metrics import Metric

class RMSPE_EXP(Metric):
    """RMSPE on the original scale for targets that were log1p-transformed.

    Both the true values and the predictions are mapped back with ``expm1``
    before the percentage error is computed.
    """

    def __init__(self):
        self._name = "rmspe"
        self._maximize = False  # a lower RMSPE is better

    def __call__(self, y_true, y_pred):
        """Return the RMSPE of ``expm1(y_pred)`` against ``expm1(y_true)``.

        The result is a single float. The previous version returned a
        ``(name, value)`` tuple, whereas the pytorch-tabnet custom-metric
        examples return a bare score. It also passed the arguments to
        ``rmspe`` in swapped order, which divided the errors by the
        predictions instead of the true values.
        """
        actual = np.expm1(y_true)
        predicted = np.expm1(y_pred)
        return float(rmspe(actual, predicted))

# Column positions (in X_train) of the features treated as categorical:
# Store, DayOfWeek, StateHoliday, StoreType, Assortment, Month, Day, WeekOfYear
cat_idxs = [0, 1, 3, 5, 6, 14, 15, 16]
X_train.iloc[:,cat_idxs]

# pytorch-tabnet documents cat_idxs and cat_dims as both mandatory for
# embeddings; passing cat_idxs alone either skips the embeddings or raises,
# depending on the library version. The integer codes index the embedding
# tables directly, so every table needs (largest code + 1) rows -- several of
# these features are 1-based (Store, DayOfWeek, Month, Day, WeekOfYear).
# Sizes come from X_train_total so the validation rows are covered too.
# NOTE(review): assumes the test set contains no code larger than those seen
# in X_train_total -- confirm before predicting on new data.
cat_dims = [int(X_train_total.iloc[:, idx].max()) + 1 for idx in cat_idxs]

clf = TabNetRegressor(
    n_d=64, n_a=64,
    n_steps=5,
    lambda_sparse=1e-5,
    optimizer_params=dict(lr=2e-2),
    # Reduce the learning rate by a factor of 10 after 3 epochs on a plateau
    scheduler_params={"factor": .1, "patience": 3},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
)

# Feature/target arrays for the two evaluation sets, labelled 'train' and
# 'valid' in the fit call below
train_arrays = (X_train.values, y_train.values)
valid_arrays = (X_valid.values, y_valid.values)

clf.fit(
    train_arrays[0],
    train_arrays[1],
    eval_set=[train_arrays, valid_arrays],
    eval_name=['train', 'valid'],
    eval_metric=['rmse'],
    max_epochs=1000,
    batch_size=8192,
    virtual_batch_size=256,
)

# Report RMSPE on the original sales scale (targets and predictions are in
# log1p space) for the training and validation sets, in that order
for split_name, split_X, split_y in (('Train', X_train, y_train),
                                     ('Valid', X_valid, y_valid)):
    y_pred = clf.predict(split_X.values)
    error = rmspe(np.expm1(split_y.values), np.expm1(y_pred))
    print(split_name + ' RMSPE:', error)

# Load the test rows (column 3 parsed as a date), attach the store metadata
# and replace every remaining missing value with 1
test = pd.read_csv(PATH / 'test.csv', parse_dates=[3]).merge(store, on='Store')
test = test.fillna(value=1)
X_test = process(test, isTest=True)

# Predictions are in log1p space; map them back before writing the submission
preds = clf.predict(X_test.values).flatten()
predicted_sales = np.expm1(preds)

result = pd.DataFrame({"Id": test["Id"], 'Sales': predicted_sales})
result.to_csv("submission.csv", index=False)