In [None]:
# TabNet
!pip install pytorch-tabnet

In [None]:
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

# Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import random
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
from sklearn import linear_model
import xgboost as xgb
import operator
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so use whichever name this install provides.
style.use('seaborn-v0_8-colorblind' if 'seaborn-v0_8-colorblind' in style.available
          else 'seaborn-colorblind')

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

import warnings
# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Config

In [None]:
# Base random seed: used for the global RNGs, the TabNet params and (offset per round) the K-fold shuffles
SEED = 42
# Number of cross-validation folds in each training round
NFOLD = 10
NSA = 5 # number of seed-averaging rounds (complete K-fold runs whose predictions are averaged)

# Load data

In [None]:
# Column names used throughout the notebook
target_col = 'target'
features = [f'cont{i}' for i in range(1, 15)]

# Raw competition tables
train = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')

# Model inputs: every column except the row id (and, for train, the target)
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns='id')
y_train = train['target']

In [None]:
# Sanity check: print (rows, columns) of the training matrix, then preview its first rows
print(X_train.shape)
X_train.head()

In [None]:
# Same check for the test matrix: print (rows, columns), then preview its first rows
print(X_test.shape)
X_test.head()

# Target
Is the target approximately normally distributed? The histogram below gives a quick visual check.

In [None]:
# Histogram of the training target to eyeball its distribution
y_train.hist()

# Scaling

In [None]:
# Standardise the feature columns: the scaler is fitted on the training set only
# and the same fitted scaler is then applied to both train and test.
scaler = StandardScaler().fit(X_train[features])
X_train[features] = scaler.transform(X_train[features])
X_test[features] = scaler.transform(X_test[features])

# Tabnet

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also writes ``seed`` (as a string) to the ``PYTHONHASHSEED`` environment
    variable. When CUDA is available, the CUDA generators are seeded as well
    and cuDNN is switched to deterministic, non-benchmarking mode.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.

    Returns
    -------
    None
    """
    # CPU-side generators, in the order: stdlib, NumPy, PyTorch
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Nothing GPU-related to configure on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Seed all global RNGs once, up front, with the notebook-wide SEED
set_seed(SEED)

In [None]:
# Upper bound on training epochs per fold (passed to `model.fit` as `max_epochs`)
MAX_EPOCH = 160
# NOTE(review): this comment previously said n_d and n_a were changed from the
# original work to 32, but the values below are 24 — confirm which is intended.
# The whole dict is unpacked into `TabNetRegressor(**params)` inside `fit_model`.
tabnet_params = dict(
    n_d = 24,
    n_a = 24,
    n_steps = 4,
    gamma = 1.2,
    lambda_sparse = 1e-3,
    optimizer_fn = optim.Adam,
    optimizer_params = dict(lr = 1e-3, weight_decay = 1e-5),
    mask_type = "entmax",
    scheduler_params = dict(
        mode = "min", patience = 5, min_lr = 1e-5, factor = 0.8),
    scheduler_fn = ReduceLROnPlateau,
    # Fixed model seed: identical for every fold and every seed-averaging round
    seed = SEED,
    verbose = 10
)

In [None]:
def fit_model(params, X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED):
    """Train one TabNet regressor per K-fold split and collect its predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments unpacked unchanged into ``TabNetRegressor``.
    X_train : pandas.DataFrame
        Training table; only the ``features`` columns are used.
    y_train : pandas.Series or array-like
        Training target, row-aligned with ``X_train``.
    X_test : pandas.DataFrame
        Test table; only the ``features`` columns are used.
    features : list of str
        Columns fed to the model. The default (like those of ``n_fold`` and
        ``seed``) is bound when this function is defined.
    n_fold : int
        Number of K-fold splits.
    seed : int
        ``random_state`` of the shuffled ``KFold`` split. It is NOT forwarded to
        the model; the model is configured solely by ``params``.

    Returns
    -------
    oof_train : numpy.ndarray, shape (len(X_train),)
        Out-of-fold prediction for every training row.
    y_preds : numpy.ndarray, shape (len(X_test),)
        Test predictions averaged over the ``n_fold`` fold models.
    models : list
        The fitted fold models, in fold order.
    """
    cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

    # Materialise the arrays once. KFold yields *positional* row indices, so they
    # are applied to plain NumPy arrays rather than label-based `.loc`, which is
    # only correct when the frame happens to carry a default 0..n-1 index.
    X_all = X_train[features].values
    y_all = np.asarray(y_train).reshape(-1, 1)  # targets are passed to TabNet as a 2-D column
    X_test_values = X_test[features].values     # loop-invariant, so built outside the fold loop

    models = []
    oof_train = np.zeros((len(X_all),))
    y_preds = np.zeros((len(X_test_values),))

    # `total` lets tqdm render a real progress bar instead of a bare counter
    for train_index, valid_index in tqdm(cv.split(X_all), total=n_fold):
        # split
        X_tr, y_tr = X_all[train_index], y_all[train_index]
        X_val, y_val = X_all[valid_index], y_all[valid_index]

        # model
        model = TabNetRegressor(**params)

        # fit, monitoring MSE on the held-out fold
        model.fit(
            X_train = X_tr,
            y_train = y_tr,
            eval_set = [(X_val, y_val)],
            eval_name = ["val"],
            eval_metric = ['mse'],
            max_epochs = MAX_EPOCH,
            patience = 80,
            batch_size = 16000,
            virtual_batch_size = 800,
            num_workers = 4,
            drop_last = False
        )

        # predict: held-out rows fill the OOF vector, test rows add 1/n_fold of this fold's output
        oof_train[valid_index] = model.predict(X_val).ravel()
        y_preds += model.predict(X_test_values).ravel() / n_fold
        models.append(model)

    return oof_train, y_preds, models

In [None]:
# Seed averaging: run the complete K-fold training NSA times and average the rounds.
oof = np.zeros(len(train))
y_pred = np.zeros(len(test))
for n in range(NSA):
    # fit: round n passes seed = SEED + n**2, which fit_model uses for its KFold shuffle
    oof_train, y_preds, models = fit_model(tabnet_params, 
        X_train, y_train, X_test, features=features, n_fold=NFOLD, seed=SEED + n**2)
    
    # average: every round contributes 1/NSA of its OOF and test predictions.
    # `models` is reassigned each round, so only the last round's models are kept.
    oof += oof_train / NSA
    y_pred += y_preds / NSA

# Score

In [None]:
# RMSE of the seed-averaged out-of-fold predictions. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by newer scikit-learn releases.
print(f'CV score: {np.sqrt(mean_squared_error(y_train, oof))}')

# Submit

In [None]:
# Start from the sample submission and overwrite its `target` column with the
# averaged test predictions.
sub = pd.read_csv('../input/tabular-playground-series-jan-2021/sample_submission.csv')
sub['target'] = y_pred
# Saved to the current working directory without the DataFrame index column
sub.to_csv('submission.csv', index=False)
sub.head()