In [8]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import load, dump
from torch.optim import lr_scheduler
import torch

In [3]:
input_path = '../data/train.parquet.gzip'
test_path = '../data/test.parquet.gzip'
pred_cols = ["Mean_BMI","Median_BMI","Unmet_Need_Rate","Under5_Mortality_Rate","Skilled_Birth_Attendant_Rate","Stunted_Rate"]

In [4]:
low_imp_features = load('../data/low_imp_features.joblib')

In [5]:
train = pd.read_parquet(input_path)
train = train[~train.index.duplicated(keep='first')]
train = train.drop(low_imp_features, axis=1)
X = train.drop(pred_cols, axis=1)
y = train[pred_cols]

In [6]:
from pytorch_tabnet.metrics import Metric
class my_metric(Metric):
    """
    MCRMSE.
    """
    def __init__(self):
        self._name = "MCRMSE" # write an understandable name here
        self._maximize = False

    def __call__(self, y_true, y_score):
        """
        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_score: np.ndarray
            Score matrix or vector

        Returns
        -------
            float
        """
        # loop over each column and calculate error
        errors = []
        for i in range(y_true.shape[1]):
            errors.append(np.sqrt(mean_squared_error(y_true[:,i], y_score[:,i])))
        ans = np.mean(errors)
        return ans

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=.2, random_state=42)

In [None]:
model = TabNetRegressor(n_d=32, n_a=32, n_steps=15, n_independent=5, n_shared=5, scheduler_fn=lr_scheduler.ReduceLROnPlateau
                        , gamma=3.3, lambda_sparse=0.01, momentum=0.3, seed=42)
model.fit(
  X_train.values, y_train.values,
  eval_set=[(X_valid.values, y_valid.values)],
  eval_metric=['rmse', my_metric],
  patience=0,
  batch_size=2048,
  max_epochs=400
)

In [16]:
# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

unsupervised_model.fit(
    X_train=X_train.values,
    eval_set=[X_valid.values],
    pretraining_ratio=0.8,
)

model = TabNetRegressor(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    n_d=64,
    n_a=64,
    n_steps=5,
    gamma=2.3,
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_train.values, y_train.values), (X_valid.values, y_valid.values)],
    eval_name=['train', 'valid'],
    eval_metric=[my_metric],
    from_unsupervised=unsupervised_model
)



epoch 0  | loss: 6403.25276| val_0_unsup_loss_numpy: 4442.6572265625|  0:00:03s
epoch 1  | loss: 4238.53732| val_0_unsup_loss_numpy: 4364.6748046875|  0:00:06s
epoch 2  | loss: 3995.94473| val_0_unsup_loss_numpy: 4460.5576171875|  0:00:10s
epoch 3  | loss: 3505.3523| val_0_unsup_loss_numpy: 4503.0830078125|  0:00:13s
epoch 4  | loss: 2958.54427| val_0_unsup_loss_numpy: 4518.609375|  0:00:17s
epoch 5  | loss: 2433.05314| val_0_unsup_loss_numpy: 4531.73779296875|  0:00:20s
epoch 6  | loss: 1905.00614| val_0_unsup_loss_numpy: 4584.5244140625|  0:00:24s
epoch 7  | loss: 2331.02901| val_0_unsup_loss_numpy: 4590.29931640625|  0:00:27s
epoch 8  | loss: 2073.26027| val_0_unsup_loss_numpy: 4609.150390625|  0:00:30s
epoch 9  | loss: 763.89953| val_0_unsup_loss_numpy: 4599.2548828125|  0:00:34s
epoch 10 | loss: 559.83456| val_0_unsup_loss_numpy: 4594.5634765625|  0:00:37s
epoch 11 | loss: 397.22112| val_0_unsup_loss_numpy: 4577.89013671875|  0:00:41s

Early stopping occurred at epoch 11 with best



epoch 0  | loss: 1710.9842| train_MCRMSE: 30.73022| valid_MCRMSE: 30.8094 |  0:00:03s
epoch 1  | loss: 1192.82632| train_MCRMSE: 25.8005 | valid_MCRMSE: 25.83953|  0:00:07s
epoch 2  | loss: 730.05097| train_MCRMSE: 19.99073| valid_MCRMSE: 20.02507|  0:00:10s
epoch 3  | loss: 391.79099| train_MCRMSE: 18.47279| valid_MCRMSE: 18.58014|  0:00:14s
epoch 4  | loss: 302.03982| train_MCRMSE: 14.70228| valid_MCRMSE: 14.7522 |  0:00:18s
epoch 5  | loss: 277.39871| train_MCRMSE: 15.0459 | valid_MCRMSE: 15.1344 |  0:00:21s
epoch 6  | loss: 263.3166| train_MCRMSE: 14.17586| valid_MCRMSE: 14.26668|  0:00:25s
epoch 7  | loss: 255.23289| train_MCRMSE: 13.6484 | valid_MCRMSE: 13.72392|  0:00:28s
epoch 8  | loss: 248.30778| train_MCRMSE: 13.3284 | valid_MCRMSE: 13.41342|  0:00:32s
epoch 9  | loss: 245.01815| train_MCRMSE: 13.3023 | valid_MCRMSE: 13.34598|  0:00:36s
epoch 10 | loss: 241.91678| train_MCRMSE: 13.12555| valid_MCRMSE: 13.20233|  0:00:39s
epoch 11 | loss: 237.21779| train_MCRMSE: 13.03499| va



In [11]:
test = pd.read_parquet('../data/test.parquet.gzip')
test = test.drop(columns=low_imp_features)

In [15]:
y_pred = model.predict(test.values)
out = pd.DataFrame(y_pred, columns=pred_cols)
out['DHSID'] = test.index
out = out[['DHSID'] + pred_cols]
out.to_csv('../submission/tabnet_test.csv', index=False)