## Make the TabNet library importable

In [None]:
import sys
# Add the offline package directory to the import search path
# (presumably it holds the pytorch_tabnet package imported below — verify).
sys.path.append("../input/pytorch-tabnet-zip")

## Import required libraries

In [None]:
import os

import pandas as pd
import numpy as np
import datatable as dt
import warnings
import random
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')
# Show every column when displaying frames. The fully qualified key is
# required: the short pattern 'max_columns' matches several options on
# recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error

from time import time
import pprint
import joblib
from functools import partial
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from pytorch_tabnet.tab_model import TabNetRegressor

## Define global variables

In [None]:
# Competition input directory and the three CSV files read from it.
FOLDER = "/kaggle/input/petfinder-pawpularity-score/"
TRAIN_FNAME, TEST_FNAME, SUBMISSION_FNAME = (
    os.path.join(FOLDER, fname)
    for fname in ("train.csv", "test.csv", "sample_submission.csv")
)

# Seed shared by the train/validation split and both models.
RANDOM_STATE = 42
# Fraction of the training rows held out for validation.
TEST_SIZE = 0.1
# Upper bound on the number of TabNet training epochs.
MAX_EPOCHS_TABNET = 200

In [None]:
def rmse_fn(y_pred, y_true):
    """Return the root mean squared error between ``y_pred`` and ``y_true``."""
    mse = mean_squared_error(y_pred, y_true)
    return np.sqrt(mse)

## Import data

In [None]:
# Read the training, test and sample-submission CSV files into DataFrames.
train, test, submission = (
    pd.read_csv(path) for path in (TRAIN_FNAME, TEST_FNAME, SUBMISSION_FNAME)
)

In [None]:
# Display the (rows, columns) shape of each loaded frame.
train.shape, test.shape, submission.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Display the (rows, columns) shape of the training frame.
train.shape

## Prepare data

In [None]:
# Give the label column the generic name "target" used by the cells below.
train = train.rename({"Pawpularity": "target"}, axis="columns")

In [None]:
# Hold out TEST_SIZE of the rows for validation, stratified on the target,
# and convert every resulting part to a NumPy array.
X_train, X_val, y_train, y_val = (
    part.to_numpy()
    for part in train_test_split(
        train.drop(columns=["target", "Id"]),
        train["target"],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=train["target"],
    )
)
# Turn the targets into column vectors of shape (n_samples, 1)
# (presumably the 2-D target layout TabNetRegressor.fit expects — verify).
y_train, y_val = y_train.reshape(-1, 1), y_val.reshape(-1, 1)

## Hold-out validation for TabNet, XGBoost & their weighted average

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor


def fit_pred_tabnet(X_train, y_train, X_val, random_state=RANDOM_STATE,
                    max_epochs_tabnet=MAX_EPOCHS_TABNET, eval_set=None):
    """Fit a TabNetRegressor on the training data and predict ``X_val``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets; the callers in this file pass a column
            vector of shape (n_samples, 1).
        X_val: Feature matrix to predict on.
        random_state: Seed forwarded to ``TabNetRegressor``.
        max_epochs_tabnet: Maximum number of training epochs.
        eval_set: Optional list of ``(X, y)`` tuples forwarded to
            ``TabNetRegressor.fit``. When omitted (the default, matching the
            previous behaviour) there is no validation data to monitor, so
            ``patience`` and ``eval_metric`` cannot trigger early stopping
            and training runs for ``max_epochs_tabnet`` epochs.

    Returns:
        1-D array holding one prediction per row of ``X_val``.
    """
    model = TabNetRegressor(verbose=1, seed=random_state)
    print("Fit tabnet")
    model.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=eval_set,
        patience=5,
        max_epochs=max_epochs_tabnet,
        batch_size=256,
        eval_metric=['rmse'],
    )
    print("Predict tabnet")
    pred_tabnet = model.predict(X_val)
    # The regressor returns a (n_samples, 1) array; flatten it to (n_samples,).
    pred_tabnet = pred_tabnet.reshape(len(pred_tabnet))
    return pred_tabnet


# TabNet predictions for the hold-out validation rows.
pred_tabnet = fit_pred_tabnet(X_train, y_train, X_val)

In [None]:
# Validation RMSE of the TabNet predictions; the bare name displays the value.
rmse_tabnet = rmse_fn(pred_tabnet, y_val)
rmse_tabnet

In [None]:
import xgboost as xgb


def fit_pred_xgb(X_train, y_train, X_val, random_state=RANDOM_STATE):
    """Fit an XGBoost regressor on the training data and predict ``X_val``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_val: Feature matrix to predict on.
        random_state: Seed for the regressor, passed through the
            scikit-learn style ``random_state`` argument rather than the
            deprecated ``seed`` alias.

    Returns:
        Array of predictions for ``X_val`` as returned by
        ``XGBRegressor.predict``.
    """
    xgb_regressor = xgb.XGBRegressor(
        random_state=random_state,
        n_estimators=10000,
        max_depth=7,
        learning_rate=0.0022137388320075573,
    )
    print("Fit XGB")
    xgb_regressor.fit(X_train, y_train)
    print("Predict XGB")
    pred_xgb = xgb_regressor.predict(X_val)
    return pred_xgb

# XGBoost predictions for the hold-out validation rows.
pred_xgb = fit_pred_xgb(X_train, y_train, X_val)

In [None]:
# Validation RMSE of the XGBoost predictions; the bare name displays the value.
rmse_xgb = rmse_fn(pred_xgb, y_val)
rmse_xgb

In [None]:
# Candidate TabNet weights 0.0, 0.1, ..., 1.0. Both endpoints are included so
# that the pure-XGBoost (0.0) and pure-TabNet (1.0) blends are evaluated too;
# integer division by 10 avoids accumulated floating-point step errors.
weights = [step / 10 for step in range(11)]
weights

In [None]:
def avg_tabnet_xgb(pred_tabnet, pred_xgb, w_tabnet):
    """Blend the two prediction arrays element-wise.

    ``pred_tabnet`` is weighted by ``w_tabnet`` and ``pred_xgb`` by
    ``1 - w_tabnet``.
    """
    stacked = np.asanyarray([pred_tabnet, pred_xgb])
    blend_weights = [w_tabnet, 1 - w_tabnet]
    return np.average(stacked, axis=0, weights=blend_weights)

In [None]:
# Validation RMSE of the blend for every candidate TabNet weight.
list_rmse = [
    rmse_fn(avg_tabnet_xgb(pred_tabnet, pred_xgb, candidate), y_val)
    for candidate in weights
]

print(list_rmse)
# Position of the first weight that reaches the lowest RMSE.
lowest_rmse = min(list_rmse)
idx_best_w = list_rmse.index(lowest_rmse)
idx_best_w

## Respective RMSE

In [None]:
# Report the validation RMSE of each individual model.
print(f"rmse_tabnet={rmse_tabnet}")
print(f"rmse_xgb={rmse_xgb}")

In [None]:
# Look up the TabNet weight that gave the lowest validation RMSE and
# recompute the blended predictions and their RMSE at that weight.
best_weight = weights[idx_best_w]
pred_stack = avg_tabnet_xgb(pred_tabnet, pred_xgb, best_weight)
rmse_weighted_avg = rmse_fn(pred_stack, y_val)

# Report the chosen weight and the RMSE of the weighted average.
print(f"best weight is w={best_weight}")
print(f"weighted average ={rmse_weighted_avg}")

## Make the final prediction

In [None]:
def get_final_pred(X_train, y_train, X_val, w_tabnet):
    """Train TabNet, then XGBoost, and return their weighted blend on ``X_val``.

    ``w_tabnet`` is the weight given to the TabNet predictions; XGBoost
    receives ``1 - w_tabnet``.
    """
    # Arguments are evaluated left to right, so TabNet is fitted first.
    return avg_tabnet_xgb(
        fit_pred_tabnet(X_train, y_train, X_val),
        fit_pred_xgb(X_train, y_train, X_val),
        w_tabnet,
    )

In [None]:
# Rebuild the training arrays from every row of the training frame
# (no validation hold-out this time), with targets as a column vector.
X_train = train.drop(columns=["target", "Id"]).to_numpy()
y_train = train["target"].to_numpy().reshape(-1, 1)

# Test features: every column of the test frame except the identifier.
X_test = test.drop(columns=["Id"]).to_numpy()

In [None]:
# Refit both models on the full training arrays and blend their test
# predictions with the weight selected on the validation split.
predictions = get_final_pred(X_train=X_train, y_train=y_train, X_val=X_test, w_tabnet=best_weight)

In [None]:
# Attach the blended predictions to the test frame and write the
# Id / Pawpularity columns to submission.csv without the index.
test["Pawpularity"] = predictions
test[["Id", "Pawpularity"]].to_csv('submission.csv', index=False)