In [None]:
# install PyTorch Tabular first
!pip install pytorch_tabular
# This is for a custom optimizer. PyTorch Tabular is flexible enough to use custom optimizers
!pip install torch_optimizer

In [None]:
import numpy as np
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.figure_factory as ff
# NODE and ML tools
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from torch_optimizer import QHAdam
import category_encoders as ce
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import time
import wandb
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
def create_stratified_folds_for_regression(data_df, n_splits=5, random_state=None):
    """
    Assign a stratified K-fold id to every row of a regression training set.

    The continuous target is cut into floor(1 + log2(n_rows)) equal-width bins
    and the bin index is used as the class label for StratifiedKFold, so the
    folds are balanced with respect to the target distribution.

    @param data_df: training data with a numeric 'target' column; the frame
        passed in is left untouched
    @param n_splits: number of splits
    @param random_state: optional seed for the row shuffle; None (the default)
        keeps the shuffle non-deterministic
    @return: a shuffled copy of the data, re-indexed from 0, with an extra
        'kfold' column holding the fold in which each row is a validation row
    """
    # Shuffle first: sample() returns a new frame, so the columns added below
    # never leak into the caller's frame.
    shuffled = data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled['kfold'] = -1
    # calculate the number of bins based on log2 of the row count.
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    print(f"Num bins: {num_bins}")
    # the bin index is the equivalent of a class label of the target, used by
    # StratifiedKFold to distribute the classes evenly over the folds
    bins = pd.cut(pd.to_numeric(shuffled['target'], downcast="signed"), bins=num_bins, labels=False)
    kf = StratifiedKFold(n_splits=n_splits)

    # set the fold id as a new column in the train data; after reset_index the
    # positional indices returned by split() are also valid row labels
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=bins.values)):
        shuffled.loc[valid_idx, 'kfold'] = fold_id

    return shuffled

def create_stratified_shuffle_split_for_regression(data_df, test_size=0.2, random_state=42):
    """
    Mark a stratified hold-out subset of a regression training set.

    The continuous target is cut into floor(1 + log2(n_rows)) quantile bins and
    the bin index is used as the class label for a single
    StratifiedShuffleSplit, so the hold-out follows the target distribution.

    @param data_df: training data with a numeric 'target' column; the frame
        passed in is left untouched
    @param test_size: size of the hold-out, forwarded to StratifiedShuffleSplit
    @param random_state: seed used for both the row shuffle and the split, so
        the same rows are held out on every run
    @return: a shuffled copy of the data, re-indexed from 0, with an extra
        'kfold' column: 0 for hold-out rows, -1 for all remaining rows
    """
    # Shuffle first: sample() returns a new frame, so the columns added below
    # never leak into the caller's frame. Seeding the shuffle matters because
    # the split indices are positional: an unseeded shuffle would hold out
    # different rows each run even though the split itself is seeded.
    shuffled = data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled['kfold'] = -1
    # calculate the number of bins based on log2 of the row count.
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    print(f"Num bins: {num_bins}")
    # the bin index is the equivalent of a class label of the target, used by
    # StratifiedShuffleSplit to distribute the classes evenly over both parts
    bins = pd.qcut(pd.to_numeric(shuffled['target'], downcast="signed"), q=num_bins, labels=False)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # n_splits is 1, so the only id written is 0; after reset_index the
    # positional indices returned by split() are also valid row labels
    for split_id, (_, valid_idx) in enumerate(splitter.split(X=shuffled, y=bins.values)):
        shuffled.loc[valid_idx, 'kfold'] = split_id

    return shuffled

In [None]:
# Directory holding the competition files; defined once so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2021'

# All three files are read with their 'id' column as the index.
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='id')
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col='id')
sample = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'), index_col='id')

In [None]:
# NOTE(review): n_splits is not read by the shuffle split below; kept in case
# another cell depends on it.
n_splits = 2
# The helper returns the rows with a 'kfold' column that is 0 for hold-out rows.
df_folds = create_stratified_shuffle_split_for_regression(df_train)
is_valid = df_folds['kfold'] == 0
# drop() without inplace=True builds new frames; dropping in place on a .loc
# slice triggers pandas' SettingWithCopyWarning.
df_valid = df_folds.loc[is_valid].drop(columns='kfold')
df_train = df_folds.loc[~is_valid].drop(columns='kfold')

In [None]:
# Display the shapes of the validation and training frames as a sanity check on the split.
df_valid.shape, df_train.shape

In [None]:
def get_configs(train):
    """
    Build the four PyTorch Tabular configuration objects for the NODE regressor.

    @param train: training DataFrame; its 'target' column provides the
        (min, max) range passed to the model as target_range
    @return: tuple (data_config, trainer_config, optimizer_config, model_config)
    """
    epochs = 50
    batch_size = 64
    target_cols = ['target']
    continuous_cols = [f'cont{i}' for i in range(14)]  # cont0 .. cont13
    categorical_cols = [f'cat{i}' for i in range(10)]  # cat0 .. cat9

    data_config = DataConfig(
        target=target_cols,  # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform="quantile_normal",
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
        batch_size=batch_size,
        max_epochs=epochs,
        early_stopping_patience=5,
        # NOTE(review): the original comment read "index of the GPU to use. 0, means CPU";
        # confirm the meaning of `gpus` against the installed pytorch_tabular version.
        gpus=1,
    )
    # Alternatives the author experimented with (previously kept as commented-out code):
    #   - scheduler: OneCycleLR with max_lr=0.005
    #   - model: CategoryEmbeddingModelConfig (layers "128-64-32", ReLU, batch norm)
    #   - model: TabNetModelConfig (n_d=8, n_a=8, n_steps=3, sparsemax masks)
    optimizer_config = OptimizerConfig(
        lr_scheduler="ReduceLROnPlateau",
        lr_scheduler_params={"patience": 3},
    )
    model_config = NodeConfig(
        task="regression",
        num_layers=1,  # Number of Dense Layers
        num_trees=2048,  # Number of Trees in each layer
        depth=6,  # Depth of each Tree
        embed_categorical=True,  # If True, will use a learned embedding, else it will use LeaveOneOutEncoding for categorical columns
        learning_rate=1e-3,
        additional_tree_output_dim=25,
        # Derive the range from the frame passed in, not from a notebook global,
        # so the function works for any training frame.
        target_range=[(train[col].min(), train[col].max()) for col in target_cols],
    )
    return data_config, trainer_config, optimizer_config, model_config

In [None]:
def rmse(y_hat, y):
    """
    Root-mean-squared error between predictions and ground truth.

    Computed with NumPy rather than
    sklearn.metrics.mean_squared_error(..., squared=False), because the
    `squared` argument was deprecated in scikit-learn 1.4 and later removed.
    For multi-column input the RMSE of each column is computed and the
    column values are averaged, which is what `squared=False` did.

    @param y_hat: predictions; an object exposing .detach().cpu().numpy()
    @param y: ground truth with the same number of rows as y_hat
    @return: the RMSE as a Python float
    @raise ValueError: if the inputs are empty or their shapes do not match
    """
    y_true = np.atleast_1d(np.asarray(y.detach().cpu().numpy(), dtype=np.float64))
    y_pred = np.atleast_1d(np.asarray(y_hat.detach().cpu().numpy(), dtype=np.float64))
    if y_true.size == 0 or y_pred.size == 0:
        raise ValueError("rmse is undefined for empty input")
    # View both as (rows, outputs) so that (N,) and (N, 1) inputs are compared
    # element-wise instead of silently broadcasting to an (N, N) difference.
    y_true = y_true.reshape(y_true.shape[0], -1)
    y_pred = y_pred.reshape(y_pred.shape[0], -1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch between y {y_true.shape} and y_hat {y_pred.shape}"
        )
    per_output_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(per_output_rmse.mean())

def train(train, valid):
    """
    Fit a PyTorch Tabular model on `train`, evaluate it on `valid`, return it.

    The configuration objects come from get_configs(train) and the model is
    optimized with QHAdam instead of the library's default optimizer.

    @param train: training DataFrame
    @param valid: validation DataFrame, used during fitting and for the final evaluate() call
    @return: the fitted TabularModel
    """
    data_cfg, trainer_cfg, optim_cfg, model_cfg = get_configs(train)
    # Experiment tracking is switched off. To enable it, build an
    # ExperimentConfig("Tabular Playground Feb PyTorch Tabular", run_name="NODE", log_target="wandb")
    # and pass it to TabularModel as experiment_config.
    qhadam_params = {"nus": (0.7, 1.0), "betas": (0.95, 0.998), "weight_decay": 0}
    fitted = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optim_cfg,
        trainer_config=trainer_cfg,
    )
    # fit model
    fitted.fit(
        train=train,
        validation=valid,
        optimizer=QHAdam,
        optimizer_params=qhadam_params,
    )
    # The value returned by evaluate() is not used by this function.
    fitted.evaluate(valid)
    return fitted

In [None]:
# Fit on the training frame, using the validation frame for evaluation.
model = train(df_train, df_valid)

In [None]:
# Run the fitted model on the test frame.
pred = model.predict(df_test)

In [None]:
# Inspect the column names of the prediction output.
pred.columns

In [None]:
# Copy the predictions into the sample-submission frame by position (.values
# discards the index) and write it out with the index as the first column.
# NOTE(review): this assumes pred rows are in the same order as sample — confirm.
sample['target'] = pred["target_prediction"].values
sample.to_csv('submission.csv', index=True)