Original notebook: https://www.kaggle.com/datafan07/optiver-volatility-predictions-using-tabnet/notebook

In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed

import shutil
import glob

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold, GroupKFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

from tsfresh.feature_extraction import feature_calculators as fcs

# Global plotting configuration

plt.style.use('ggplot')
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]
# Apply all rcParams overrides in one call; the grid colour is taken from the palette above.
plt.rcParams.update({
    'figure.figsize': (16, 9),
    'figure.facecolor': '#FFFACD',
    'axes.facecolor': '#FFFFE0',
    'axes.grid': True,
    'grid.color': orange_black[3],
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
})


import warnings
# Suppress every warning for the remainder of the notebook session.
warnings.filterwarnings("ignore")

import psutil
# Last expression in the cell, so the notebook displays the value returned by psutil.cpu_count().
psutil.cpu_count()

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

In [None]:
# config


class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.nfolds``)."""

    # Cross-validation: number of GroupKFold splits and the fold indices to use
    nfolds = 5
    use_folds = list(range(5))

    # Training: maximum epochs per fold and TabNet verbosity
    nof_epochs = 15
    verbose = 1

    # Feature engineering: per-breath shifts of u_in turned into lag columns
    lag_range = [1, 2, -1, -2]

    # Directory holding the competition CSV files
    data_dir = '../input/ventilator-pressure-prediction/'


# Functions

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * object / pandas string columns are converted to ``category``;
    * NumPy integer columns are cast to the smallest of int8/int16/int32 whose
      range contains the column's min and max (bounds inclusive);
    * NumPy float columns are cast to float16, else float32, by the same rule;
    * every other dtype (bool, categorical, datetime, extension dtypes, ...)
      is left untouched.

    A column is never widened: the cast only happens when the candidate dtype
    is strictly smaller than the current one.

    NOTE: float16 keeps only about three significant decimal digits, so the
    float downcast is lossy. This matches the original behaviour of the
    function; drop ``np.float16`` from the candidates if precision matters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_memory = np.round(df.memory_usage().sum() / 1024**2, 2)
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns become categoricals.
        if col_type == 'object' or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed/unsigned integer and float columns are downcast;
        # bools, datetimes, categoricals and extension dtypes are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates, limits_of = (np.int8, np.int16, np.int32), np.iinfo
        else:
            candidates, limits_of = (np.float16, np.float32), np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_memory = np.round(df.memory_usage().sum() / 1024**2, 2)
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard against a frame so small that its size rounds to 0 MB.
    if start_memory:
        reduction = np.round(100 * (start_memory - end_memory) / start_memory, 2)
    else:
        reduction = 0.0
    print(f"Reduced by {reduction} % ")
    return df

In [None]:
# Pytorch metric setup
def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``."""
    abs_errors = np.abs(y_true - y_pred)
    return np.mean(abs_errors)

class MAE(Metric):
    """Mean-absolute-error evaluation metric named "mae".

    ``_maximize`` is set to False (presumably so the trainer treats lower
    values as better; confirm against pytorch_tabnet's ``Metric``).
    """

    def __init__(self):
        self._maximize = False
        self._name = "mae"

    def __call__(self, y_true, y_score):
        """Return the mean of ``|y_true - y_score|``."""
        abs_errors = np.abs(y_true - y_score)
        return np.mean(abs_errors)

def MAELoss(y_pred, y_true):
    """Return the mean absolute difference of two tensors as a cloned tensor."""
    abs_err = (y_true - y_pred).abs()
    return abs_err.mean().clone()

In [None]:
def add_features(df):
    """Add per-breath ``u_in`` features and categorical columns to ``df``.

    The frame is modified in place and also returned. It must contain the
    columns ``breath_id``, ``u_in``, ``time_step``, ``R`` and ``C``.
    Remaining NaNs (e.g. from the lag shifts) are replaced with 0.
    """
    u_in_by_breath = df.groupby('breath_id')['u_in']

    # last u_in value of each breath, repeated on every row of that breath
    df['last_value_u_in'] = np.round(u_in_by_breath.transform('last'), 2)

    # shifted copies of u_in within a breath, one column per entry of CFG.lag_range
    for shift in CFG.lag_range:
        df[f'u_in_lag{shift}'] = np.round(u_in_by_breath.shift(shift), 2)

    df['u_in_median'] = u_in_by_breath.transform('median')

    # slope reported by tsfresh's linear_trend for each breath's u_in series
    def _trend_slope(series):
        return fcs.linear_trend(series, [{'attr': 'slope'}])[0][1]

    df['u_in_slope'] = u_in_by_breath.transform(_trend_slope)

    # kurtosis of u_in per breath
    df['u_in_kurt'] = np.round(u_in_by_breath.transform(fcs.kurtosis), 2)

    # running per-breath sums: time_step * u_in, and u_in itself
    time_weighted_u_in = df['time_step'] * df['u_in']
    df['area'] = np.round(time_weighted_u_in.groupby(df['breath_id']).cumsum(), 2)
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # categoricals: R and C as strings, plus their combination "<C>_<R>"
    for name in ('R', 'C'):
        df[name] = df[name].astype(str)
    df['RxC'] = df['C'] + '_' + df['R']

    df.fillna(0, inplace=True)
    return df

# Data

In [None]:
# Read the competition CSVs from the directory configured in CFG.data_dir
# (which ends with a path separator).
xtrain = pd.read_csv(CFG.data_dir + 'train.csv')
xtest = pd.read_csv(CFG.data_dir + 'test.csv')
submission = pd.read_csv(CFG.data_dir + 'sample_submission.csv')

In [None]:
%time
xtrain = add_features(xtrain)
xtest = add_features(xtest)

In [None]:
%time
xtrain = reduce_memory_usage(xtrain)
xtest = reduce_memory_usage(xtest)

# Model

In [None]:
# Set aside the identifiers and the target before removing them from the feature frames.
id_train, breath_train = xtrain['id'].copy(), xtrain['breath_id'].copy()
id_test, breath_test = xtest['id'].copy(), xtest['breath_id'].copy()
ytrain = xtrain['pressure']

xtrain.drop(columns=['breath_id', 'id', 'pressure'], inplace=True)
xtest.drop(columns=['breath_id', 'id'], inplace=True)

# Every remaining column that is not categorical is treated as numerical.
categorical_columns = ['R', 'C', 'RxC']
numerical_columns = [name for name in xtrain.columns if name not in categorical_columns]


In [None]:
# Integer-encode each categorical column with an encoder fitted on the training data,
# recording the number of classes per column.
categorical_dims = {}
for col in categorical_columns:
    l_enc = LabelEncoder().fit(xtrain[col].values)
    xtrain[col] = l_enc.transform(xtrain[col].values)
    xtest[col] = l_enc.transform(xtest[col].values)
    categorical_dims[col] = len(l_enc.classes_)

# Scale each numerical column with a RobustScaler fitted on the training data only.
for col in numerical_columns:
    RS = RobustScaler()
    scaler = RS
    train_values = xtrain[col].values.reshape(-1, 1)
    xtrain[col] = scaler.fit(train_values).transform(train_values)
    test_values = xtest[col].values.reshape(-1, 1)
    xtest[col] = scaler.transform(test_values)
        
    

In [None]:
# Positions of the categorical columns in the feature matrix, and the matching
# number of classes for each, both in column order.
feature_names = xtrain.columns.tolist()
cat_idxs = [pos for pos, name in enumerate(feature_names) if name in categorical_columns]
cat_dims = [categorical_dims[feature_names[pos]] for pos in cat_idxs]

In [None]:
# Keyword arguments passed to TabNetRegressor for every fold.
tabnet_params = {
    # categorical feature handling
    'cat_idxs': cat_idxs,
    'cat_dims': cat_dims,
    'cat_emb_dim': 1,
    # network settings
    'n_d': 16,
    'n_a': 16,
    'n_steps': 2,
    'gamma': 2,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 0,
    'mask_type': "entmax",
    # optimizer and learning-rate schedule
    'optimizer_fn': Adam,
    'optimizer_params': {'lr': 2e-2},
    'scheduler_fn': CosineAnnealingWarmRestarts,
    'scheduler_params': {
        'T_0': 200,
        'T_mult': 1,
        'eta_min': 1e-4,
        'last_epoch': -1,
        'verbose': False,
    },
    # reproducibility and logging
    'seed': 42,
    'verbose': CFG.verbose,
}

In [None]:
# Containers for out-of-fold predictions, fold-averaged test predictions and diagnostics
oof_predictions = np.zeros((xtrain.shape[0], 1))
test_predictions = np.zeros(xtest.shape[0])
feature_importances = pd.DataFrame()
feature_importances["feature"] = xtrain.columns.tolist()
# NOTE(review): this rebinds `stats`, hiding the `scipy.stats` module imported at the top.
stats = pd.DataFrame()
explain_matrices = []
masks_ =[]


# Group by breath so that all rows of a breath land in the same fold.
k_fold = GroupKFold(n_splits = CFG.nfolds)
for fold, (id0, id1) in enumerate(k_fold.split(xtrain, ytrain, breath_train)):
    print(f'Training fold {fold}')
    X_train, X_val = xtrain.iloc[id0].values, xtrain.iloc[id1].values
    # TabNetRegressor is given 2-D targets, hence the reshape to a single column.
    y_train, y_val = ytrain.iloc[id0].values.reshape(-1,1), ytrain.iloc[id1].values.reshape(-1,1)


    clf =  TabNetRegressor(**tabnet_params)
    clf.fit(
      X_train, y_train,
      eval_set=[(X_val, y_val)],
      max_epochs = CFG.nof_epochs,
      patience = 50,
      batch_size = 1024*20, 
      virtual_batch_size = 128*20,
      num_workers = 4,
      drop_last = False,
      eval_metric = [MAE],
      loss_fn = MAELoss
      )
    
    saving_path_name = f"./tabnet_f{fold}"
    saved_filepath = clf.save_model(saving_path_name)
    
    explain_matrix, masks = clf.explain(X_val)
    explain_matrices.append(explain_matrix)
    # One mask per decision step; indices 0 and 1 correspond to n_steps = 2 in tabnet_params.
    masks_.append(masks[0])
    masks_.append(masks[1])
      
    oof_predictions[id1] = clf.predict(X_val)
    # Average the test predictions over the configured number of folds
    # (previously a hard-coded 5, which silently breaks if CFG.nfolds changes).
    test_predictions += clf.predict(xtest.values).flatten() / CFG.nfolds
    # The column name literally ends in "+1" (e.g. "importance_fold0+1"); it is kept
    # unchanged because the feature-importance cell refers to these names.
    feature_importances[f"importance_fold{fold}+1"] = clf.feature_importances_
    
    stats[f'fold{fold}_train_mae']=clf.history['loss']
    stats[f'fold{fold}_val_mae']=clf.history['val_0_mae']
    
# Score the out-of-fold predictions against the training target
# (the original referenced an undefined name `y` here).
print(f'OOF score across folds: {mae(ytrain.values, oof_predictions.flatten())}')

In [None]:
# Out-of-fold predictions, keyed by training row id
prval = id_train.to_frame()
prval['pressure'] = oof_predictions
prval.to_csv('prval_tabnet.csv', index=False)

# Fold-averaged test predictions, keyed by test row id
prfull = id_test.to_frame()
prfull['pressure'] = test_predictions
prfull.to_csv('prfull_tabnet.csv', index=False)

In [None]:
# feature importances
# Average over every per-fold importance column (the original averaged only folds 0 and 1).
fold_importance_cols = [
    c for c in feature_importances.columns if str(c).startswith('importance_fold')
]
feature_importances['mean_importance'] = feature_importances[fold_importance_cols].mean(axis=1)
feature_importances.sort_values(by='mean_importance', ascending=False, inplace=True)
# Show the 25 features with the highest mean importance.
sns.barplot(y=feature_importances['feature'][:25],x=feature_importances['mean_importance'][:25], palette='inferno')
plt.title('Mean Feature Importance by Folds')
plt.show()

# Submission

In [None]:
# Build the submission from the saved test ids and the fold-averaged predictions.
# The original cell used an undefined `test` frame and the `row_id`/`target` columns
# of the notebook this one was adapted from; this dataset's files use `id` and `pressure`.
submission = pd.DataFrame({'id': id_test.values, 'pressure': test_predictions})
submission.to_csv('submission.csv', index=False)