## Tabular Playground Series May 2021

<img src="https://i.imgur.com/uHVJtv0.png">
<img src="https://opengraph.githubassets.com/275715e91fe6af32b0c8907606985ff18606c16c36c6d74935f5e9a10b4608c1/stanfordmlgroup/ngboost">

<br><br>

### Notebook Contents:

Given the probabilistic nature of the output, I've decided to try [NGBoost](https://stanfordmlgroup.github.io/projects/ngboost/), a wonderful gradient boosting library developed by the great [Stanford ML group](https://stanfordmlgroup.github.io/).

A first try gave results comparable to those I obtained using LightGBM (plus 5 hours of Optuna hyperparameter search).

**Under construction**

##### Props

Props to [corochann](https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization): I believe this notebook is the best one you can find about Optuna.

<h5> Versioning </h5>

V3 was the submitted run, with a Public Leaderboard score of 1.08820.

In [None]:
import torch
# Accelerator flag derived from CUDA availability.
# NOTE(review): `device` is not referenced in the visible cells — confirm it is still needed.
device = 'gpu' if torch.cuda.is_available() else 'cpu'
import numpy as np
import pandas as pd
# Let pandas display up to 100 columns when rendering a DataFrame.
pd.options.display.max_columns = 100

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import optuna
import hyperopt
import tqdm
import gc
import os
# Directory the competition CSVs (train / test / sample_submission) are read from.
root_path = '/kaggle/input/tabular-playground-series-may-2021'

In [None]:
# Load the competition files from `root_path`.
train = pd.read_csv(os.path.join(root_path, 'train.csv'))
test = pd.read_csv(os.path.join(root_path, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(root_path, 'sample_submission.csv'))

# label mapping: '<prefix><k>' -> k - 1 (zero-based class index).
# The whole trailing run of digits is parsed, so class numbers >= 10 map
# correctly instead of being truncated to their last digit.
unique_targets = train['target'].unique().tolist()
label_mapping = {
    name: int(name[len(name.rstrip('0123456789')):]) - 1
    for name in unique_targets
}

# preprocessing

train['target'] = train['target'].map(label_mapping)
# Stack train and test so every encoder sees all values of a column.
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so the first
# `train_len` rows are exactly the training rows.
dataset = pd.concat([train, test], axis=0, ignore_index=True)
train_len = len(train)

features = dataset.drop(['id', 'target'], axis=1).columns.tolist()

# Columns with fewer than 10 distinct values are treated as categorical.
n_unique = dataset[features].nunique()
categorical_feature_columns = n_unique[n_unique < 10].index.tolist()

label = LabelEncoder()

for column in categorical_feature_columns:
    # The encoder is refit per column; it is not reused across columns.
    dataset[column] = label.fit_transform(dataset[column])

# NOTE(review): these are just 0..k-1 counters, not positions inside `features`;
# `cat_indices` below holds the real positions. Kept because other cells may
# still reference the name.
categorical_features = list(range(len(categorical_feature_columns)))

train_preprocessed = dataset[:train_len]
test_preprocessed = dataset[train_len:]

assert train_preprocessed.shape[1] == test_preprocessed.shape[1]

# The raw frames are no longer needed; release them before training.
del train, test
gc.collect()
# Position of each categorical column inside `features`.
cat_indices = [features.index(i) for i in categorical_feature_columns]

In [None]:
# Install the pinned NGBoost release used by the cells below.
!pip install ngboost==0.3.10

In [None]:
from ngboost import NGBClassifier
from ngboost.distns import k_categorical

# --- Hyperparameter search ---------------------------------------------------
OPTUNA_OPTIMIZATION = True  # Run the Optuna study before the final training
N_TRIALS = 3                # Trial budget (alternative to TIME)
TIME = 2 * 3600             # Time budget in seconds, passed as the study timeout

# --- Cross-validation --------------------------------------------------------
N_SPLITS = 5                # Number of folds for validation
FOLD_RANDOM_SEED = 42       # Seed for the fold splitter
REPEAT = True               # Use the repeated stratified splitter

# --- Model -------------------------------------------------------------------
# Rounds passed as `early_stopping_rounds` when fitting.
EARLY_STOP = 50

# Arguments given to every NGBClassifier regardless of the tuned values.
FIXED_PARAMS = dict(
    random_state=42,
    Dist=k_categorical(4),  # four target classes (Class_1 .. Class_4)
    verbose=True,
    verbose_eval=100,
    n_estimators=500,
)

In [None]:
import sklearn
from sklearn.tree import DecisionTreeRegressor

# scikit-learn 1.0 renamed the 'mse' criterion to 'squared_error' and 1.2
# removed the old spelling. Pick whichever the installed version accepts so
# the candidates work on both old and current environments.
_SKLEARN_MAJOR_MINOR = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
_MSE_CRITERION = 'squared_error' if _SKLEARN_MAJOR_MINOR >= (1, 0) else 'mse'

# Candidate base learners offered to the hyperparameter search:
# two split criteria x two tree depths.
dtr_friedman_3 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3)
dtr_friedman_5 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=5)
dtr_mse_3 = DecisionTreeRegressor(criterion=_MSE_CRITERION, max_depth=3)
dtr_mse_5 = DecisionTreeRegressor(criterion=_MSE_CRITERION, max_depth=5)

In [None]:
# Fold splitter shared by the hyperparameter search and the final training.
# `n_splits` is passed by keyword: RepeatedStratifiedKFold only accepts keyword
# arguments in recent scikit-learn releases, so the positional form fails there.
if REPEAT:
    # Two repetitions of the stratified split -> 2 * N_SPLITS folds in total.
    skfold = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=2,
                                     random_state=FOLD_RANDOM_SEED)
else:
    skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True,
                             random_state=FOLD_RANDOM_SEED)

def objective(trial, cv=skfold):
    """Optuna objective: mean cross-validated log loss of an NGBClassifier.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the base learner, `natural_gradient`, `col_sample`,
        `minibatch_frac` and `learning_rate`.
    cv : splitter exposing ``split(X, y)`` and ``get_n_splits(X, y)``
        Defaults to the module-level `skfold`.

    Returns
    -------
    float
        Average validation log loss over all folds (lower is better).
    """
    # NOTE(review): the choices for "Base" are estimator objects rather than
    # primitives; Optuna may warn about this and they presumably cannot be
    # persisted by a non in-memory storage — confirm if a storage is added.
    # NOTE(review): the 1E-16 lower bounds allow near-zero sampling fractions;
    # consider a larger floor if such trials prove degenerate.
    param_ngb = {
        "Base": trial.suggest_categorical("Base", [dtr_friedman_3, dtr_friedman_5,
                                                   dtr_mse_3, dtr_mse_5]),
        "natural_gradient": trial.suggest_categorical("natural_gradient", [True, False]),
        "col_sample": trial.suggest_float('col_sample', 1E-16, 1.0),
        "minibatch_frac": trial.suggest_float('minibatch_frac', 1E-16, 1.0),
        "learning_rate": trial.suggest_categorical('learning_rate', [0.001, 0.005, 0.01, 0.05, 0.1]),
    }
    # Fixed settings take precedence over anything sampled above.
    param_ngb.update(FIXED_PARAMS)

    # Extract the arrays once; the splitter yields positional indices, so the
    # folds are taken by position rather than through label-based lookups.
    X = train_preprocessed[features].values
    y = train_preprocessed['target'].values.astype(int)

    losses = []

    for train_idx, val_idx in tqdm.tqdm(cv.split(X, y), total=cv.get_n_splits(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[val_idx], y[val_idx]

        model = NGBClassifier(**param_ngb)
        # The validation fold drives early stopping and is also the scoring set.
        model.fit(X_train, y_train, X_val=X_valid, Y_val=y_valid,
                  early_stopping_rounds=EARLY_STOP)
        scores = model.predict_proba(X_valid)
        losses.append(log_loss(y_valid, scores))

    return np.average(losses)

In [None]:
if OPTUNA_OPTIMIZATION:
    # Minimise the cross-validated log loss returned by `objective`.
    study = optuna.create_study(study_name = 'ngb_parameter_opt', direction="minimize")

    # The search is bounded by wall-clock time (TIME seconds), not by a trial count.
    study.optimize(objective, timeout=TIME, show_progress_bar = True)

    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Final configuration = fixed settings overridden by the best sampled values.
    best_params = FIXED_PARAMS.copy()
    best_params.update(trial.params)
else:
    # Without a search, fall back to the fixed settings so the training cells
    # that read `best_params` still run instead of raising NameError.
    best_params = FIXED_PARAMS.copy()


In [None]:
# Build a classifier from the tuned hyperparameters.
# NOTE(review): the cross-validation cell that follows rebinds `final_model`
# for every fold, so this instance is never fitted — confirm it is still needed.
if OPTUNA_OPTIMIZATION:
    final_model = NGBClassifier(**best_params)

In [None]:
# Train one model per fold with the selected hyperparameters, score it on the
# held-out fold and collect its test-set probabilities for later averaging.
test_preds = []
loglosses = []

# Loop-invariant arrays are extracted once. The splitter yields positional
# indices, so folds are taken by position.
X_full = train_preprocessed[features].values
y_full = train_preprocessed['target'].values.astype(int)
X_test = test_preprocessed[features].values

for kfold, (train_idx, val_idx) in enumerate(skfold.split(X_full, y_full)):

    final_model = NGBClassifier(**best_params)

    X_train, y_train = X_full[train_idx], y_full[train_idx]
    X_valid, y_valid = X_full[val_idx], y_full[val_idx]

    # The validation fold is used both for early stopping and for the score
    # reported below.
    final_model.fit(X_train, y_train, X_val = X_valid, Y_val = y_valid,
                    early_stopping_rounds=EARLY_STOP)

    probs = final_model.predict_proba(X_valid)

    logloss = log_loss(y_valid, probs)
    loglosses.append(logloss)
    print('Fold: {}\t Validation logloss: {}\n'.format(kfold, logloss))

    test_preds.append(final_model.predict_proba(X_test))

print("Best Parameters mean logloss: {}".format(np.mean(loglosses)))

In [None]:
# Average the per-fold test probabilities into a single prediction matrix.
test_predictions = np.mean(test_preds, axis = 0)
assert len(test_predictions) == len(test_preprocessed)

class_columns = ["Class_1", "Class_2", "Class_3", "Class_4"]
predictions_df = pd.DataFrame(test_predictions, columns = class_columns)

# Take the ids from the very rows that were predicted, by position (.values),
# so the pairing cannot be disturbed by index alignment, and put `id` first.
predictions_df.insert(0, 'id', test_preprocessed['id'].values)

predictions_df.to_csv("submission.csv", index = False)