In [None]:
!pip install -U scikit-learn

import os
import time

import numpy as np
import pandas as pd
from scipy import stats

RANDOM_SEED = 111

np.random.seed(RANDOM_SEED)

from datetime import datetime
from numpy.random import default_rng
rng = default_rng(RANDOM_SEED)

import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_squared_log_error
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, make_scorer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler, OneHotEncoder, Binarizer, KBinsDiscretizer, QuantileTransformer, PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.ensemble import StackingRegressor

DS_DIR = '/kaggle/input/aug21-ds'
INPUT_DIR = '/kaggle/input/tabular-playground-series-aug-2021'
OUTPUT_DIR = './'

In [None]:
# Display the scikit-learn version that the kernel actually imported.
sklearn.__version__

In [None]:
!pip install -q h2o

import h2o
from h2o.automl import H2OAutoML
from h2o.sklearn import H2OAutoMLRegressor
h2o.init()

### Load source data

We split the training dataset into a train part (80%) and a holdout part (20%); the holdout part is used to validate the training results and the stacking.

In [None]:
# Read the competition data; `id` becomes the index so it is never used as a feature.
train_raw_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='id')
test_df = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'), index_col='id')

# Shuffle deterministically, then cut the first 20% off as the holdout part.
train_shuffled_df = train_raw_df.sample(frac=1, random_state=RANDOM_SEED)
holdout_size = train_shuffled_df.shape[0]//5
holdout_part_df = train_shuffled_df.iloc[:holdout_size]
train_part_df = train_shuffled_df.iloc[holdout_size:]

# Separate the target from the features. A non-inplace drop() returns a new frame, which
# avoids mutating an iloc slice (SettingWithCopyWarning) and keeps the cell idempotent.
holdout_labels_df = holdout_part_df['loss']
holdout_df = holdout_part_df.drop(columns='loss')

labels = train_part_df['loss']
train_df = train_part_df.drop(columns='loss')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the same order.
total_df = pd.concat([train_df, holdout_df, test_df])

# Fingerprint of the split: the printed values should stay the same across runs with the same seed.
print('validate sample seed: ', holdout_df.iloc[100, :5])

In [None]:
# Per-feature summary over train + holdout + test; columns are, in order:
# min, max, mean, std, number of unique values.
summary_columns = [
    total_df.min(),
    total_df.max(),
    total_df.mean(),
    total_df.std(),
    total_df.nunique(),
]
pd.concat(summary_columns, axis=1)

### Build H2O model

The built-in H2O stacking algorithm reaches a private score of **7.87097**.<br/>
To improve the score, increase `max_runtime_secs` to 3600 (1 hour).

In [None]:
class LiteAutoMLWrapper(BaseEstimator):
  """scikit-learn style adapter around a ``TabularAutoML`` regression model.

  NOTE(review): ``lama``, ``Task`` and ``TabularAutoML`` are not imported anywhere in the
  visible notebook (presumably they come from LightAutoML); import them before using this class.

  Args:
    all_cols: names of the feature columns; all of them are declared numeric.
    timeout: time budget handed to ``TabularAutoML``.
  """

  def __init__(self, all_cols, timeout=3600):
    self.all_cols = all_cols
    # Stored so that BaseEstimator.get_params() / clone() can read the constructor argument back.
    self.timeout = timeout
    self.roles = {
        #lama.dataset.roles.CategoryRole(dtype=float): all_cols,
        # `np.float` was only an alias of the builtin float and was removed in NumPy 1.24.
        lama.dataset.roles.NumericRole(dtype=float): all_cols
    }

    # Regression task optimised and evaluated with MSE.
    task = Task('reg', loss='mse', metric='mse')
    self.automl = TabularAutoML(task=task, timeout=timeout,
                           reader_params  = {'cv': 5, 'random_state': RANDOM_SEED},
                           general_params = {'use_algos': [['lgb', 'lgb_tuned', 'cb', 'cb_tuned']]})

  def fit(self, X, y, **kwargs):
    """Train the wrapped AutoML on features ``X`` and target ``y``; extra kwargs are ignored.

    Returns:
      self, following the scikit-learn estimator convention.
    """
    self.automl.fit_predict({'data': X, 'target': y}, train_features=self.all_cols, roles=self.roles)
    return self

  def predict(self, X):
    """Return the first column of the prediction data produced by the wrapped AutoML for ``X``."""
    y_pred = self.automl.predict({'data': X}, features_names=self.all_cols)
    y_pred = y_pred.data[:, 0]
    return y_pred


class FLAMLWrapper(BaseEstimator):
  """scikit-learn style adapter around ``flaml.AutoML``.

  NOTE(review): ``flaml`` is not imported anywhere in the visible notebook; import it
  before using this class.

  Args:
    **kwargs: settings forwarded unchanged to ``flaml.AutoML.fit`` on every ``fit`` call.
  """

  def __init__(self, **kwargs):
    self.settings = kwargs
    self.automl = flaml.AutoML()

  def fit(self, X, y, **kwargs):
    """Fit the wrapped AutoML with the settings given at construction time.

    Keyword arguments passed to ``fit`` itself are accepted but ignored.

    Returns:
      self, following the scikit-learn estimator convention.
    """
    self.automl.fit(X, y, **self.settings)
    return self

  def predict(self, X):
    """Return the wrapped AutoML's predictions for ``X``."""
    y_pred = self.automl.predict(X)
    return y_pred


def RMSE(y_true, y_pred, **kwargs):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    Any extra keyword arguments are forwarded to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred, **kwargs)
    return np.sqrt(mse)

In [None]:
# Convert the three feature tables to H2O frames (train, holdout, test).
train_frame, holdout_frame, test_frame = (
    h2o.H2OFrame(features) for features in (train_df, holdout_df, test_df)
)

# AutoML settings: 10-minute budget, 5-fold CV, RMSE for both early stopping and ranking.
automl_settings = dict(
    seed=RANDOM_SEED,
    max_runtime_secs=600,
    nfolds=5,
    stopping_metric='RMSE',
    sort_metric='RMSE',
    stopping_rounds=10,
    verbosity='warn',
)
model = H2OAutoMLRegressor(**automl_settings)

model.fit(train_frame, labels.values)

# Predict with the fitted model and flatten each result to a 1-D array.
test_pred, train_pred, holdout_pred = (
    np.squeeze(model.predict(frame).as_data_frame().values)
    for frame in (test_frame, train_frame, holdout_frame)
)

In [None]:
# RMSE of the AutoML model on the train and holdout parts.
score_train, score_holdout = (
    RMSE(predicted, actual)
    for predicted, actual in (
        (train_pred, labels.values),
        (holdout_pred, holdout_labels_df.values),
    )
)
print(score_train, score_holdout)

In [None]:
# First 12 rows of the AutoML leaderboard.
model.estimator.leaderboard.as_data_frame().head(12)

In [None]:
#h2o.get_model(model.estimator.get_best_model().metalearner().model_id)

# Model ids from leaderboard rows 2..11 (the first two rows are skipped —
# presumably the stacked ensembles; verify against the leaderboard shown above).
leaderbrd = model.estimator.leaderboard.as_data_frame()['model_id'].iloc[2:12].values

test_ds_pred, train_ds_pred, holdout_ds_pred = [], [], []
prediction_sinks = (
    (test_ds_pred, test_frame),
    (train_ds_pred, train_frame),
    (holdout_ds_pred, holdout_frame),
)
for model_id in leaderbrd:
  mdl = h2o.get_model(model_id)
  # Same order as before for every sub-model: test, train, holdout.
  for collected, frame in prediction_sinks:
    collected.append(np.squeeze(mdl.predict(frame).as_data_frame().values))

# Stack the per-model vectors into matrices with one column per sub-model.
test_ds_pred, train_ds_pred, holdout_ds_pred = (
    np.vstack(per_model).T
    for per_model in (test_ds_pred, train_ds_pred, holdout_ds_pred)
)

### Save new or load existing predicted data

In [None]:
# Write each prediction matrix to a CSV file in the working directory (no index column).
for pred_matrix, csv_name in (
    (test_ds_pred, 'test_pred.csv'),
    (train_ds_pred, 'train_pred.csv'),
    (holdout_ds_pred, 'holdout_pred.csv'),
):
  output_res = pd.DataFrame(data=pred_matrix)
  output_res.to_csv(csv_name, index=False)

In [None]:
# Load the sub-model prediction matrices stored in DS_DIR; this overwrites any
# matrices computed earlier in the session with the saved ones.
test_ds_pred, train_ds_pred, holdout_ds_pred = (
    pd.read_csv(os.path.join(DS_DIR, csv_name)).values
    for csv_name in ('test_pred.csv', 'train_pred.csv', 'holdout_pred.csv')
)

# Plain NumPy views of the targets.
train_labels = labels.values
holdout_labels = holdout_labels_df.values

### Do manual stacking and blending
We apply several stacking and blending approaches to the best 10 sub-models of the same H2O model and compare the results:

|stacking type|private score
|---|---
|Take the best fold in holdout dataset (no stacking)|7.89102
|Mean/median among numeric float label values|7.89707
|Voting among discrete label values|7.92492
|Best weighted average against holdout labels|7.87679
|Add a normally-distributed noise to existing predictions|7.86517
|H2O stacking with predicted values from holdout dataset|7.87028
|H2O default stacking with 20% test pseudolabels|7.86660
|H2O recursive chaining with pseudolabels|7.84977

In [None]:
#1) Take the best fold in holdout dataset (no stacking)
#Private Score: 7.89102

# Start from +inf instead of a hard-coded ceiling: with the old sentinel of 10, best_idx
# silently stayed at -1 whenever every holdout RMSE was >= 10.
best_holdout_score, scores, best_idx = np.inf, [], -1
for i in range(train_ds_pred.shape[1]):
  # Column i holds the predictions of sub-model i.
  score_train = RMSE(train_ds_pred[:, i], labels.values)
  score_holdout = RMSE(holdout_ds_pred[:, i], holdout_labels_df.values)
  print(score_train, score_holdout)
  scores.append(score_holdout)
  if score_holdout < best_holdout_score:
    best_holdout_score = score_holdout
    best_idx = i

# From here on `scores` no longer holds RMSE values: it holds the sub-model indices
# ordered from the lowest to the highest holdout RMSE.
scores = np.argsort(scores)

print('\n', 'index with best score:', best_idx, ', score ranks:', scores)

In [None]:
#2) Mean/median among numeric float label values
#Private Score: 7.89707 (mean)

# First printed line: row-wise mean of the sub-model predictions; second line: row-wise median.
for combine in (np.mean, np.median):
  score_train = RMSE(combine(train_ds_pred, 1), labels.values)
  score_holdout = RMSE(combine(holdout_ds_pred, 1), holdout_labels_df.values)
  print(score_train, score_holdout)

In [None]:
#3) Voting among discrete label values
#Private Score: 7.92492

# Round every prediction to the nearest integer, then take the most frequent value per row.
train_votes = stats.mode(np.round(train_ds_pred), axis=1).mode
holdout_votes = stats.mode(np.round(holdout_ds_pred), axis=1).mode

score_train = RMSE(np.squeeze(train_votes), labels.values)
score_holdout = RMSE(np.squeeze(holdout_votes), holdout_labels_df.values)
print(score_train, score_holdout)

In [None]:
#4) Best weighted average against holdout labels
#7.877126121980474 (0.075, 0.8, 0.125) - Private Score: 7.87690
#7.885931889115389 (0.0875, 0.8, 0.1125) - Private Score: 7.87679


import itertools

def minimize_weights(cols, n, maxv, preds=None, targets=None, tol=1e-6):
  """Grid-search the blending weights that minimise the MSE of a weighted average.

  Every combination of grid weights whose sum is 1 (within ``tol``) is tried on the
  selected prediction columns; the combination with the lowest MSE wins.

  Args:
    cols: 1-D integer array with the column indices (sub-models) of ``preds`` to blend.
    n: grid density; the grid has ``round(n * maxv * 10) + 1`` evenly spaced points.
    maxv: largest weight tried for a single sub-model (the grid spans ``[0, maxv]``).
    preds: prediction matrix with one column per sub-model. Defaults to the
      notebook-level ``holdout_ds_pred``.
    targets: true labels for the rows of ``preds``. Defaults to
      ``holdout_labels_df.values``.
    tol: absolute tolerance used when checking that a weight combination sums to 1.

  Returns:
    ``(cols, best_weights)``. ``best_weights`` follows the order of the INPUT ``cols``;
    the returned ``cols`` is re-ordered by ascending accumulated weight, so the two
    are not aligned with each other. ``best_weights`` is an empty list when no
    combination sums to 1.
  """
  cols = np.asarray(cols)
  if preds is None:
    preds = holdout_ds_pred
  if targets is None:
    targets = holdout_labels_df.values
  targets = np.asarray(targets)

  size = len(cols)
  best_score = np.inf  # no arbitrary ceiling on the MSE
  best_weights = []
  perf = np.zeros(size)

  linspace = np.round(np.linspace(0, maxv, round(n*maxv*10)+1), 8)
  print(linspace[:50])
  print(cols)

  # Loop-invariant: rows are the selected sub-models, columns are the samples.
  selected = np.asarray(preds).T[cols]

  for x in itertools.product(linspace, repeat=size):
    # Compare with a tolerance: an exact `sum(x) == 1` rejects valid combinations
    # whose floating-point sum is off by one ulp (e.g. 0.7 + 0.2 + 0.1).
    if abs(sum(x) - 1.0) > tol:
      continue
    blend = np.average(selected, axis=0, weights=x)
    score = np.mean((blend - targets) ** 2)  # MSE of the blend
    if score < best_score:
      best_score = score
      best_weights = x
      # Accumulate the weights of every improving combination; used only for ranking cols.
      perf = perf + x
      #print(np.sqrt(score), x)

  cols = cols[perf.argsort()]
  print(np.sqrt(best_score), best_weights, cols)
  return cols, best_weights

# Full flow for selecting the best sub-models (kept for reference; each step narrows the
# candidate set and refines the weight grid):
#cols = scores
#cols, best_weights = minimize_weights(cols[:12], 0.5, 0.8)
#cols, best_weights = minimize_weights(cols[-6:], 2, 0.8)
#cols, best_weights = minimize_weights(cols[-4:], 4, 0.8)
#cols, best_weights = minimize_weights(cols[-3:], 8, 0.8)

# Quick flow: search the weights for three fixed sub-models directly.
cols, best_weights = minimize_weights(cols=np.array([9, 0, 8]), n=16, maxv=0.8)

In [None]:
# Reset `cols` to the order that was passed to minimize_weights, because
# best_weights follows that input order.
cols = np.array([9, 0, 8])

train_blend = np.average(train_ds_pred.T[cols], axis=0, weights=best_weights)
holdout_blend = np.average(holdout_ds_pred.T[cols], axis=0, weights=best_weights)

score_train = RMSE(train_blend, labels.values)
score_holdout = RMSE(holdout_blend, holdout_labels_df.values)
print(score_train, score_holdout)

In [None]:
#5) Add a normally-distributed noise to existing predictions
#Private Score: 7.86517

# Standard deviation of the zero-mean Gaussian noise added to every prediction.
NOISE_STD = 0.03

# One vectorized draw per array instead of one Python-level RNG call per element.
# The draws still come from the global NumPy generator, in train -> test -> holdout order.
train_pred_noise = train_pred + np.random.normal(0, NOISE_STD, size=np.shape(train_pred))
test_pred_noise = test_pred + np.random.normal(0, NOISE_STD, size=np.shape(test_pred))
holdout_pred_noise = holdout_pred + np.random.normal(0, NOISE_STD, size=np.shape(holdout_pred))

score_train = RMSE(train_pred_noise, labels.values)
score_holdout = RMSE(holdout_pred_noise, holdout_labels_df.values)
print(score_train, score_holdout)

In [None]:
#6) H2O stacking with predicted values from holdout dataset
#Private Score: 7.87028

# The sub-model prediction matrices become the features of a second-level AutoML model.
holdout_stack_frame, train_stack_frame, test_stack_frame = (
    h2o.H2OFrame(pred_matrix)
    for pred_matrix in (holdout_ds_pred, train_ds_pred, test_ds_pred)
)

stacker_settings = dict(
    seed=RANDOM_SEED,
    max_runtime_secs=600,
    nfolds=5,
    stopping_metric='RMSE',
    sort_metric='RMSE',
    stopping_rounds=10,
    verbosity='warn',
)
model = H2OAutoMLRegressor(**stacker_settings)

# The second-level model is trained on the holdout part only.
model.fit(holdout_stack_frame, holdout_labels_df.values)

test_stack_pred, train_stack_pred, holdout_stack_pred = (
    np.squeeze(model.predict(frame).as_data_frame().values)
    for frame in (test_stack_frame, train_stack_frame, holdout_stack_frame)
)

In [None]:
# RMSE of the second-level (stacked) model on the train and holdout parts.
score_train, score_holdout = (
    RMSE(predicted, actual)
    for predicted, actual in (
        (train_stack_pred, labels.values),
        (holdout_stack_pred, holdout_labels_df.values),
    )
)
print(score_train, score_holdout)

In [None]:
#7) H2O default stacking with 20% test pseudolabels
#Private Score: 7.86660

# Use the current test predictions as pseudo-labels and keep a random 20% of the test rows.
test_pseudo_df = test_df.copy()
test_pseudo_df['loss'] = test_pred
test_pseudo_df = test_pseudo_df.sample(frac=1, random_state=RANDOM_SEED)
test_pseudo_df = test_pseudo_df.iloc[:test_pseudo_df.shape[0]//5]

# Training set = real train rows + pseudo-labelled test rows.
train_labelled_df = train_df.copy()
train_labelled_df['loss'] = labels
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the same order.
train_labelled_df = pd.concat([train_labelled_df, test_pseudo_df])
train_pseudo_labels = train_labelled_df['loss']
# Non-inplace drop keeps the cell free of in-place mutation.
train_pseudo_df = train_labelled_df.drop(columns='loss')

model = H2OAutoMLRegressor(seed=RANDOM_SEED, max_runtime_secs=600, nfolds=5, stopping_metric='RMSE', sort_metric='RMSE', 
                           stopping_rounds=10, verbosity='warn')

# This model is fitted and queried with plain NumPy arrays rather than H2O frames.
model.fit(train_pseudo_df.values, train_pseudo_labels.values)
test_pred2 = model.predict(test_df.values)
train_pred2 = model.predict(train_df.values)
holdout_pred2 = model.predict(holdout_df.values)

In [None]:
# RMSE of the pseudo-label model on the train and holdout parts.
score_train, score_holdout = (
    RMSE(predicted, actual)
    for predicted, actual in (
        (train_pred2, labels.values),
        (holdout_pred2, holdout_labels_df.values),
    )
)
print(score_train, score_holdout)

In [None]:
#8) H2O recursive chaining with pseudolabels
#The algorithm is taken from:
#https://www.kaggle.com/c/tabular-playground-series-aug-2021/discussion/270051
#Private Score: 7.84977

# One estimator object, re-fitted repeatedly by the chaining steps below.
chaining_settings = dict(
    seed=RANDOM_SEED,
    max_runtime_secs=600,
    nfolds=5,
    stopping_metric='RMSE',
    sort_metric='RMSE',
    stopping_rounds=10,
    verbosity='warn',
)
model = H2OAutoMLRegressor(**chaining_settings)

def recursive_train(test_pred, learning_rate = 0.2, output_res=None):
  """Run one chaining round and return the corrected test predictions.

  The notebook-level ``model`` is re-fitted twice: first on the test features with the
  current test predictions as pseudo-labels, then on the train features with the
  resulting train residuals as target. The residuals predicted for the test rows are
  added to ``test_pred``, scaled by ``learning_rate``.

  Args:
    test_pred: current predictions for the rows of ``test_frame``.
    learning_rate: fraction of the predicted residual added to ``test_pred``.
    output_res: optional DataFrame; when given, its ``loss`` column is set to the
      updated predictions and it is written to ``submission.csv``.

  Returns:
    The updated test predictions.
  """
  # Step 1: learn the pseudo-labels on the test set, then see how that model does on train.
  model.fit(test_frame, test_pred)
  train_pred = np.squeeze(model.predict(train_frame).as_data_frame().values)
  new_pred = labels.values - train_pred  # residuals on the train part
  print('train loss:', RMSE(train_pred, labels.values))

  # Step 2: learn those residuals on the train set and predict them for the test set.
  model.fit(train_frame, new_pred)
  error_prediction = np.squeeze(model.predict(test_frame).as_data_frame().values)

  test_pred = test_pred + (error_prediction * learning_rate)
  # `is not None` instead of truthiness: `if output_res:` raises ValueError for a DataFrame.
  if output_res is not None:
    output_res['loss'] = test_pred
    output_res.to_csv('submission.csv', index=False)

  return test_pred

# Baseline fit on the real train labels: provides the starting test predictions.
model.fit(train_frame, labels.values)
test_pred, train_pred = (
    np.squeeze(model.predict(frame).as_data_frame().values)
    for frame in (test_frame, train_frame)
)
print('train loss:', RMSE(train_pred, labels.values))

# Ten chaining rounds, each one refining the predictions produced by the previous round.
test_pred_new = test_pred
x = 0
while x < 10:
  test_pred_new = recursive_train(test_pred_new)
  x += 1
x = 9  # keep the value the original `for x in range(10)` left behind

### Save submission data

In [None]:
# Build the submission (columns: id, loss) and write it without the DataFrame index.
submission_columns = {'id': test_df.index, 'loss': test_pred_new}
output_res = pd.DataFrame(data=submission_columns, index=test_df.index)
output_res.to_csv('submission.csv', index=False)