I wanted to follow the suggestions outlined by Aayush Kumar Singha, linked here: https://www.kaggle.com/c/tabular-playground-series-aug-2021/discussion/258009. For this month's edition of the Tabular Playground, I'll therefore focus on fitting LGBM, XGB, and CatBoost regression models with their default settings ("blind" fitting) before tuning them with Optuna. First, we need to load all the required libraries and data.

# 1. Loading Libraries and Data

In [None]:
#!pip install autogluon
#!pip install autokeras

In [None]:
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import catboost
from catboost import CatBoostRegressor
import optuna
import sklearn
import math
from fractions import Fraction
import shutil
import gc
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
#from autogluon.tabular import TabularPredictor, TabularDataset
#import autokeras as ak

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the training data, using the 'id' column as the DataFrame index.
train = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/train.csv', index_col = 'id')
train.head()  # Preview the first rows.

In [None]:
train.shape  # Display (rows, columns); noted for the Keras model used down the line.

In [None]:
# Separate the target from the features. `train_x` is the same DataFrame
# object as `train`, so `train` itself no longer has a 'loss' column afterwards.
train_y = train.pop('loss')  # Labels
train_x = train
train_x.head()  # Features

In [None]:
# Standardise the features. The fitted scaler object is kept so that the
# same transformation can be applied to other data later.
scaler = StandardScaler().fit(train_x)
train_x = scaler.transform(train_x)

In [None]:
# Hold out 20% of the rows for validation (80% is used for training).
# No random_state is passed, so the split differs between runs.
train_x_train, train_x_valid, train_y_train, train_y_valid = train_test_split(train_x, train_y, train_size = 0.8)

In [None]:
# Load the test data, using the 'id' column as the DataFrame index.
test = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/test.csv', index_col = 'id')
test.head()  # Preview the first rows.

In [None]:
# Apply the scaler fitted on the training features to the test data;
# `test` is rebound to the transformed result.
test = scaler.transform(test)

# 2. Blind Model Fitting

I defined a function that takes a model name as an argument and fits the corresponding model. This streamlines the process whenever a different model is desired.

In [None]:
def model(train_x, train_y, argument):
    """Fit a default-configured regressor and predict the module-level `test` data.

    Args:
        train_x: Training features passed to the estimator's ``fit``.
        train_y: Training targets passed to the estimator's ``fit``.
        argument: Model name: 'LGBM', 'XGB' or 'CatBoost'.

    Returns:
        The fitted estimator's predictions for the module-level ``test``.

    Raises:
        ValueError: If ``argument`` is not one of the supported model names.
    """
    supported = ('LGBM', 'XGB', 'CatBoost')
    # Validate up front so a typo gives a clear error instead of an
    # AttributeError from calling .fit on a placeholder string.
    if argument not in supported:
        raise ValueError(
            "Invalid Selection: {!r} (expected one of {})".format(argument, supported)
        )
    # Map names to classes (not instances) so only the requested model is built.
    estimators = {
        'LGBM': LGBMRegressor,
        'XGB': XGBRegressor,
        'CatBoost': CatBoostRegressor,
    }
    regressor = estimators[argument]()
    regressor.fit(train_x, train_y)
    predictions = regressor.predict(test)
    return predictions

In [None]:
#argument = 'XGB'
#predictions = model(train_x, train_y, argument)

# 3. Optuna

We can improve the quality of the models by combining K-fold cross-validation with Optuna hyperparameter tuning.

In [None]:
#Credit here for objective functions and K-Fold Implementation: https://www.kaggle.com/michael127001/xgbregressor-with-optuna-tuning
def Optuna(argument, n_trials=5, n_splits=10, random_state=None):
    """Tune one boosting regressor with Optuna, then predict `test` with a fold ensemble.

    An Optuna study maximises the mean negative RMSE obtained from K-fold
    cross-validation on the module-level ``train_x`` / ``train_y``. The best
    configuration is then refitted once per fold, and the per-fold predictions
    for the module-level ``test`` data are averaged.

    Args:
        argument: Model name: 'LGBM', 'XGB' or 'CatBoost'.
        n_trials: Number of Optuna trials to run.
        n_splits: Number of folds used for tuning and for the final ensemble.
        random_state: Seed passed to every ``KFold``; ``None`` keeps the
            unseeded shuffling.

    Returns:
        Predictions for ``test`` averaged over the ``n_splits`` fold models.

    Raises:
        ValueError: If ``argument`` is not one of the supported model names.
    """
    supported = ('LGBM', 'XGB', 'CatBoost')
    # Fail early with a clear message rather than handing Optuna a string
    # where it expects a callable objective.
    if argument not in supported:
        raise ValueError(
            "Invalid Selection: {!r} (expected one of {})".format(argument, supported)
        )

    estimator_cls = {
        'LGBM': LGBMRegressor,
        'XGB': XGBRegressor,
        'CatBoost': CatBoostRegressor,
    }[argument]

    # Settings that are not tuned. They are kept separate from the search
    # space so they can be re-applied to the final models: `trial.params`
    # only contains the values that were suggested by Optuna.
    fixed_params = {
        'LGBM': {
            'boosting_type': 'goss',
            'n_jobs': 4,
        },
        'XGB': {
            'n_jobs': 4,
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
        },
        'CatBoost': {
            'task_type': 'GPU',
            'bootstrap_type': 'Bernoulli',
            'eval_metric': 'RMSE',
            'verbose': False,
        },
    }[argument]

    def suggest_params(trial):
        """Sample the tunable hyperparameters for the selected model."""
        # The XGB search space names the learning rate 'eta'; the other two
        # use 'learning_rate'. The trial name always equals the estimator
        # keyword so `trial.params` can be passed straight to the estimator.
        rate_name = 'eta' if argument == 'XGB' else 'learning_rate'
        params = {
            'max_depth': trial.suggest_int('max_depth', 6, 12),
            rate_name: trial.suggest_float(rate_name, 0.007, 0.013),
            'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
            'subsample': trial.suggest_float('subsample', 0.5, 0.9, step=0.1),
            'reg_lambda': trial.suggest_int('reg_lambda', 1, 50),
        }
        # These three are only part of the LGBM and XGB search spaces.
        if argument != 'CatBoost':
            params['min_child_weight'] = trial.suggest_int('min_child_weight', 5, 20)
            params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 0.9, step=0.1)
            params['reg_alpha'] = trial.suggest_int('reg_alpha', 0, 50)
        return params

    def objective(trial, x_train=train_x, y_train=train_y):
        """Return the mean cross-validated negative RMSE for one trial."""
        kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        params = dict(fixed_params)
        params.update(suggest_params(trial))
        scores = cross_val_score(
            estimator_cls(**params), x_train, y_train, cv=kfolds,
            scoring="neg_root_mean_squared_error",
        )
        return scores.mean()

    # The score is a negated RMSE, so larger is better.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Rebuild the full configuration that was actually evaluated during tuning.
    best_params = dict(fixed_params)
    best_params.update(trial.params)

    # KFold yields positional indices. Converting to arrays makes the
    # indexing positional even when train_y is a Series with its own index.
    features = np.asarray(train_x)
    targets = np.asarray(train_y)

    fold_predictions = []
    kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, valid_idx in kfolds.split(features, targets):
        X_train, y_train = features[train_idx], targets[train_idx]
        X_valid, y_valid = features[valid_idx], targets[valid_idx]
        model = estimator_cls(**best_params)
        if argument in ('LGBM', 'XGB'):
            model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], eval_metric = 'rmse', verbose = False)
        else:
            model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = False)
        valid_pred = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
        print(rmse)
        fold_predictions.append(model.predict(test))
    # Average the per-fold test predictions.
    predictions_optuna = np.mean(fold_predictions, axis=0)
    return predictions_optuna

In [None]:
#argument = 'XGB'
#predictions_optuna = Optuna(argument)

# 4. Blending Models

Now that we have several tuned models, it would be a good idea to blend them together into a better model. 

In [None]:
#predictions_lgbm = Optuna('LGBM')
#predictions_xgb = Optuna('XGB')
#predictions_catboost = Optuna('CatBoost')
#predictions_blend = Fraction(1,3) * predictions_lgbm + Fraction(1,3) * predictions_xgb + Fraction(1,3) * predictions_catboost

# 5.1. AutoGluon

In [None]:
#Have to reload files for AutoML
#train = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/train.csv', index_col = 'id')
#train.drop(columns = ['id'], inplace=True)
#test = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/test.csv', index_col = 'id')
#test.drop(columns = ['id'], inplace=True)

In [None]:
#Credit here for Autogluon Model Training: https://www.kaggle.com/aayush26/tps-aug-2021-autogluon-101
#model = TabularPredictor(problem_type='regression', label = 'loss')
#model.fit(train_data=train, presets='best_quality', time_limit = 20000, num_stack_levels = 3,num_bag_folds = 5,num_bag_sets = 1,)
#del train['loss']

In [None]:
#model.leaderboard()

In [None]:
#predictions_autogluon = model.predict(TabularDataset(test))

# 5.2. AutoKeras

In [None]:
#model = ak.StructuredDataRegressor(overwrite=True, loss = 'mean_squared_error', 
#                                   metrics=[tf.keras.metrics.RootMeanSquaredError()])
#model.fit('/kaggle/input/tabular-playground-series-aug-2021/train.csv',"loss")

In [None]:
#predictions_autokeras = model.predict(test)

# 6. Pseudolabel

We can use the predictions of the Optuna-tuned XGBRegressor (which has the lowest RMSE thus far) as pseudolabels for the test data, and train on them to make a better model.

In [None]:
#train = pd.read_csv('train.csv', index_col = 'id')
#test = pd.read_csv('test.csv', index_col = 'id')
#pseudolabel = predictions_optuna
#test_concat = test
#test_concat['loss'] = pseudolabel
#test_concat.head()

In [None]:
#test_30 = test_concat.sample(frac = 0.3)
#test_30.shape

In [None]:
#train_data = [train, test_30]
#train = pd.concat(train_data)
#train.shape

In [None]:
#train.index = range(295000)

In [None]:
#train_y = train['loss'] 
#train_x = train
#train_x.drop(columns = ['loss'], inplace=True)
#train_x.head()

In [None]:
#scaler = StandardScaler() #Scaling required.
#train_x = scaler.fit_transform(train_x)
#del test['loss']
#test = scaler.transform(test)

In [None]:
#argument = 'XGB'
#predictions_pseudolabel = Optuna(argument)

# 7.1. KERAS

In [None]:
# Fully connected regression network. The input width is fixed at 100 and
# must match the number of feature columns fed to `fit` / `predict`.
input_shape = [100]
model = keras.Sequential([
    # Only the first layer needs the input shape; later layers infer theirs.
    layers.Dense(units = 512, activation = 'relu', input_shape=input_shape),
    layers.Dense(units = 512, activation = 'relu'),
    layers.Dense(units = 10, activation = 'relu'),
    layers.BatchNormalization(),
    # A single linear output unit: the target is one value per row, so the
    # network must emit one prediction per row rather than ten.
    layers.Dense(units = 1),
])
model.compile(
    optimizer = 'adam',
    loss = 'mse',
    # Report RMSE alongside the MSE loss.
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

In [None]:
# Early-stopping callback that restores the best weights seen when it stops.
# Changes smaller than `min_delta` are not counted as an improvement.
# NOTE(review): `monitor` and `patience` are not set, so the library defaults
# apply — confirm those defaults are what is intended here.
early_stopping = callbacks.EarlyStopping(
    min_delta = 0.001,
    restore_best_weights = True
)

In [None]:
# Train on the 80% training split for at most 50 epochs, evaluating on the
# held-out validation split and letting the early-stopping callback end
# training early.
model.fit(
    train_x_train, train_y_train,
    validation_data = (train_x_valid, train_y_valid),
    epochs = 50,
    verbose = 1,
    callbacks = [early_stopping]
)

In [None]:
# Predict on the scaled test data with the trained Keras model.
predictions_keras = model.predict(test)

# 8. More Blending

In [None]:
# Exact rational constant 140/363.
# NOTE(review): presumably a blending weight; it is not used in the visible code — confirm.
FRACTION = Fraction(140, 363)

In [None]:
# Load previously saved prediction files from the working directory.
# NOTE(review): the rank_N names presumably order them by leaderboard score — confirm.
rank_1 = pd.read_csv('XGBOptuna.csv')
rank_2 = pd.read_csv('XGBPseudoLabel.csv')
rank_3 = pd.read_csv('CatXGBPseudoLabel.csv')
rank_4 = pd.read_csv('LGBMOptuna.csv')
rank_5 = pd.read_csv('AutoGluon.csv')
# NOTE(review): rank_6 reads the same file as rank_4 ('LGBMOptuna.csv');
# this looks like a copy-paste slip — confirm which file was intended.
rank_6 = pd.read_csv('LGBMOptuna.csv')
rank_7 = pd.read_csv('AutoKeras.csv')

# Submission

In [None]:
# Build the submission: start from the sample file and overwrite its 'loss'
# column with the Keras predictions.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
# Flatten the predictions so exactly one value is assigned per row
# (presumably they arrive as a 2-D, single-column array — confirm). A length
# mismatch still raises, so a wrongly shaped prediction is not hidden.
sample_submission['loss'] = np.ravel(predictions_keras)
# No reset_index() here: without assignment or inplace=True it returned a
# new frame and left this one unchanged.
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)

In [None]:
#shutil.rmtree('AutogluonModels')
#gc.collect()