In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns 
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this import
# raises ImportError on newer versions; no visible cell uses it.
from sklearn.datasets import load_boston
import warnings
# Silence every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Apply matplotlib's built-in 'ggplot' style to all subsequent figures.
plt.style.use('ggplot')

## 1) Load Data

In [None]:
# Read the competition's train and test tables (in that order) from the
# Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
    )
)
# Report (rows, columns) of each table as a quick sanity check.
print('train shape:', train.shape)
print('test shape:', test.shape)

In [None]:
# Display the first rows of the training table to eyeball its columns.
train.head()

In [None]:
# Training set: the target is the 'loss' column; the predictors are every
# column except the target and the row id.
y = train['loss'].values
X = train.drop(['loss', 'id'], axis=1)
# Test set: only the row id is dropped here.
X_test = test.drop('id', axis=1)
print('Train set:', X.shape)
print('Test set:', X_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

## Optuna

In [None]:
# import optuna
# from catboost import CatBoostRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import KFold

# def objective(trial, X=X_scaled, y=y):
#     """
#     A function to train a model using different hyperparameter combinations provided by Optuna. 
#     RMSE of the out-of-fold validation predictions is returned to estimate hyperparameter effectiveness.
#     """
#     oof_preds = np.zeros((X.shape[0],))
#     for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=10, shuffle=True, random_state=44).split(X_scaled, y)):
#         X_train, y_train = X_scaled[train_idx], y[train_idx]
#         X_valid, y_valid = X_scaled[valid_idx], y[valid_idx]
#         # A set of hyperparameters to optimize by optuna
#         cb_params = {
#                  "learning_rate": trial.suggest_float('learning_rate', 0.001, 1.0),
#                  "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 0.00001, 10),
#                  "bagging_temperature": trial.suggest_float('bagging_temperature', 0.0, 10.0),
#                  "random_strength": trial.suggest_float('random_strength', 1.0, 2.0),
#                  "depth": trial.suggest_int('depth', 6, 15),
#                  "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
#                  "leaf_estimation_method": trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
#             }

#         model = CatBoostRegressor(random_state=42,
#                                  thread_count=4,
#                                  verbose=False,
#                                  loss_function='RMSE',
#                                  eval_metric='RMSE',
#                                  od_type="Iter",
#                                  early_stopping_rounds=500,
#                                  use_best_model=True,
#                                  iterations=10000,
#                                  task_type="GPU",
#                                  **cb_params)
#         model.fit(X_train, y_train,
#                 eval_set=[(X_valid, y_valid)], verbose=False)
#         oof_preds[valid_idx] = model.predict(X_valid)
    
#     return mean_squared_error(y, oof_preds, squared=False)

In [None]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def objective(trial, X=X_scaled, y=y):
    """
    Optuna objective: fit a CatBoost regressor with trial-suggested
    hyperparameters and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X : array-like
        Feature matrix; defaults to the notebook-level ``X_scaled``.
    y : array-like
        Regression target; defaults to the notebook-level ``y``.

    Returns
    -------
    float
        RMSE of the fitted model on the 40% validation split (lower is better).
    """
    # Seed the split so every trial is scored on the same hold-out rows;
    # with an unseeded split, trial scores differ by sampling noise alone
    # and the "best" trial may just have drawn an easy validation set.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

    # A set of hyperparameters to optimize by optuna
    params = {
        "learning_rate": trial.suggest_float('learning_rate', 0.001, 1.0),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
    }

    # Only sample the parameter that belongs to the chosen bootstrap type;
    # "MVS" gets no extra parameter here.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    model = CatBoostRegressor(random_state=42,
                             verbose=False,
                             eval_metric='RMSE',
                             task_type="GPU",
                             **params)
    model.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)], verbose=False)

    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    return float(np.sqrt(mean_squared_error(y_valid, model.predict(X_valid))))

In [None]:
%%time
# Creating Optuna object and defining its parameters.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs; TPESampler is Optuna's default algorithm, so only
# the seeding changes.
# NOTE(review): GPU training itself may still introduce small run-to-run
# differences in the scores — confirm if exact reproducibility matters.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

## 2) Train Model
value: 7.820554598329104

In [None]:
# Hyperparameters for the final model: the values sampled by the best Optuna
# trial. The commented-out dict below is a hard-coded alternative
# (presumably from an earlier study run — confirm) that avoids re-running
# the search.
# params = {'learning_rate': 0.6966005931411917, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.293719685175609}
params = study.best_trial.params

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

n_splits = 10
# Explicit float accumulator, one slot per test row, for the fold-averaged
# test predictions.
test_preds = np.zeros(X_test_scaled.shape[0])
kf_rmse = []

# Seed the shuffle so fold membership (and therefore the reported RMSE) is
# reproducible between runs.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X_scaled, y)):
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[valid_idx], y[valid_idx]

    model = CatBoostRegressor(random_state=42,
                             verbose=False,
                             eval_metric='RMSE',
                             task_type="GPU",
                             **params)
    # NOTE(review): the fold used as eval_set is also the fold the RMSE is
    # reported on; if CatBoost picks its best iteration from eval_set, the
    # fold score may be slightly optimistic — confirm.
    model.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)], verbose=False)

    valid_pred = model.predict(X_valid)
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    # Sum the per-fold test predictions; they are averaged after the loop.
    test_preds += model.predict(X_test_scaled)

test_preds /= n_splits
print(f'Average KFold RMSE: {np.mean(kf_rmse):.5f}')

## Feature importances

In [None]:
# model_fi = model.feature_importances_

# x = np.arange(0, len(X.columns))
# height = 0.3
# fig, ax = plt.subplots(figsize=(10, 15))
# bars1 = ax.barh(x-height, model_fi, height=height,
#                 color="cornflowerblue",
#                 edgecolor="black",
#                 label='loss')
# ax.set_title("Feature importances", fontsize=20, pad=5)
# ax.set_ylabel("Feature names", fontsize=15, labelpad=5)
# ax.set_xlabel("Feature importance", fontsize=15, labelpad=5)
# ax.set_yticks(x)
# ax.set_yticklabels(X.columns, fontsize=8)
# ax.tick_params(axis="x", labelsize=10)
# ax.grid(axis="x")
# ax.legend(fontsize=13, loc="lower right")
# plt.margins(0.04, 0.01)
# plt.gca().invert_yaxis()

## Test Predictions

In [None]:
# Use the sample submission as the template for the output file.
preds = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")
# Bracket assignment always writes a real column. Attribute-style assignment
# (`preds.loss = ...`) only does so when the column already exists; otherwise
# it sets a plain Python attribute, and the resulting pandas warning would be
# hidden by the global warning filter.
preds['loss'] = test_preds
preds.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index.
preds.to_csv('submission.csv', index=False)