## Setup

In [None]:
# Libraries
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

import shap

In [None]:
# Data: competition train/test tables, indexed by their first CSV column
data = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/test.csv',
    index_col=0,
)

# The last training column is the target; every column before it is a predictor
preds, target = data.columns[:-1], data.columns[-1]

In [None]:
# Preprocessing: predictors whose name contains 'cat' become pandas categoricals
# in both the training and the test frame
cat_cols = [name for name in preds if 'cat' in name]
for frame in (data, test):
    frame[cat_cols] = frame[cat_cols].astype('category')

In [None]:
# Best params: running dict that each tuning stage below merges its result into
best_params: dict = {}

## Optimize n_estimators

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for a sampled ``n_estimators``.

    Only ``n_estimators`` is set on the regressor; nothing else is passed to
    its constructor.
    """
    # Search space
    n_trees = trial.suggest_int('n_estimators', 100, 1000)

    # Evaluation: one RMSE per held-out fold
    fold_rmse = []

    splitter = KFold(5)
    for fit_rows, holdout_rows in splitter.split(data):
        fit_part = data.iloc[fit_rows]
        holdout_part = data.iloc[holdout_rows]
        X_fit, y_fit = fit_part[preds], fit_part[target]
        X_holdout, y_holdout = holdout_part[preds], holdout_part[target]

        model = LGBMRegressor(n_estimators=n_trees)
        model.fit(
            X_fit,
            y_fit,
            eval_set=(X_holdout, y_holdout),
            eval_metric='rmse',
            categorical_feature=cat_cols,
            verbose=0,
        )

        fold_rmse.append(
            mean_squared_error(y_holdout, model.predict(X_holdout), squared=False)
        )

    return np.mean(fold_rmse)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=0.25 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Optimize Tree Properties - Num_leaves

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for a sampled ``num_leaves``.

    The parameters accumulated so far in the notebook-level ``best_params``
    are reused, with this trial's sampled value layered on top.

    Args:
        trial: Optuna trial that supplies the sampled value.

    Returns:
        Mean RMSE over the five validation folds (lower is better).
    """
    sampled = {
        # LightGBM rejects num_leaves <= 1, so the search starts at 2.
        'num_leaves': trial.suggest_int('num_leaves', 2, 63),
    }

    # Built once per trial (it does not depend on the fold).  The sampled
    # values come last so they override any same-named entry already in
    # best_params, e.g. when this cell is re-run after its result was merged;
    # otherwise every trial would silently evaluate the stored value.
    hyper_params = {**best_params, **sampled}

    # Evaluation
    scores = []

    kf = KFold(5)
    for train_idx, test_idx in kf.split(data):
        X_train = data.iloc[train_idx][preds]
        y_train = data.iloc[train_idx][target]
        X_test = data.iloc[test_idx][preds]
        y_test = data.iloc[test_idx][target]

        estimator = LGBMRegressor(**hyper_params)

        # The held-out fold is passed as eval_set for metric tracking only;
        # no early stopping is requested here.
        estimator.fit(X_train,
                      y_train,
                      eval_set=(X_test, y_test),
                      eval_metric='rmse',
                      categorical_feature=cat_cols,
                      verbose=0)

        y_pred = estimator.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(rmse)

    return np.mean(scores)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=1.5 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Optimize Tree Properties - Others

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for ``min_data_in_leaf``/``max_bin``.

    The parameters accumulated so far in the notebook-level ``best_params``
    are reused, with this trial's sampled values layered on top.

    Args:
        trial: Optuna trial that supplies the sampled values.

    Returns:
        Mean RMSE over the five validation folds (lower is better).
    """
    sampled = {
        # The Optuna name must equal the LightGBM parameter name exactly (no
        # stray whitespace): study.best_params is keyed by this name and is
        # later forwarded to LGBMRegressor as keyword arguments.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'max_bin': trial.suggest_int('max_bin', 15, 2043),
    }

    # Built once per trial (it does not depend on the fold).  The sampled
    # values come last so they override any same-named entry already in
    # best_params, e.g. when this cell is re-run after its result was merged;
    # otherwise every trial would silently evaluate the stored values.
    hyper_params = {**best_params, **sampled}

    # Evaluation
    scores = []

    kf = KFold(5)
    for train_idx, test_idx in kf.split(data):
        X_train = data.iloc[train_idx][preds]
        y_train = data.iloc[train_idx][target]
        X_test = data.iloc[test_idx][preds]
        y_test = data.iloc[test_idx][target]

        estimator = LGBMRegressor(**hyper_params)

        # The held-out fold is passed as eval_set for metric tracking only;
        # no early stopping is requested here.
        estimator.fit(X_train,
                      y_train,
                      eval_set=(X_test, y_test),
                      eval_metric='rmse',
                      categorical_feature=cat_cols,
                      verbose=0)

        y_pred = estimator.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(rmse)

    return np.mean(scores)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=1 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Importance: relative importance of the parameters searched in this study
plot_param_importances(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Optimize Regularization - Fraction

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for the bagging/feature sampling.

    Samples ``bagging_freq``, ``bagging_fraction`` and ``feature_fraction``.
    The parameters accumulated so far in the notebook-level ``best_params``
    are reused, with this trial's sampled values layered on top.

    Args:
        trial: Optuna trial that supplies the sampled values.

    Returns:
        Mean RMSE over the five validation folds (lower is better).
    """
    # LightGBM requires both fractions to be strictly greater than 0, so the
    # search uses a small positive lower bound instead of 0.
    min_fraction = 0.05

    sampled = {
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 100),
        # The Optuna name must equal the LightGBM parameter name exactly (no
        # stray whitespace): study.best_params is keyed by this name and is
        # later forwarded to LGBMRegressor as keyword arguments.
        'bagging_fraction': trial.suggest_float('bagging_fraction', min_fraction, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', min_fraction, 1.0),
    }

    # Built once per trial (it does not depend on the fold).  The sampled
    # values come last so they override any same-named entry already in
    # best_params, e.g. when this cell is re-run after its result was merged;
    # otherwise every trial would silently evaluate the stored values.
    hyper_params = {**best_params, **sampled}

    # Evaluation
    scores = []

    kf = KFold(5)
    for train_idx, test_idx in kf.split(data):
        X_train = data.iloc[train_idx][preds]
        y_train = data.iloc[train_idx][target]
        X_test = data.iloc[test_idx][preds]
        y_test = data.iloc[test_idx][target]

        estimator = LGBMRegressor(**hyper_params)

        # The held-out fold is passed as eval_set for metric tracking only;
        # no early stopping is requested here.
        estimator.fit(X_train,
                      y_train,
                      eval_set=(X_test, y_test),
                      eval_metric='rmse',
                      categorical_feature=cat_cols,
                      verbose=0)

        y_pred = estimator.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(rmse)

    return np.mean(scores)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=2 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Importance: relative importance of the parameters searched in this study
plot_param_importances(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Optimize Regularization - L1

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for a sampled ``lambda_l1``.

    The parameters accumulated so far in the notebook-level ``best_params``
    are reused, with this trial's sampled value layered on top.

    Args:
        trial: Optuna trial that supplies the sampled value.

    Returns:
        Mean RMSE over the five validation folds (lower is better).
    """
    sampled = {
        # Sampled on a log scale between 1e-12 and 25.
        'lambda_l1': trial.suggest_float('lambda_l1', 1E-12, 25, log=True),
    }

    # Built once per trial (it does not depend on the fold).  The sampled
    # values come last so they override any same-named entry already in
    # best_params, e.g. when this cell is re-run after its result was merged;
    # otherwise every trial would silently evaluate the stored value.
    hyper_params = {**best_params, **sampled}

    # Evaluation
    scores = []

    kf = KFold(5)
    for train_idx, test_idx in kf.split(data):
        X_train = data.iloc[train_idx][preds]
        y_train = data.iloc[train_idx][target]
        X_test = data.iloc[test_idx][preds]
        y_test = data.iloc[test_idx][target]

        estimator = LGBMRegressor(**hyper_params)

        # The held-out fold is passed as eval_set for metric tracking only;
        # no early stopping is requested here.
        estimator.fit(X_train,
                      y_train,
                      eval_set=(X_test, y_test),
                      eval_metric='rmse',
                      categorical_feature=cat_cols,
                      verbose=0)

        y_pred = estimator.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(rmse)

    return np.mean(scores)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=1 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Optimize Regularization - Others

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE for the remaining regularizers.

    Samples ``lambda_l2``, ``path_smooth`` and ``cat_smooth``.  The parameters
    accumulated so far in the notebook-level ``best_params`` are reused, with
    this trial's sampled values layered on top.

    Args:
        trial: Optuna trial that supplies the sampled values.

    Returns:
        Mean RMSE over the five validation folds (lower is better).
    """
    # All three are sampled on a log scale between 1e-12 and 20.
    sampled = {
        'lambda_l2': trial.suggest_float('lambda_l2', 1E-12, 20, log=True),
        'path_smooth': trial.suggest_float('path_smooth', 1E-12, 20, log=True),
        'cat_smooth': trial.suggest_float('cat_smooth', 1E-12, 20, log=True),
    }

    # Built once per trial (it does not depend on the fold).  The sampled
    # values come last so they override any same-named entry already in
    # best_params, e.g. when this cell is re-run after its result was merged;
    # otherwise every trial would silently evaluate the stored values.
    hyper_params = {**best_params, **sampled}

    # Evaluation
    scores = []

    kf = KFold(5)
    for train_idx, test_idx in kf.split(data):
        X_train = data.iloc[train_idx][preds]
        y_train = data.iloc[train_idx][target]
        X_test = data.iloc[test_idx][preds]
        y_test = data.iloc[test_idx][target]

        estimator = LGBMRegressor(**hyper_params)

        # The held-out fold is passed as eval_set for metric tracking only;
        # no early stopping is requested here.
        estimator.fit(X_train,
                      y_train,
                      eval_set=(X_test, y_test),
                      eval_metric='rmse',
                      categorical_feature=cat_cols,
                      verbose=0)

        y_pred = estimator.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(rmse)

    return np.mean(scores)

In [None]:
# Optimization: minimise the value returned by `objective` (mean CV RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, timeout=1 * 3600)

In [None]:
# Best score: lowest objective value (mean CV RMSE) reached by the study
study.best_value

In [None]:
# Optimization history of the study
plot_optimization_history(study)

In [None]:
# Importance: relative importance of the parameters searched in this study
plot_param_importances(study)

In [None]:
# Best params: merge this study's best parameters into the running dict,
# then display the accumulated result
best_params.update(study.best_params)
best_params

## Evaluation with low learning rate

In [None]:
# Evaluation: k-fold CV with the tuned parameters, a low learning rate and
# early stopping.  Each fold's model also predicts the test set; the
# submission is the average of those k predictions.
k = 10
test[target] = 0.0  # accumulator for the averaged test prediction

scores = []

# Fold-independent settings, applied once instead of on every iteration:
# a small learning rate with a large tree budget that early stopping trims.
best_params['learning_rate'] = 0.005
best_params['n_estimators'] = 100000

# Parameter names are forwarded to LightGBM verbatim, so strip any stray
# whitespace that may have been carried over from the Optuna parameter names;
# a padded name would not match the LightGBM parameter it was tuned for.
final_params = {name.strip(): value for name, value in best_params.items()}

kf = KFold(k)
for train_idx, test_idx in kf.split(data):
    X_train = data.iloc[train_idx][preds]
    y_train = data.iloc[train_idx][target]
    X_test = data.iloc[test_idx][preds]
    y_test = data.iloc[test_idx][target]

    estimator = LGBMRegressor(**final_params)

    # The held-out fold drives early stopping (patience of 1000 rounds);
    # progress is logged every 1000 iterations.
    estimator.fit(X_train,
                  y_train,
                  eval_set=(X_test, y_test),
                  eval_metric='rmse',
                  early_stopping_rounds=1000,
                  categorical_feature=cat_cols,
                  verbose=1000)

    y_pred = estimator.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    scores.append(rmse)

    # Each fold contributes 1/k of the final test prediction
    test[target] += estimator.predict(test[preds]) / k

test[target].to_csv('submission.csv')

In [None]:
# Mean of the per-fold validation RMSEs collected in `scores`
print(f"Expected score: {np.mean(scores)}")

In [None]:
# Shap values - only applied on the last estimator.
# `estimator` and `X_test` are whatever the final CV iteration left behind,
# i.e. the last fold's model and its held-out rows.
explainer = shap.TreeExplainer(estimator)
shap_values = explainer.shap_values(X_test)

In [None]:
# Summary: bar-type SHAP summary plot for the last fold's held-out rows
shap.summary_plot(shap_values, X_test, plot_type="bar")