In [None]:
# !pip install optuna

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import optuna

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb
import lightgbm as lgb

Посмотрим на датасет.

In [None]:
# Load the competition training set and preview its first rows.
train_csv_path = '../input/tabular-playground-series-jan-2021/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Построим корреляционную матрицу, чтобы найти и выкинуть сильно коррелирующие признаки.

In [None]:
# Correlation matrix computed on a 1,000-row random sample of the training data.
# The sample is seeded so the heatmap is identical on every "Restart & Run All".
figure, ax = plt.subplots(figsize=(16, 16))
sample_corr = df.sample(n=1_000, random_state=1337).corr()
sns.heatmap(sample_corr, annot=True, linewidths=.5, ax=ax)
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title('Correlation matrix (1,000-row sample)');

Можно было бы выкинуть признак `cont6`, но так как признаков мало, оставим его.

Будем использовать ансамбль из двух библиотек градиентного бустинга, `XGBoost` и `LightGBM`.
Гиперпараметры оптимизируем с помощью библиотеки `optuna`.

In [None]:
def objective_xgb(trial, data, target):
    """Optuna objective: mean 5-fold cross-validated RMSE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna trial
        Object used to sample the hyperparameters (``suggest_*`` calls).
    data : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    target : pandas.Series
        Regression target aligned row-by-row with ``data``.

    Returns
    -------
    float
        RMSE averaged over the five folds (lower is better).
    """
    parameters = {
        'tree_method': 'gpu_hist',
        # suggest_float(..., log=True) is the non-deprecated spelling of
        # suggest_loguniform and samples from the same log-uniform range.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': 1000,
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    # Run the candidate through cross-validation and average the fold errors.
    folds = KFold(n_splits=5, random_state=1337, shuffle=True)
    rmse = []

    for train_idx, test_idx in folds.split(data, target):
        # Slice the function arguments, not notebook-level variables, so the
        # objective scores exactly the data it was given.
        X_train, X_test = data.iloc[train_idx], data.iloc[test_idx]
        y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

        model = xgb.XGBRegressor(**parameters)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)

        # sqrt(MSE) instead of `squared=False`: that keyword was deprecated and
        # later removed from scikit-learn's mean_squared_error.
        rmse.append(np.sqrt(mean_squared_error(y_test, model.predict(X_test))))

    mean_rmse = np.mean(rmse)
    print(f'Mean RMSE for all the folds: {mean_rmse}')

    return mean_rmse

Подбор считается долго и требует GPU, поэтому сам поиск ниже отключён, а полученные оптимальные параметры выписаны отдельно.

In [None]:
# Hyperparameter search for XGBoost. It is slow and needs a GPU, so it is kept
# disabled inside a string literal. objective_xgb takes (trial, data, target)
# while optuna only passes `trial`, hence the lambda that binds the data.
"""
xgb_features, xgb_target = df.drop(columns=['target', 'id']), df['target']

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(lambda trial: objective_xgb(trial, xgb_features, xgb_target), n_trials=50)

print(f'Number of finished trials: {len(study_xgb.trials)}')
print(f'Best trial: {study_xgb.best_trial.params}')
"""

In [None]:
# Final XGBoost configuration: fixed training settings first ...
xgb_parameters = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'n_estimators': 1000,
}
# ... followed by the hard-coded hyperparameter values (key order is preserved).
xgb_parameters.update({
    'lambda': 7.610705234008646,
    'alpha': 0.0019377246932580476,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    'learning_rate': 0.012,
    'max_depth': 20,
    'random_state': 24,
    'min_child_weight': 229,
})

Проделываем похожую процедуру с `LightGBM`.

In [None]:
def objective_lgb(trial):
    """Optuna objective: hold-out RMSE of a LightGBM model trained on ``df``.

    Reads the notebook-level ``df``, drops ``target`` and ``id`` from the
    features, holds out 20% of the rows (seed 1337) and trains with
    ``lgb.train`` using the sampled parameters and LightGBM's default number
    of boosting rounds.

    Parameters
    ----------
    trial : optuna trial
        Object used to sample the hyperparameters (``suggest_*`` calls).

    Returns
    -------
    float
        RMSE on the hold-out split (lower is better).
    """
    X, y = df.drop(columns=['target', 'id']).values, df['target'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

    parameters = {
        'device_type': 'gpu',
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # Only the training Dataset is needed: the hold-out rows are scored through
    # predict(), so no validation Dataset is built.
    ds_train = lgb.Dataset(X_train, label=y_train)
    gbm = lgb.train(parameters, ds_train)
    prediction = gbm.predict(X_test)

    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated and
    # later removed from scikit-learn's mean_squared_error.
    rmse = float(np.sqrt(mean_squared_error(y_test, prediction)))

    return rmse

In [None]:
# Hyperparameter search for LightGBM (100 trials of objective_lgb), deliberately
# disabled by keeping it inside a string literal so a full notebook run skips it.
# NOTE(review): the values in `lgb_parameters` presumably come from this study - confirm.
"""
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=100)

print(f'Number of finished trials: {len(study_lgb.trials)}')
print(f'Best trial: {study_lgb.best_trial.params}')
"""

In [None]:
# Final LightGBM configuration passed to LGBMRegressor further down.
# Every key is a valid identifier, so the keyword form of dict() builds the
# same mapping in the same key order.
lgb_parameters = dict(
    objective='regression',
    metric='rmse',
    boosting='gbdt',
    lambda_l1=3.2737454713243543e-07,
    lambda_l2=3.685676983230042e-06,
    num_leaves=190,
    feature_fraction=0.47291296723211934,
    bagging_fraction=0.8846579981793894,
    bagging_freq=3,
    min_child_samples=58,
    verbose=0,
    device_type='gpu',
)

In [None]:
class NonLinearTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that drops ``id`` and appends squared features.

    For every column except ``target`` a new column named ``'<column>^2'`` is
    added, holding the element-wise square of the original column.

    ``BaseEstimator`` supplies ``get_params`` / ``set_params`` so the step can
    be cloned and inspected like any other scikit-learn estimator. The mixin is
    listed first, as scikit-learn's developer guide recommends.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without ``id`` and with the squared columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Input features. It is not modified; a new frame is returned.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The remaining original columns followed by one ``'<column>^2'``
            column per non-``target`` column.
        """
        # errors='ignore' lets the transformer accept frames that have already
        # lost the identifier column instead of raising KeyError.
        features = X.drop(columns=['id'], errors='ignore')

        squares = {
            f'{c}^2': features[c] ** 2
            for c in features.columns
            if c != 'target'
        }

        # assign() returns a new frame with the squared columns appended in order.
        return features.assign(**squares)

In [None]:
def _boosting_pipeline(regressor):
    """Wrap ``regressor`` in the shared preprocessing steps (squares, then scaling)."""
    return Pipeline([
        ('custom', NonLinearTransformer()),
        ('scaling', StandardScaler()),
        ('regression', regressor),
    ])


# Both models use identical preprocessing; only the final estimator differs.
pipe_xgb = _boosting_pipeline(xgb.XGBRegressor(**xgb_parameters))
pipe_lgb = _boosting_pipeline(lgb.LGBMRegressor(**lgb_parameters))

In [None]:
# Read the competition files: the training set and the set to predict on.
train_path = '../input/tabular-playground-series-jan-2021/train.csv'
test_path = '../input/tabular-playground-series-jan-2021/test.csv'
df_train, df_predict = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Separate the regression target from the remaining columns.
y = df_train['target']
X = df_train.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=1337)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Fit both pipelines on the training split and report hold-out metrics.
pipe_xgb.fit(X_train, y_train)
pipe_lgb.fit(X_train, y_train)

print(f'XGB Score: {pipe_xgb.score(X_test, y_test)}, LGB Score: {pipe_lgb.score(X_test, y_test)}')

# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# was deprecated and later removed from scikit-learn's mean_squared_error.
rmse_xgb = np.sqrt(mean_squared_error(y_test, pipe_xgb.predict(X_test)))
rmse_lgb = np.sqrt(mean_squared_error(y_test, pipe_lgb.predict(X_test)))
print(f'XGB RMSE: {rmse_xgb}, LGB RMSE: {rmse_lgb}')

In [None]:
def ensemble_predict(X, xgb_weight=0.85, lgb_weight=0.15):
    """Blend the predictions of the fitted XGBoost and LightGBM pipelines.

    Parameters
    ----------
    X : input accepted by ``pipe_xgb.predict`` / ``pipe_lgb.predict``
        Rows to predict on.
    xgb_weight : float, default 0.85
        Weight applied to the XGBoost predictions.
    lgb_weight : float, default 0.15
        Weight applied to the LightGBM predictions.

    Returns
    -------
    list
        One blended prediction per row:
        ``xgb_weight * xgb_prediction + lgb_weight * lgb_prediction``.
    """
    # Both prediction vectors are cast to float64 so the blend is computed in
    # double precision regardless of the dtype each model returns.
    target_xgb = np.asarray(pipe_xgb.predict(X), dtype=np.float64)
    target_lgb = np.asarray(pipe_lgb.predict(X), dtype=np.float64)

    # Vectorised weighted sum; list() keeps the original list return type.
    return list(xgb_weight * target_xgb + lgb_weight * target_lgb)

In [None]:
# Hold-out RMSE of the blended prediction, computed as sqrt(MSE) rather than
# with `squared=False`: that keyword was deprecated and later removed from
# scikit-learn's mean_squared_error.
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predict(X_test)))
print(f'Ensemble RMSE: {ensemble_rmse}')

In [None]:
# Refit both pipelines on the full training data (X, y) before predicting on
# the competition test set; this replaces the fits made on the hold-out split.
pipe_xgb.fit(X, y)
pipe_lgb.fit(X, y)

In [None]:
# Assemble the submission: test-set ids paired with the blended predictions,
# written without the DataFrame index.
submission_columns = {'id': df_predict['id'], 'target': ensemble_predict(df_predict)}
target = pd.DataFrame(submission_columns)
target.to_csv('submission.csv', index=False)