In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import optuna

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_path = '/kaggle/input/tabular-playground-series-aug-2021/train.csv'
test_path = '/kaggle/input/tabular-playground-series-aug-2021/test.csv'
train_pd, test_pd = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Build the model matrices as plain NumPy arrays.
# The first column of both tables is skipped (the submission cell re-uses it as
# 'id'); the last training column is taken as the regression target.
target_position = train_pd.shape[1] - 1
X = train_pd.iloc[:, 1:target_position].values
y = train_pd.iloc[:, target_position].values

X_test = test_pd.iloc[:, 1:].values

# Optional feature scaling, currently disabled:
# scaler = RobustScaler()
# X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of a LightGBM regressor for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        RMSE on the 33% hold-out split (lower is better).
    """
    # A fixed random_state gives every trial the same split, so differences in
    # the score come from the hyperparameters rather than from split noise.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.33, random_state=42
    )
    train_data = lgb.Dataset(X_train, label=y_train)

    # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
    # suggest_loguniform / suggest_uniform; parameter names are unchanged so
    # study.best_params keeps the same keys.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgb.train(params, train_data)

    y_pred = gbm.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
    # deprecated in recent scikit-learn releases. Arguments are (y_true, y_pred).
    return float(np.sqrt(mean_squared_error(y_val, y_pred)))

# Run 100 Optuna trials, searching for the hyperparameters that minimise the
# value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=100)

In [None]:
# Hold out 33% of the training rows for validation. The split is seeded so that
# re-running the notebook yields the same partition.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set so both share the same feature binning.
validation_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [None]:
# Refit LightGBM with the best Optuna hyperparameters on each of `nfold` folds and
# average the folds' predictions for the test set into `result`.
lgb_params = {
    **study.best_params,
    'objective': 'regression',
    'metric': 'rmse',
    'force_col_wise': True,
}

nfold = 10
# NOTE(review): StratifiedKFold treats `y` as class labels, so this relies on the
# target taking discrete values — confirm; for a continuous target use KFold.
# The shuffle is seeded so the folds are reproducible.
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=42)

result = np.zeros(X_test.shape[0])
for train_index, val_index in skf.split(X, y):
    # Fold-specific names keep the loop from overwriting the notebook-level
    # X_train / X_val / train_data / validation_data that later cells rely on.
    X_fold_train, y_fold_train = X[train_index], y[train_index]
    X_fold_val, y_fold_val = X[val_index], y[val_index]

    lgbm_model = lgb.train(lgb_params, lgb.Dataset(X_fold_train, label=y_fold_train))

    # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
    # deprecated in recent scikit-learn releases. Arguments are (y_true, y_pred).
    rmse = np.sqrt(mean_squared_error(y_fold_val, lgbm_model.predict(X_fold_val)))
    print(f'RMSE: {rmse:.4f}')

    # Accumulate this fold's test predictions; divided by nfold after the loop.
    result += lgbm_model.predict(X_test)

result /= nfold

In [None]:
# Show the current value of `result` as the cell output.
result

In [None]:
# Hand-picked LightGBM hyperparameters for a DART-boosted regression model.
param = {
    'objective': 'regression',
    'metric': 'rmse',

    'max_bin': 150,
    # `num_tree` is an alias of num_iterations; when present in the params dict it
    # takes precedence over the num_boost_round argument passed to lgb.train.
    'num_tree': 1000,
    'max_depth': 8,
    'learning_rate': 0.05,
    'num_leaves': 100,
    'boosting': 'dart',
    'extra_trees': True,

    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.5,
    # Previously a second 'lambda_l1' key, which silently replaced the 0.5 above
    # and left L2 regularisation at its default.
    'lambda_l2': 2.0,
    'min_gain_to_split': 0.005
}

In [None]:
# Train the hand-tuned DART model on the current X_train / X_val split.
# Fresh Dataset objects are built here instead of re-using `train_data` /
# `validation_data`: a Dataset is binned the first time it is trained on, and
# LightGBM rejects a later change of `max_bin` (set in `param`) on a Dataset
# that has already been constructed.
# NOTE: `param['num_tree']` is an alias of num_iterations and takes precedence
# over `num_round` below.
num_round = 500
dart_train_data = lgb.Dataset(X_train, label=y_train)
dart_validation_data = lgb.Dataset(X_val, label=y_val, reference=dart_train_data)
lgbm_model = lgb.train(param, dart_train_data, num_round, valid_sets=[dart_validation_data])

In [None]:
# Predict the test set with the LightGBM model currently bound to `lgbm_model`
# and show the predictions as the cell output.
lgbm_result = lgbm_model.predict(X_test)
lgbm_result

In [None]:
import catboost as cb

# Fresh 60/40 hold-out split for CatBoost, seeded so the partition is reproducible
# across notebook runs, wrapped in CatBoost Pool objects.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)
train_pool = cb.Pool(X_train, y_train)
val_pool = cb.Pool(X_val, y_val)

In [None]:
# CatBoost regressor configuration: RMSE is both the training loss and the
# evaluation metric; trees use Bayesian bootstrap.
cb_params = dict(
    loss_function='RMSE',
    eval_metric='RMSE',
    iterations=4500,
    learning_rate=0.05,
    depth=4,
    l2_leaf_reg=3.0,
    border_count=171,
    bootstrap_type='Bayesian',
    bagging_temperature=0.4550402433961105,
    random_strength=0.5986843602259072,
    use_best_model=True,
)

# Fit on the training pool, evaluating on the validation pool with an
# early-stopping window of 500 rounds and best-model selection requested.
cb_model = cb.CatBoostRegressor(**cb_params)
cb_model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=500)

In [None]:
# Predict the test set with the fitted CatBoost model and show the predictions
# as the cell output.
cb_result = cb_model.predict(X_test)
cb_result

In [None]:
import xgboost as xgb

# Fresh 60/40 hold-out split for XGBoost, seeded so the partition is reproducible
# across notebook runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# XGBoost DART regressor using histogram-based tree construction.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    tree_method='hist',
    objective='reg:squarederror',
    booster='dart',
    # Only `n_jobs` is set: `nthread` is its deprecated alias, and passing both
    # with different values (4 vs 8) was contradictory.
    n_jobs=4,
    importance_type='gain',
    # The metric is configured on the estimator; passing `eval_metric` to fit()
    # is deprecated in newer XGBoost releases.
    eval_metric='rmse',
    verbosity=1
)

# Fit on the training split and report RMSE on the validation split each round.
xgb_model = xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
)

In [None]:
# Predict the test set with the fitted XGBoost model and show the predictions
# as the cell output.
xgb_result = xgb_model.predict(X_test)
xgb_result

In [None]:
# Blend the three models with equal weights (simple arithmetic mean).
# NOTE(review): this rebinds `result`, replacing the fold-averaged predictions
# stored under the same name earlier in the notebook — confirm that is intended.
blend_total = lgbm_result + cb_result + xgb_result
result = blend_total / 3
result

In [None]:
# Assemble the submission table: the first test column as 'id' and the blended
# predictions as 'loss', then print it and write it to submission.csv.
submission_columns = {'id': test_pd.iloc[:, 0], 'loss': result}
result_csv = pd.DataFrame(data=submission_columns)
print(result_csv)
result_csv.to_csv('submission.csv', index=False)