In [None]:
# Switch off Jedi in the IPython tab-completer for this kernel session.
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow, imread

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.stats as stats

import lightgbm as lgb
import warnings

import optuna

In [None]:
# Seed shared by the seeded components below: the KFold splitter and the
# LightGBM model built inside the Optuna objective.
R_SEED = 37

In [None]:
# When True, X/y are built from the complete training set (see the `if submit:` cell).
submit = True # for some testing

In [None]:
# All competition files live in the same Kaggle input folder; keeping the path
# in one place lets the notebook be repointed (e.g. for a local run) with a
# single edit.
DATA_DIR = '/kaggle/input/tabular-playground-series-aug-2021'

submission_ex = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Keep the target in its own frame, then strip the non-feature columns
# ('id' everywhere, plus 'loss' from the training features).
targets_df = train_df.loc[:, ['loss']].copy()
train_df = train_df.drop(columns=['id', 'loss'])
test_df = test_df.drop(columns=['id'])

In [None]:
def plot_fea_hist(df, fea_name, bins):
    """Draw a histogram of one column of ``df`` on a new 10x10 inch figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to plot.
    fea_name : str
        Name of the column to plot; also used as the axes title.
    bins : int
        Forwarded to ``Series.hist`` as ``bins``.

    Returns
    -------
    None
        The figure is left open so the notebook renders it with the cell.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    df[fea_name].hist(bins=bins, color='k', alpha=0.5, ax=ax)
    ax.set_title(fea_name)

#### An interesting property of the target value
This is the distribution of the `loss` values. There are no negative values!

In [None]:
# Histogram of the raw target (42 bins), followed by its sample skewness.
plot_fea_hist(df=targets_df, fea_name='loss', bins=42)
print('skew: ', targets_df.loc[:, 'loss'].skew())

If we put **rmse** aside, the distribution would also have a negative side.<br/>
Something like this:

In [None]:
# Illustration only: flip the sign of a random half of the targets to show what
# a two-sided version of the distribution would look like.
n_rows = len(targets_df)  # derived from the data instead of a hard-coded row count
signs = np.ones(n_rows)
signs[:n_rows // 2] = -1
# Local seeded generator: the picture is reproducible and the global NumPy RNG
# state is left untouched.
np.random.default_rng(R_SEED).shuffle(signs)
targets_df['loss_real'] = targets_df['loss'] * signs
plot_fea_hist(targets_df, 'loss_real', 84)
print('skew: ', targets_df['loss_real'].skew())
print('just an illustration!')

Only now can we see how big a problem this zero value creates.

In [None]:
# Remove the illustration-only column. errors='ignore' keeps the cell safe to
# re-run: a second execution no longer fails because the column is already gone.
targets_df = targets_df.drop(columns=['loss_real'], errors='ignore')

In [None]:
if submit:
    # Final run: fit on every training row.
    X = train_df.copy()
    y = targets_df[['loss']].copy()
else:
    # Local testing: keep a seeded ~80% of the rows for X/y and hold the rest
    # out as my_X/my_y, so X and y are defined whichever way `submit` is set.
    msk = np.random.default_rng(R_SEED).random(len(train_df)) < 0.8
    X = train_df[msk].copy()
    my_X = train_df[~msk].copy()
    y = targets_df.loc[msk, ['loss']].copy()
    my_y = targets_df.loc[~msk, ['loss']].copy()

#### Optuna

While having more training data than test data is usually desirable, I swapped their roles here. That way we have more than 150000 samples for testing in cross-validation. So, if we get big differences for the submitted data compared to the scores here ...

In [None]:
# Shared 3-fold splitter; the shuffle is seeded, so each call to split()
# yields the same folds.
kfolds = KFold(
    n_splits=3,
    shuffle=True,
    random_state=R_SEED,
)

In [None]:
def tune(objective, n_trials=1000, timeout=3 * 60 * 60):
    """Run an Optuna minimisation study and return the best parameters found.

    Parameters
    ----------
    objective : callable
        Function taking an ``optuna.trial.Trial`` and returning the score to
        minimise.
    n_trials : int, optional
        Maximum number of trials (default 1000, the previously hard-coded value).
    timeout : int, optional
        Time budget in seconds (default three hours, the previously hard-coded
        value). The study stops at whichever limit is reached first.

    Returns
    -------
    dict
        ``study.best_params`` of the finished study.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    # A figure that is created inside a function and then dropped is never
    # rendered by the notebook, so display it explicitly.
    optuna.visualization.plot_optimization_history(study).show()

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score} \nOptimized parameters: params = {params}")
    return params

In [None]:
def lightgbm_objective(trial):
    """Optuna objective: mean validation RMSE of a LightGBM regressor over ``kfolds``.

    Reads the notebook-level ``X``, ``y``, ``kfolds`` and ``R_SEED``.

    The roles of the two index arrays produced by ``kfolds.split`` are swapped
    on purpose: the model is fitted on the part KFold calls "test" and
    validated on the part it calls "train".

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``num_leaves``, ``min_child_samples``,
        ``feature_fraction``, ``bagging_fraction`` and ``bagging_freq``.

    Returns
    -------
    float
        Average RMSE over the folds, computed on predictions clipped to the
        observed range of ``y['loss']``.
    """
    params = {
        "objective": "rmse",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "n_estimators": 5000,  # upper bound; early stopping below cuts training short
        "learning_rate": 0.005,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 3000),
        # suggest_float replaces the deprecated suggest_uniform (same bounds).
        "feature_fraction": trial.suggest_float("feature_fraction", 0.25, 0.7),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 5),
    }

    # NOTE(review): one callback instance is shared by all folds, so the same
    # step numbers are reported again for each fold; confirm that is intended.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse', valid_name='valid_0')

    model = lgb.LGBMRegressor(**params,
                              n_jobs=-1,
                              random_state=R_SEED)

    # The clipping bounds do not depend on the fold, so compute them once.
    loss_min = y['loss'].min()
    loss_max = y['loss'].max()

    fold_rmse = []
    # !!!!!!!!!!!!!!!! intentionally swapped !!!!!!!!!!!!!!!!
    # KFold.split yields (train, test); here the first array is used for
    # validation and the second for fitting.
    for val_index, fit_index in kfolds.split(X):
        X_train, X_val = X.iloc[fit_index], X.iloc[val_index]
        y_train, y_val = y.iloc[fit_index], y.iloc[val_index]

        # NOTE(review): whether fit() accepts `verbose` / `early_stopping_rounds`
        # depends on the installed LightGBM version; confirm against the
        # environment.
        model.fit(
            X_train,
            y_train,
            eval_metric="rmse",
            eval_set=[(X_val, y_val)],
            verbose=100,
            early_stopping_rounds=300,
            callbacks=[pruning_callback])

        fold_pred = np.clip(model.predict(X_val), loss_min, loss_max)
        # sqrt(MSE) gives the RMSE without relying on the `squared=` keyword.
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, fold_pred)))

    return float(np.mean(fold_rmse))

In [None]:
# lightgbm_params = tune(lightgbm_objective)

In [None]:
# Hyper-parameters for the final model.
params_loss = dict(
    n_estimators=30000,
    learning_rate=0.001,
    min_child_samples=295,
    feature_fraction=0.2915087392510538,
    bagging_fraction=0.8549961258824171,
    bagging_freq=0,
    num_leaves=105,
)

# Final regressor: RMSE objective and metric, all available cores.
lgbm_reg = lgb.LGBMRegressor(
    objective='rmse',
    metric='rmse',
    n_jobs=-1,
    **params_loss,
)

#### Submission

In [None]:
_target = 'loss'

print(X.shape)


def _lr_schedule(iteration):
    """Learning rate for a 0-based boosting round: 0.001 for the first 20000 rounds, 0.0005 afterwards."""
    return 0.001 if iteration < 20000 else 0.0005


# A schedule function yields the same rates as the former 30000-element list,
# but stays valid if n_estimators changes (a list has to match the number of
# boosting rounds exactly).
lgbm_reg.fit(
    X,
    y,
    callbacks=[lgb.reset_parameter(learning_rate=_lr_schedule)],
)

p_s = lgbm_reg.predict(test_df)

# Submission file: ids from the sample submission plus the predicted target.
submission_s = submission_ex[['id']].copy()
submission_s[_target] = p_s
submission_s.to_csv('submission_s.csv', index=False)

submission_s.head()