In this notebook I will train a model for the "Tabular Playground Series - Jul 2021" competition with LightGBM.<br/>
This is a multi-output regression problem, and because LightGBM doesn't support multi-output targets natively, we will have to use one of the two available techniques:
1. Train each label feature separately and combine them back afterwards. This method is implemented by the `sklearn.multioutput.MultiOutputRegressor`.
2. Train each label feature separately and append the labels predicted by the previous models as new features for every subsequent model. This method is implemented by the `sklearn.multioutput.RegressorChain`.

We will also optimize the hyperparameters with `bayes_opt.BayesianOptimization`.<br/>
The problem with the `RegressorChain` is that it doesn't update the evaluation dataset, so we will have to write a wrapper class around `lightgbm.cv` to make it part of the pipeline. <br/>
Finally, the training pipeline chain will look like this:
```
BayesianOptimization --> RegressorChain --> LGBMWrapper --> lightgbm.cv
```

In [None]:
!pip install -U scikit-learn
!pip install -U bayesian-optimization
!pip install -U lightgbm

import os
import math
import datetime
import numpy as np 
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt

# Single seed shared by the NumPy generators below, `lightgbm.cv` and `BayesianOptimization`.
RANDOM_SEED = 111

# Seed NumPy's legacy global random state.
np.random.seed(RANDOM_SEED)

from numpy.random import default_rng
# Seeded Generator instance; it is not referenced in the rest of the visible notebook.
rng = default_rng(RANDOM_SEED)

from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler, OneHotEncoder, Binarizer, KBinsDiscretizer, QuantileTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

import lightgbm as lgb
from bayes_opt import BayesianOptimization

# Directory holding the competition's train.csv / test.csv.
INPUT_DIR = '/kaggle/input/tabular-playground-series-jul-2021'
# Not referenced in the visible notebook (the submission path is hard-coded).
OUTPUT_DIR = './'
# Not referenced in the visible notebook.
BATCH_SIZE = 1024

In [None]:
def season(month):
  """Map a month number to a season code.

  Returns 0 for winter (12, 1, 2), 1 for spring (3, 4, 5), 2 for summer
  (6, 7, 8) and 3 for any other value (autumn).
  """
  if month in (12, 1, 2):
    return 0
  if month in (3, 4, 5):
    return 1
  if month in (6, 7, 8):
    return 2
  return 3

def daytime(hour):
  """Return 0 (light) for hours strictly between 5 and 17, otherwise 1 (darkness)."""
  return 0 if 5 < hour < 17 else 1

# Load the competition data, using the timestamp column as the index.
train_df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='date_time')
test_df = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'), index_col='date_time')

train_df.index = pd.to_datetime(train_df.index)
test_df.index = pd.to_datetime(test_df.index)

# Split off the three regression targets; train_df keeps only the features.
labels = train_df[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]

train_df.drop(labels.columns, axis=1, inplace=True)
# Stack train and test so that every feature is engineered identically for both.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement.
total_df = pd.concat([train_df, test_df])

# Features derived from temperature and humidity.
total_df['dew_point'] = total_df['deg_C'].apply(lambda x: (17.27 * x) / (237.7 + x)) + total_df['absolute_humidity'].apply(lambda x: math.log(x))
total_df['partial_pressure'] = (total_df['deg_C'].apply(lambda x: (237.7 + x) * 286.8) * total_df['absolute_humidity']) / 100000
total_df['saturated_wvd'] = (total_df['absolute_humidity'] * 100) / total_df['relative_humidity']

# Boolean flag: low absolute humidity combined with a temperature above 23.
total_df['dt_low_absolute_humidity'] = (total_df["absolute_humidity"] < 0.25) & (23 < total_df["deg_C"])

# Calendar features extracted from the DateTime index.
total_df['dt_hour'] = [x.hour for x in total_df.index]
total_df['dt_weekday'] = [x.weekday() for x in total_df.index]
total_df['dt_month'] = [x.month for x in total_df.index]
total_df['dt_season'] = [season(x.month) for x in total_df.index]
total_df['dt_lights'] = [daytime(x.hour) for x in total_df.index]
total_df['dt_month_s'] = np.sin(np.pi * (total_df['dt_month'] - 1) / 6)
total_df['dt_month_c'] = np.cos(np.pi * (total_df['dt_month'] - 1) / 6)

# Replace the sin/cos values with the ordinal codes of their distinct values.
total_df['dt_month_s'] = total_df['dt_month_s'].astype('category').cat.codes
total_df['dt_month_c'] = total_df['dt_month_c'].astype('category').cat.codes

# Hours 8..20 (inclusive) count as working hours; weekday codes 5 and 6 as weekend.
total_df["dt_working_hours"] = total_df["dt_hour"].isin(np.arange(8, 21, 1)).astype("int")
total_df["dt_weekend"] = (total_df["dt_weekday"] >= 5).astype("int")

# Every column whose name contains 'dt_' is treated as categorical, the rest as numeric.
cat_cols = np.array([col for col in total_df.columns if 'dt_' in col])
num_cols = np.array([col for col in total_df.columns if 'dt_' not in col])
# Numeric columns first, then categorical ones.
total_cols = np.concatenate([num_cols, cat_cols])
# Positional indices of the categorical columns inside total_df.
cat_cols_idx = [np.where(total_df.columns == x)[0][0] for x in cat_cols]

After training the model we could see that `deg_C` is actually the feature with the highest impact. It also has a strict seasonality.<br/>
The `dew_point` and `partial_pressure` features are pretty useless. They are also highly correlated with the `absolute_humidity`.<br/>
The `dt_*` features behaved surprisingly well, even though they were all extracted from the DateTime index (except `dt_low_absolute_humidity`).

```
 ('deg_C', 20.0),
 ('relative_humidity', 11.9),
 ('absolute_humidity', 14.4),
 ('sensor_1', 15.7),
 ('sensor_2', 17.0),
 ('sensor_3', 19.0),
 ('sensor_4', 9.0),
 ('sensor_5', 1.0),
 ('dew_point', 4.6),
 ('partial_pressure', 4.0),
 ('saturated_wvd', 4.7),
 ('dt_low_absolute_humidity', 7.6),
 ('dt_hour', 8.1),
 ('dt_weekday', 3.8),
 ('dt_month', 11.2),
 ('dt_season', 18.0),
 ('dt_lights', 14.0),
 ('dt_month_s', 12.0),
 ('dt_month_c', 7.0),
 ('dt_working_hours', 3.0),
 ('dt_weekend', 4.0)
 ```

In [None]:
# 3x3 grid of line plots, one feature per panel, filled row by row.
fig, axes = plt.subplots(nrows=3, ncols=3)
fig.set_size_inches(16, 8)

plotted_columns = [
    'deg_C', 'absolute_humidity', 'relative_humidity',
    'dt_season', 'sensor_1', 'sensor_2',
    'sensor_3', 'dt_month_c', 'dt_month_s',
]
# Each panel is titled with the name of the column it shows.
for panel, column in zip(axes.flat, plotted_columns):
    total_df[column].plot(ax=panel, title=column)

In [None]:
# Per-feature summary table: min, max, mean, std and number of distinct values (columns 0-4).
feature_stats = [
    total_df.min(),
    total_df.max(),
    total_df.mean(),
    total_df.std(),
    total_df.nunique(),
]
pd.concat(feature_stats, axis=1)

In [None]:
# Per-target summary table: min, max, mean and number of distinct values (columns 0-3).
label_stats = [
    labels.min(),
    labels.max(),
    labels.mean(),
    labels.nunique(),
]
pd.concat(label_stats, axis=1)

The 3 target labels are highly correlated 66%/80%/88%. That's why we used `RegressorChain` and not `MultiOutputRegressor`.<br/>
The predicted labels become even more correlated - 93%

In [None]:
# Pairwise Pearson correlation between the three targets.
labels.corr(method='pearson')

Here we wrap the original `lightgbm.cv` to make it usable from the sklearn pipeline.<br/>
The competition requires the validation metric to be RMSLE, but here we use the built-in RMSE and transform the label before (log1p) and after (expm1) the execution flow.<br/>
Finally, in the prediction step we receive a predictions array with `nfold` dimensions. The simplest option is just to average them, but other options can also be applied, such as:
- median (np.median)
- geometric mean (sc.stats.mstats.gmean)
- meta-model stacking:
   - execute `Booster.predict` for X_test
   - combine nfold test predictions to matrix
   - execute `Booster.predict` for X_train
   - combine nfold train predictions in a new LGBM model with nfold features and original labels
   - train the meta-model and predict with the prepared X_test predictions from the second step
   
`stratified=True` doesn't work for regressions.<br/>
`kwargs['verbose']=-1` removes all logs and warnings, it is useful for hyperparameter optimization.

In [None]:
class LGBMWrapper(BaseEstimator):
  """sklearn-style estimator that trains a single target with `lightgbm.cv`.

  The target is transformed with `log1p` before training and predictions are
  mapped back with `expm1`, so the RMSE reported by LightGBM is computed on
  the log-transformed target.

  Parameters
  ----------
  verbose : int
      Negative values are forwarded to LightGBM as its `verbose` parameter;
      positive values are used as the evaluation logging period.
  nfold : int
      Number of cross-validation folds.

  Attributes set by `fit`
  -----------------------
  booster : the CVBooster returned by `lightgbm.cv` (one booster per fold).
  score : mean validation RMSE of the last kept boosting round.
  """

  def __init__(self, verbose, nfold):
    self.verbose = verbose
    self.nfold = nfold

  def fit(self, X, y, **kwargs):
    """Run `lightgbm.cv` on (X, log1p(y)); `kwargs` are the LightGBM parameters.

    Returns `self`, following the sklearn estimator convention.
    """
    y = np.log1p(y)
    d_train = lgb.Dataset(X, y)

    kwargs['objective'] = 'regression'

    if self.verbose < 0:
      kwargs['verbose'] = self.verbose

    # LightGBM 4.0 removed the `early_stopping_rounds` and `verbose_eval`
    # arguments of `lgb.cv`; the equivalent callbacks are used instead.
    callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)]
    if self.verbose > 0:
      callbacks.append(lgb.log_evaluation(period=self.verbose))

    cv_result = lgb.cv(kwargs, d_train, num_boost_round=10000, nfold=self.nfold, metrics='rmse',
                       stratified=False, callbacks=callbacks,
                       return_cvbooster=True, seed=RANDOM_SEED)

    # The history key is 'rmse-mean' before LightGBM 4.0 and 'valid rmse-mean'
    # afterwards, so it is located by its suffix.
    mean_key = next(key for key in cv_result if key.endswith('rmse-mean'))

    self.booster = cv_result['cvbooster']
    self.score = cv_result[mean_key][-1]
    return self

  def predict(self, X):
    """Predict with every fold booster and average the back-transformed results."""
    # CVBooster forwards `predict` to each fold booster and returns a list with
    # one prediction array per fold. The early-stopped iteration is passed
    # explicitly so the rounds trained after the best one are not used.
    fold_preds = self.booster.predict(X, num_iteration=self.booster.best_iteration)
    return np.expm1(fold_preds).mean(0)

In [None]:
# Preprocessing: numeric columns are mapped to a normal distribution and then
# discretized into 16 ordinal bins; categorical (`dt_*`) columns are one-hot encoded.
pipe_pre = Pipeline([
    ('preproc', ColumnTransformer([
      ('num', Pipeline([
          #('scale', StandardScaler()),
          ('gauss', QuantileTransformer(output_distribution="normal")),
          #('minmax', MinMaxScaler()),
          ('kbins', KBinsDiscretizer(n_bins=16, encode='ordinal'))  #strategy='uniform'
      ]), num_cols),
      #('cat', OrdinalEncoder(), cat_cols)
      # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
      # name was removed in 1.4.
      ('cat', OneHotEncoder(sparse_output=False), cat_cols)
    ], remainder='passthrough')),
    #('scale', StandardScaler())
])

# Fit on train+test together, then split the transformed matrix back by row count.
pipe_pre.fit(total_df)
total_data = pipe_pre.transform(total_df).astype('float')

n_train_rows = train_df.index.shape[0]
train_data, test_data = total_data[:n_train_rows], total_data[n_train_rows:]

Results without `OneHotEncoder`:

In [None]:
# Number of distinct values per preprocessed column.
# NOTE(review): labelling with `total_cols` requires `total_data` to have exactly one
# column per original feature, i.e. the categorical columns must not be one-hot expanded.
preprocessed_df = pd.DataFrame(total_data, index=total_df.index, columns=total_cols)
preprocessed_df.nunique().T

In [None]:
def feval(learning_rate, num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, lambda_l2, lambda_l1, min_data_in_leaf, min_sum_hessian_in_leaf, subsample):
    """Objective for `BayesianOptimization`: negative mean CV RMSE of the regressor chain.

    Each argument is a float proposed by the optimizer for the LightGBM
    parameter of the same name. Fractions are clamped to [0, 1] and
    integer-valued parameters are rounded before use.

    Returns the negated mean of the per-target CV scores, because the
    optimizer maximizes its objective while a lower RMSE is better.
    """
    lgb_space = {
        'learning_rate': float(max(min(learning_rate, 1), 0)),
        'num_leaves': int(round(num_leaves)),
        'feature_fraction': float(max(min(feature_fraction, 1), 0)),
        'bagging_fraction': float(max(min(bagging_fraction, 1), 0)),
        'max_depth': int(round(max_depth)),
        # Previously this was computed from `max_depth`, so the `max_bin`
        # search dimension was silently ignored.
        'max_bin': int(round(max_bin)),
        "lambda_l2": float(lambda_l2),
        "lambda_l1": float(lambda_l1),
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'min_sum_hessian_in_leaf': float(min_sum_hessian_in_leaf),
        # NOTE(review): LightGBM documents `subsample` as an alias of
        # `bagging_fraction`; both are passed here - confirm which is intended.
        'subsample': float(max(min(subsample, 1), 0))
    }

    lgbpwrap = RegressorChain(LGBMWrapper(verbose=-1, nfold=3))
    lgbpwrap.fit(train_data, labels, **lgb_space)
    scores = [x.score for x in lgbpwrap.estimators_]
    print('scores: ', scores)
    return -np.mean(scores)


# Search bounds (low, high) for every argument of `feval`.
fspace = dict([
    ('learning_rate', (0.01, 0.2)),
    ('num_leaves', (24, 80)),
    ('feature_fraction', (0.4, 1)),
    ('bagging_fraction', (0.8, 1)),
    ('max_depth', (5, 30)),
    ('max_bin', (20, 90)),
    ('lambda_l2', (0.0, 0.05)),
    ('lambda_l1', (0.0, 0.05)),
    ('min_data_in_leaf', (20, 80)),
    ('min_sum_hessian_in_leaf', (0, 100)),
    ('subsample', (0.01, 1.0)),
])

# 20 random initial points followed by 20 guided iterations.
optimizer = BayesianOptimization(f=feval, pbounds=fspace, random_state=RANDOM_SEED)
optimizer.maximize(init_points=20, n_iter=20)

# Best target value found together with the parameters that produced it.
best_params = optimizer.max
best_params

In [None]:
print(best_params)
params = best_params['params']
# The optimizer proposes floats; these LightGBM parameters are rounded to integers.
for int_param in ('max_bin', 'max_depth', 'min_data_in_leaf', 'num_leaves'):
    params[int_param] = int(round(params[int_param]))

In [None]:
# Hard-coded LightGBM parameters used for the final model
# (they replace the `params` dict built from the optimizer output).
params = dict(
    learning_rate=0.01,
    num_leaves=70,
    feature_fraction=0.4049,
    bagging_fraction=0.8922,
    max_depth=30,
    max_bin=28,
    min_data_in_leaf=25,
    min_sum_hessian_in_leaf=6.056,
)

# Final chain: 10-fold CV per target, evaluation logged every 1000 rounds.
lgbpwrap = RegressorChain(LGBMWrapper(verbose=1000, nfold=10))
lgbpwrap.fit(train_data, labels, **params)

# One prediction column per target for the test rows.
output_test = lgbpwrap.predict(test_data)

In [None]:
# CV score of every model in the chain (one per target).
scores = [x.score for x in lgbpwrap.estimators_]

# Gain importance of the first target's model, one row per CV fold.
fold_gain = np.asarray(lgbpwrap.estimators_[0].booster.feature_importance(importance_type='gain'))
# A single argsort yields feature indices ordered by importance, not per-feature
# ranks; averaging those across folds mixes indices of different features.
# The double argsort gives each feature its rank within the fold
# (0 = least important), which is then averaged over the folds.
fold_ranks = np.argsort(np.argsort(fold_gain, axis=1), axis=1)
features = fold_ranks.mean(0)
# NOTE(review): pairing with `total_cols` assumes the model input has one column per
# original feature, in `total_cols` order (not the case if columns are one-hot expanded).
features = list(zip(total_cols, features))

print('scores: ', scores)
features

In [None]:
# Submission frame: the timestamp column followed by one column per predicted target.
output_res = pd.DataFrame({'date_time': test_df.index.values}, index=test_df.index)
output_res[labels.columns] = output_test
# The index duplicates `date_time`, so it is not written to the file.
output_res.to_csv('./submission.csv', index=False)

In [None]:
# Correlation between the predicted targets only. The `date_time` column is
# excluded explicitly: it is not numeric and pandas 2.x no longer drops such
# columns silently in `DataFrame.corr`.
output_res[labels.columns].corr()