## ENSEMBLE STACKING META-REGRESSOR

<div align = 'center'> <img src ="http://rasbt.github.io/mlxtend/user_guide/regressor/StackingRegressor_files/stackingregression_overview.png"/></div>

---

In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import random, os
import math

from mlxtend.regressor import StackingCVRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score

import warnings
warnings.simplefilter('ignore')

RANDOM_SEED = 42


def seed_everything(seed=42):
    """Seed the global Python and NumPy random generators.

    The seed is also stored, as a string, in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(RANDOM_SEED)

### DATA PREPARATION
___

In [None]:
# Competition files: training set, test set and the submission template.
# NOTE(review): with no index column configured, parse_dates=True presumably
# leaves `date_time` as text; it is converted explicitly later on - confirm.
train_data = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/train.csv',
    parse_dates=True,
)
test_data = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/test.csv',
    parse_dates=True,
)
sample_sub = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/sample_submission.csv'
)

targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# The models are fitted on log1p(target); predictions are mapped back with
# expm1 before they are written to the submission.
train_data[targets] = np.log1p(train_data[targets])

# For each target, the list of the other two target columns.
targets_and_drop = {
    name: [other for other in targets if other != name]
    for name in targets
}

### NEW FEATURES
___

In [None]:
def pb_add(X):
    """Append calendar, seasonal and sensor-difference features to ``X``.

    ``X`` is modified in place and the same object is returned.

    Columns added:

    * ``day`` - weekday number of ``date_time``.
    * ``is_odd`` - True where ``sensor_4 < 646`` and
      ``absolute_humidity < 0.238``.
    * ``f{k}s`` / ``f{k}c`` (k = 1..4) - sine / cosine of the number of whole
      days elapsed since the earliest timestamp, with a period of
      ``365 * k`` days.
    * ``fh{k}s`` / ``fh{k}c`` (k = 1..3) - sine / cosine of the seconds
      component of that elapsed time, scaled by a period of ``k`` days.
    * ``<sensor>_{n}b`` - value ``n`` rows later minus the current value.
    * ``<sensor>_{n}f`` - value ``n`` rows earlier minus the current value.

    ``n`` takes the values 1, 4, 24 and 168; rows without the required
    neighbour get 0. The ``b``/``f`` suffixes are kept exactly as they were
    so existing column names stay stable, even though ``b`` looks at later
    rows and ``f`` at earlier rows.

    Args:
        X: DataFrame holding a datetime ``date_time`` column and the eight
            columns listed in ``sensor_features``. The differences are pure
            row shifts, so rows are presumably expected in chronological
            order at a fixed spacing - TODO confirm with the caller.

    Returns:
        The same DataFrame ``X`` with the new columns appended.
    """
    timestamps = X['date_time']
    X['day'] = timestamps.dt.weekday
    X['is_odd'] = (X['sensor_4'] < 646) & (X['absolute_humidity'] < 0.238)

    # Series.min() is vectorised; the builtin min() walks the column one
    # Timestamp at a time in Python.
    elapsed = timestamps - timestamps.min()
    trend = elapsed.dt.days
    seconds = elapsed.dt.seconds
    two_pi = 2 * math.pi

    # Yearly and multi-year seasonality.
    for k in (1, 2, 3, 4):
        angle = two_pi * trend / (365 * k)
        X[f'f{k}s'] = np.sin(angle)
        X[f'f{k}c'] = np.cos(angle)

    # Time-of-day terms.
    # NOTE(review): `.dt.seconds` is the within-day component, so the k = 2
    # and k = 3 terms presumably restart every day rather than cycling over
    # 2 or 3 days - confirm this is intended.
    for k in (1, 2, 3):
        angle = two_pi * seconds / (3600 * 24 * k)
        X[f'fh{k}s'] = np.sin(angle)
        X[f'fh{k}c'] = np.cos(angle)

    sensor_features = [
        'deg_C',
        'relative_humidity', 'absolute_humidity',
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    ]

    offsets = (1, 4, 24, 7 * 24)
    for sensor_feature in sensor_features:
        current = X[sensor_feature]

        # "b" columns: shift(-n) pulls the value from n rows further down.
        for offset in offsets:
            X[f'{sensor_feature}_{offset}b'] = (
                current.shift(-offset) - current
            ).fillna(0)

        # "f" columns: shift(+n) pulls the value from n rows further up.
        for offset in offsets:
            X[f'{sensor_feature}_{offset}f'] = (
                current.shift(offset) - current
            ).fillna(0)

    return X

In [None]:
# Train and test rows are stacked into one frame, so every feature below -
# including the row-shift differences built by pb_add - is computed over the
# combined data.
all_data = pd.concat([train_data, test_data])
all_data['date_time'] = pd.to_datetime(all_data['date_time'])
when = all_data['date_time']
months = when.dt.month[:len(train_data)]

# Calendar flags derived directly from the timestamp.
all_data["working_hours"] = when.dt.hour.isin(np.arange(8, 21, 1)).astype("int")
all_data["is_weekend"] = (when.dt.dayofweek > 4).astype("int")
all_data['hr'] = when.dt.hour * 60 + when.dt.minute
all_data['satday'] = (when.dt.weekday == 5).astype("int")

# Ratio of absolute to relative humidity, scaled by 100.
all_data["SMC"] = all_data["absolute_humidity"] * 100 / all_data["relative_humidity"]

# pb_add needs the timestamp; once its features exist the raw column goes.
all_data = pb_add(all_data.copy()).drop(columns='date_time')

### DATA SPLIT
___

In [None]:
# Undo the earlier concatenation: the first len(train_data) rows are the
# training rows, the rest are test rows (which carry no usable targets).
n_train = len(train_data)
train = all_data[:n_train]
test = all_data[n_train:].drop(columns=targets)

### ENSEMBLE STACKING META-REGRESSOR
___

## MODEL LEARNING

I decided to train a separate model for each target, even though the code is almost identical for now. The models will soon diverge, because I plan to optimize each one for its own target.

In [None]:
# Seeds for the repeated fits below; each target's test predictions are
# averaged over one stacked model per seed.
rand_states = [2021, 1998, 42, 123]

#### Carbon monoxide
___

In [None]:
print("\nTraining regressors for carbon monoxide")

target = 'target_carbon_monoxide'

# The feature matrix and label vector are the same for every seed and every
# estimator, so build them once instead of re-dropping the target columns on
# each cross-validation call and fit.
X_train = train.drop(targets, axis=1)
y_train = train[target]

# Running sum of the back-transformed test predictions, one row per
# submission row; divided by the number of seeds after the loop.
yhat = np.zeros((sample_sub.shape[0], 1))

for rand in rand_states:
    print(f"\nRandom state {rand} ")
    seed_everything(rand)

    m_lgb = lgb.LGBMRegressor(seed=rand)
    m_ctb = cbt.CatBoostRegressor(random_seed=rand, verbose=False)
    m_hgbr = HistGradientBoostingRegressor(random_state=rand)

    # NOTE(review): BayesianRidge's `normalize` argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2; on newer versions this line raises
    # TypeError and needs an explicit scaling step - confirm the environment.
    stack = StackingCVRegressor(
        regressors=(m_lgb, m_ctb, m_hgbr),
        meta_regressor=BayesianRidge(normalize=True),
    )

    regressors = ['LightGBM', 'CatBoost', 'HistGradientBoostingRegressor', 'StackingCVRegressor']

    # Diagnostic only: the scores are printed, not used for model selection.
    # They are measured on the log1p-transformed target.
    for clf, label in zip([m_lgb, m_ctb, m_hgbr, stack], regressors):
        scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
        print("  - Neg. MSE Score: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

    stack.fit(X_train, y_train)

    # The target was log1p-transformed before training, so invert with expm1
    # before accumulating.
    yhat += np.expm1(stack.predict(test)).reshape(-1, 1)

sample_sub[target] = yhat / len(rand_states)

#### Benzene
___

In [None]:
print("\nTraining regressors for benzene")

target = 'target_benzene'

# The feature matrix and label vector are the same for every seed and every
# estimator, so build them once instead of re-dropping the target columns on
# each cross-validation call and fit.
X_train = train.drop(targets, axis=1)
y_train = train[target]

# Running sum of the back-transformed test predictions, one row per
# submission row; divided by the number of seeds after the loop.
yhat = np.zeros((sample_sub.shape[0], 1))

for rand in rand_states:
    print(f"\nRandom state {rand} ")
    seed_everything(rand)

    m_lgb = lgb.LGBMRegressor(seed=rand)
    m_ctb = cbt.CatBoostRegressor(random_seed=rand, verbose=False)
    m_hgbr = HistGradientBoostingRegressor(random_state=rand)

    # NOTE(review): BayesianRidge's `normalize` argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2; on newer versions this line raises
    # TypeError and needs an explicit scaling step - confirm the environment.
    stack = StackingCVRegressor(
        regressors=(m_lgb, m_ctb, m_hgbr),
        meta_regressor=BayesianRidge(normalize=True),
    )

    regressors = ['LightGBM', 'CatBoost', 'HistGradientBoostingRegressor', 'StackingCVRegressor']

    # Diagnostic only: the scores are printed, not used for model selection.
    # They are measured on the log1p-transformed target.
    for clf, label in zip([m_lgb, m_ctb, m_hgbr, stack], regressors):
        scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
        print("  - Neg. MSE Score: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

    stack.fit(X_train, y_train)

    # The target was log1p-transformed before training, so invert with expm1
    # before accumulating.
    yhat += np.expm1(stack.predict(test)).reshape(-1, 1)

sample_sub[target] = yhat / len(rand_states)

#### Nitrogen oxides
___

In [None]:
print("\nTraining regressors for nitrogen oxides")

target = 'target_nitrogen_oxides'

# The feature matrix and label vector are the same for every seed and every
# estimator, so build them once instead of re-dropping the target columns on
# each cross-validation call and fit.
X_train = train.drop(targets, axis=1)
y_train = train[target]

# Running sum of the back-transformed test predictions, one row per
# submission row; divided by the number of seeds after the loop.
yhat = np.zeros((sample_sub.shape[0], 1))

for rand in rand_states:
    print(f"\nRandom state {rand} ")
    seed_everything(rand)

    m_lgb = lgb.LGBMRegressor(seed=rand)
    m_ctb = cbt.CatBoostRegressor(random_seed=rand, verbose=False)
    m_hgbr = HistGradientBoostingRegressor(random_state=rand)

    # NOTE(review): BayesianRidge's `normalize` argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2; on newer versions this line raises
    # TypeError and needs an explicit scaling step - confirm the environment.
    stack = StackingCVRegressor(
        regressors=(m_lgb, m_ctb, m_hgbr),
        meta_regressor=BayesianRidge(normalize=True),
    )

    regressors = ['LightGBM', 'CatBoost', 'HistGradientBoostingRegressor', 'StackingCVRegressor']

    # Diagnostic only: the scores are printed, not used for model selection.
    # They are measured on the log1p-transformed target.
    for clf, label in zip([m_lgb, m_ctb, m_hgbr, stack], regressors):
        scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
        print("  - Neg. MSE Score: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

    stack.fit(X_train, y_train)

    # The target was log1p-transformed before training, so invert with expm1
    # before accumulating.
    yhat += np.expm1(stack.predict(test)).reshape(-1, 1)

sample_sub[target] = yhat / len(rand_states)

### SUBMISSION
___

In [None]:
# Write the filled-in submission template without the index column.
sample_sub.to_csv("tps-ens-meta-010.csv", index = False)
# Last expression of the cell: shows the first ten rows as a quick check.
sample_sub.head(10)