# Credits
---
[CatBoost_13Feature_Cross_Validation](https://www.kaggle.com/andy6804tw/catboost-13feature-cross-validation)  
[CatBoost_14Feature_Cross_Validation + FE](https://www.kaggle.com/paddykb/catboost-14feature-cross-validation-fe)  

# Libraries
---

In [None]:
import pandas as pd  
import numpy as np
import random
import math
import os

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide configuration. 'seed' is passed to seed_everything() and to
# every model constructor in the training cell.
CFG = dict(seed=2021)

In [None]:
def seed_everything(seed=42):
    """Seed Python's ``random`` module, ``PYTHONHASHSEED`` and NumPy's global RNG.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable. Defaults to 42.

    NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    start-up, so the assignment below likely affects child processes rather
    than the running kernel -- confirm.
    """
    random.seed(seed)

    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text

    np.random.seed(seed)

# Seed the RNGs once, up front, with the notebook-wide seed from CFG.
seed_everything(CFG['seed'])

# Datasets
---

In [None]:
# Every CSV is read with its `date_time` column parsed and used as the index.
csv_options = dict(index_col="date_time", parse_dates=True)

train = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv", **csv_options)
test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv", **csv_options)
submission = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv", **csv_options)

# Discard any training row stamped exactly 2011-01-01 00:00:00.
train = train.loc[train.index != '2011-01-01 00:00:00']

In [None]:
# Stack train on top of test so feature engineering runs over both at once;
# the first len(train) rows of all_df are the training rows.
all_df = pd.concat([train, test])

# Feature engineering
---

In [None]:
def make_features(df, params):
    """Add temperature, sensor-ratio and differencing columns to ``df`` in place.

    Args:
        df: DataFrame containing ``deg_C``, ``absolute_humidity``,
            ``relative_humidity`` and ``sensor_1`` .. ``sensor_5``. It is
            mutated by this function.
        params: Indexable holding three iterables of shift periods:
            ``params[0]`` is applied to ``deg_C``, ``params[1]`` to both
            humidity columns and ``params[2]`` to ``sensor_1`` .. ``sensor_7``.

    Returns:
        The same ``df`` object, with the new columns appended and ``deg_C``
        removed.
    """

    def delta(column, periods):
        # Positions shifted in from outside the frame are filled with 0, so
        # those rows end up holding the raw value rather than a difference.
        values = df[column]
        return values - values.shift(periods=periods, fill_value=0)

    df['deg_K'] = df['deg_C'] + 273.15

    # sensor_6 / sensor_7: relative difference of one sensor w.r.t. another.
    ratio_specs = (
        ('sensor_6', 'sensor_2', 'sensor_5'),
        ('sensor_7', 'sensor_3', 'sensor_4'),
    )
    for new_column, numerator, reference in ratio_specs:
        df[new_column] = (df[numerator] - df[reference]) / df[reference]

    for periods in params[0]:
        df[f'dt-{periods}'] = delta('deg_C', periods)

    for periods in params[1]:
        df[f'abshum-{periods}'] = delta('absolute_humidity', periods)
        df[f'relhum-{periods}'] = delta('relative_humidity', periods)

    # Covers the five raw sensors plus the two derived ones created above.
    for sensor_no in range(1, 8):
        for periods in params[2]:
            df[f's{sensor_no}-{periods}'] = delta(f'sensor_{sensor_no}', periods)

    # Only the Kelvin temperature is kept.
    df.drop(columns='deg_C', inplace=True)
    return df

In [None]:
def pb_add(X):
    """Add calendar, Fourier and neighbouring-row difference columns to ``X`` in place.

    Args:
        X: DataFrame with a datetime index that contains ``deg_K``,
            ``relative_humidity``, ``absolute_humidity`` and
            ``sensor_1`` .. ``sensor_7``. It is mutated by this function.

    Returns:
        The same ``X`` object with these columns appended, in order:
        ``day``, ``is_odd``, ``f{1..4}s/c``, ``fh{1..3}s/c`` and, per sensor
        feature, ``<name>_<k>b`` then ``<name>_<k>f`` for k in 1, 4, 24, 168.
    """
    timestamps = X.index
    X['day'] = timestamps.weekday
    X['is_odd'] = (X['sensor_4'] < 646) & (X['absolute_humidity'] < 0.238)

    # Time elapsed since the earliest timestamp in the frame.
    elapsed = timestamps - timestamps.min()
    day_phase = elapsed.days * 2 * math.pi
    second_phase = elapsed.seconds * 2 * math.pi

    # Sine/cosine pairs of the whole-day count with periods of 365*k days.
    for k in (1, 2, 3, 4):
        angle = day_phase / (365 * k)
        X[f'f{k}s'] = np.sin(angle)
        X[f'f{k}c'] = np.cos(angle)

    # Sine/cosine pairs of the seconds component with periods of k days.
    for k in (1, 2, 3):
        angle = second_phase / (3600 * 24 * k)
        X[f'fh{k}s'] = np.sin(angle)
        X[f'fh{k}c'] = np.cos(angle)

    sensor_features = [
        'deg_K',
        'relative_humidity', 'absolute_humidity',
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
    ]
    offsets = (1, 4, 24, 7 * 24)

    # Suffix 'b' uses shift(-k): row t gets (value at row t+k) - (value at t).
    # Suffix 'f' uses shift(+k): row t gets (value at row t-k) - (value at t).
    # Rows with no neighbour at that distance are set to 0.
    directions = (('b', -1), ('f', 1))
    for name in sensor_features:
        current = X[name]
        for suffix, sign in directions:
            for k in offsets:
                neighbour = X[name].shift(sign * k)
                X[f'{name}_{k}{suffix}'] = (neighbour - current).fillna(0)

    return X

In [None]:
# Calendar month of each training row; used later as the group labels for
# the leave-one-group-out folds.
months = pd.Series(all_df.index.month[:len(train)])

timestamps = all_df.index
day_of_week = timestamps.dayofweek

# 'hour' is only a scratch column for the flags below; it is removed again.
all_df["hour"] = timestamps.hour
all_df['seconds'] = all_df['hour'] * 3600 + timestamps.minute * 60 + timestamps.second
all_df["working_hours"] = all_df["hour"].isin(np.arange(8, 21, 1)).astype("int")
all_df['maximum_hours'] = all_df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
all_df["is_weekend"] = (day_of_week >= 5).astype("int")
all_df['is_saturday'] = (day_of_week == 5).astype("int")
all_df["SMC"] = (all_df["absolute_humidity"] * 100) / all_df["relative_humidity"]
all_df = all_df.drop(columns='hour')

# Shift periods for, in order: temperature, humidity columns, sensor columns.
shift_periods = [[3, 6], [3, 6], [1]]
all_df = make_features(all_df.copy(), shift_periods)
all_df = pb_add(all_df.copy())

In [None]:
# Partition the columns: anything containing 'target_' is a prediction
# target, everything else is a model input.
features, target = [], []
for col in all_df.columns.tolist():
    if 'target_' in col:
        target.append(col)
    else:
        features.append(col)

# Yeo-Johnson power transform with standardisation, fitted on the train and
# test rows together and written back over the feature columns.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
all_df[features] = pt.fit_transform(all_df[features])

# The first len(train) rows of all_df are the training rows.
n_train = len(train)
feature_frame = all_df[features]
X = feature_frame.iloc[:n_train].copy()
# Targets are modelled on the log1p scale.
y = np.log1p(all_df[target].iloc[:n_train]).copy()
X_test = feature_frame.iloc[n_train:]

# VotingRegressor
---

In [None]:
# Per-target VotingRegressor weights, in estimator order:
# lgb, xgb, cbt, etr, gbr, rfr, hgb. All three targets currently share the
# same values; each gets its own list so one can be tuned independently.
weights = {
    target_column: [1.0, -0.3, 2.0, 0.7, 0.6, 0.4, -0.2]
    for target_column in (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
}

In [None]:
%%time

def build_voting_regressor(model_weights):
    """Return an unfitted VotingRegressor over the seven base regressors.

    Args:
        model_weights: Sequence of seven weights, in the order
            lgb, xgb, cbt, etr, gbr, rfr, hgb.

    Returns:
        A fresh ``VotingRegressor`` whose base models all use ``CFG['seed']``.
    """
    seed = CFG['seed']
    estimators = [
        ('lgb', lgb.LGBMRegressor(random_state=seed, n_jobs=-1)),
        ('xgb', xgb.XGBRegressor(random_state=seed, n_jobs=-1)),
        ('cbt', cbt.CatBoostRegressor(random_seed=seed, verbose=False)),
        ('etr', ExtraTreesRegressor(random_state=seed, n_jobs=-1)),
        ('gbr', GradientBoostingRegressor(random_state=seed)),
        ('rfr', RandomForestRegressor(random_state=seed)),
        ('hgb', HistGradientBoostingRegressor(random_state=seed)),
    ]
    return VotingRegressor(estimators=estimators, weights=model_weights, n_jobs=-1)


# One fold per distinct calendar month of the training data.
n_splits = months.nunique()
logo = LeaveOneGroupOut()

# The loop variable is deliberately not called `target`, so the module-level
# `target` list of column names is not overwritten by a string.
for target_name in y.columns:
    print(f"===== {target_name} =====")

    # Out-of-fold predictions (log1p scale) and fold-averaged test predictions.
    oof = np.zeros(X.shape[0])
    pred = np.zeros(X_test.shape[0])

    for fold, (trn_idx, val_idx) in enumerate(logo.split(X, y, months)):
        X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_valid = y.iloc[trn_idx][target_name], y.iloc[val_idx][target_name]

        model = build_voting_regressor(weights[target_name])
        model.fit(X_train, y_train)

        # y is log1p-transformed, so test predictions are mapped back with
        # expm1 before being averaged over the folds.
        pred += np.expm1(model.predict(X_test)) / n_splits
        oof[val_idx] = model.predict(X_valid)

        # RMSE on the log1p scale, i.e. RMSLE. np.sqrt(MSE) is used instead of
        # mean_squared_error(squared=False) because that keyword was
        # deprecated and later removed from scikit-learn.
        fold_rmsle = np.sqrt(mean_squared_error(y_valid, oof[val_idx]))
        print(f"FOLD {fold} rmsle {fold_rmsle:.6f}")

    print("-"*40)
    overall_rmsle = np.sqrt(mean_squared_error(y[target_name], oof))
    print(f"rmsle {overall_rmsle:.6f}")

    submission[target_name] = pred

# Submission
---

In [None]:
# Write the filled-in submission frame (indexed by date_time) to
# submission.csv in the working directory.
submission.to_csv('submission.csv')
# Bare expression as the cell's last line: shows the frame as cell output.
submission