# Libraries
---

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 100 rows / 100 columns when a DataFrame is displayed.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (pandas,
# scikit-learn, ...) — consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [None]:
# Run-wide settings.
#   debug: when True, fit on months 9-11 only, hold out month 12 and print
#          per-target validation scores; when False, fit on every month > 8.
#   seed:  value handed to seed_everything() and to each model's seed argument.
CFG = dict(
    debug=False,
    seed=2021,
)

In [None]:
def seed_everything(seed=42):
    """
    Seeds the random number generators used in this notebook

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator with
    ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the RNGs with the configured seed before any data is loaded or models are built.
seed_everything(CFG['seed'])

# Datasets
---

In [None]:
PATH = "../input/tabular-playground-series-jul-2021/"

# All three competition files are indexed by their parsed `date_time` column.
_read_opts = {"index_col": "date_time", "parse_dates": True}
train = pd.read_csv(f"{PATH}train.csv", **_read_opts)
test = pd.read_csv(f"{PATH}test.csv", **_read_opts)
submission = pd.read_csv(f"{PATH}sample_submission.csv", **_read_opts)

# Split the raw training columns by name: anything containing 'target_' is a
# label column, everything else is a model input.
target = [col for col in train.columns if 'target_' in col]
features = [col for col in train.columns if col not in target]

# Train on log1p-transformed targets; predictions are mapped back with expm1
# before they are written to the submission frame.
train[target] = np.log1p(train[target])

# Feature engineering
---

In [None]:
def make_features(df, params):
    """
    Adds new features to a given dataset

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex (without NaT) and the columns ``deg_C``,
        ``relative_humidity``, ``absolute_humidity`` and ``sensor_1`` ..
        ``sensor_5``.
    params : sequence
        ``params[0]`` - lags (in rows) for the ``deg_C`` differences,
        ``params[1]`` - lags for the absolute/relative humidity differences,
        ``params[2]`` - lags for the ``sensor_1`` .. ``sensor_7`` differences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the calendar, ratio and lag-difference
        columns added.
    """
    idx = df.index

    # Calendar features taken from the index.
    df['year'] = idx.year
    df['month'] = idx.month
    # DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the documented replacement (same ISO week number).
    df['week'] = idx.isocalendar().week.to_numpy(dtype='int64')
    df['day'] = idx.day
    df['dayofweek'] = idx.dayofweek
    # Whole calendar days elapsed since the earliest date in *this* frame.
    day_offsets = idx.date - idx.date.min()
    df['time'] = [offset.days for offset in day_offsets]
    df['hour'] = idx.hour

    # Hour-of-day and weekend indicator flags (0/1).
    df['working_hours'] = df['hour'].isin(np.arange(8, 21, 1)).astype('int')
    df['maximum_hours'] = df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
    df['is_weekend'] = (idx.dayofweek >= 5).astype('int')

    # Ratio features built from the raw readings.
    df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']
    df['sensor_6'] = (df['sensor_2'] - df['sensor_5']) / df['sensor_5']
    df['sensor_7'] = (df['sensor_3'] - df['sensor_4']) / df['sensor_4']

    # Lag differences: current value minus the value `periods` rows earlier.
    # shift() pads with 0, so the first `periods` rows hold the raw value itself.
    for periods in params[0]:
        df[f'dt-{periods}'] = df['deg_C'] - df['deg_C'].shift(periods=periods, fill_value=0)

    humidity_columns = (('abshum', 'absolute_humidity'), ('relhum', 'relative_humidity'))
    for periods in params[1]:
        for prefix, column in humidity_columns:
            df[f'{prefix}-{periods}'] = df[column] - df[column].shift(periods=periods, fill_value=0)

    for periods in params[2]:
        # sensor_6 and sensor_7 are the derived ratios created above.
        for number in range(1, 8):
            column = f'sensor_{number}'
            df[f's{number}-{periods}'] = df[column] - df[column].shift(periods=periods, fill_value=0)

    return df

In [None]:
# Lag settings handed to make_features:
# [deg_C lags, humidity lags, sensor lags], all measured in rows.
LAG_PARAMS = [[3, 6], [3, 6], [1]]
train = make_features(train, LAG_PARAMS)
test = make_features(test, LAG_PARAMS)

# Standardise the model inputs with statistics learned on the training frame only.
# NOTE(review): `features` was built from the raw CSV columns before
# make_features ran, so none of the engineered columns are scaled here or
# passed to the models below. If that is unintended, rebuild `features`
# after this step — TODO confirm.
scaler = StandardScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

# VotingRegressor
---

In [None]:
# Row masks are derived from the month number of the training index.
train_months = train.index.month

if CFG['debug']:
    # Debug run: fit on months 9-11 and keep month 12 as a hold-out set.
    fit_rows = (train_months > 8) & (train_months < 12)
    holdout_rows = train_months == 12
    X_valid = train.loc[holdout_rows, features].reset_index(drop=True)
    y_valid = train.loc[holdout_rows, target].reset_index(drop=True)
else:
    # Full run: fit on every row whose month is after August.
    fit_rows = train_months > 8

X_train = train.loc[fit_rows, features].reset_index(drop=True)
y_train = train.loc[fit_rows, target].reset_index(drop=True)

X_test = test[features].reset_index(drop=True)

In [None]:
# One validation score per target column; only filled in when CFG['debug'] is on.
rmsle = np.zeros((len(target)))

# Fit an independent voting ensemble for each target column.
for i, col in enumerate(target):
    model1 = lgb.LGBMRegressor(seed=CFG['seed'])
    # model2 = xgb.XGBRegressor(seed=CFG['seed'])
    model3 = cbt.CatBoostRegressor(random_seed=CFG['seed'], verbose=False)
    # model4 = LinearRegression(normalize=True, n_jobs=-1)
    # model5 = BayesianRidge(normalize=True)
    model6 = ExtraTreesRegressor(random_state=CFG['seed'], n_jobs=-1)
    # model7 = RandomForestRegressor(random_state=CFG['seed'], n_jobs=-1)
    model8 = HistGradientBoostingRegressor(random_state=CFG['seed'])

    # No weights are given, so the four active estimators are averaged equally.
    model = VotingRegressor(
        estimators=[
            ('lgb', model1),
            # ('xgb', model2),
            ('cbt', model3),
            # ('lrg', model4),
            # ('bay', model5),
            ('etr', model6),
            # ('rfr', model7),
            ('hgb', model8)
        ],
    )

    model.fit(X_train, y_train[col])
    if CFG['debug']:
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; taking the square root of the MSE works on every version.
        # The targets were log1p-transformed at load time, so this RMSE on
        # the transformed scale is the RMSLE of the raw values.
        rmsle[i] = np.sqrt(mean_squared_error(y_valid[col], model.predict(X_valid)))
        print(f"{col} rmsle: {rmsle[i]:.6f}")

    # Undo the log1p transform before storing the test predictions.
    submission[col] = np.expm1(model.predict(X_test))

if CFG['debug']:
    print("-"*20)
    print(f"rmsle: {np.mean(rmsle):.6f}")

# Submission
---

In [None]:
# Write the predictions to submission.csv; the date_time index is written as the first column.
submission.to_csv("submission.csv")
# Bare expression so the notebook displays the frame as the cell output.
submission