## Thanks to @junhyeok99 https://www.kaggle.com/junhyeok99/automl-pycaret

In [None]:
!pip install -q pycaret

# Libraries
---

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import os
import gc

from pycaret.regression import setup, compare_models, tune_model, blend_models, finalize_model, predict_model, plot_model

import statsmodels.api as sm

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

In [None]:
def _pollutant_cfg(name, fold):
    """Return the PyCaret settings for one pollutant.

    All pollutants share the same options; only the target column
    (``target_<name>``) and the number of CV folds differ.
    """
    return {
        'target': f'target_{name}',
        'seed': 2021,
        'n_select': 2,
        'fold': fold,
        'fold_strategy': 'timeseries',
        'tuning': False,
        'normalize': True,
        'optimize': 'RMSE',
    }


# Per-pollutant configuration, keyed by pollutant name (insertion order:
# carbon_monoxide, benzene, nitrogen_oxides).  Each entry is its own dict.
CFG = {
    name: _pollutant_cfg(name, fold)
    for name, fold in (
        ('carbon_monoxide', 3),
        ('benzene', 3),
        ('nitrogen_oxides', 2),
    )
}

In [None]:
def seed_everything(seed=2021):
    """Make the run repeatable by seeding the global random generators.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    `seed`, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.
    """
    # NOTE(review): presumably PYTHONHASHSEED only matters for interpreters
    # started after this point (e.g. worker processes) -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once, with the default seed, as soon as the helper exists.
seed_everything()

# Datasets
---

In [None]:
def _read_indexed_csv(path):
    """Read a CSV whose `date_time` column becomes a parsed index."""
    return pd.read_csv(path, index_col="date_time", parse_dates=True)


train = _read_indexed_csv("../input/tabular-playground-series-jul-2021/train.csv")
test = _read_indexed_csv("../input/tabular-playground-series-jul-2021/test.csv")
submission = _read_indexed_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv")

# Every column whose name contains 'target_' is a prediction target; the
# targets are modelled on the log1p scale.
target = [col for col in train.columns if 'target_' in col]
train[target] = np.log1p(train[target])

# Pseudo labels: start from the sample submission and overwrite each target
# column with the values of an earlier submission file (aligned on date_time).
sub18549 = _read_indexed_csv("../input/tps07-18549/submission_.csv")
pseudo_label = submission.copy()
for _cfg in CFG.values():
    pseudo_label[_cfg['target']] = sub18549[_cfg['target']]


def _pseudo_labelled_test():
    """Return a fresh copy of `test` with the log1p pseudo-label targets attached."""
    frame = pd.concat([test, pseudo_label], axis=1)
    frame[target] = np.log1p(frame[target])
    return frame


# One independent frame per pollutant; all three hold the same content.
test_carbon_monoxide = _pseudo_labelled_test()
test_benzene = _pseudo_labelled_test()
test_nitrogen_oxides = _pseudo_labelled_test()

# Feature engineering
---

In [None]:
def make_features(df, params):
    """
    Adds calendar, humidity, sensor-ratio and lagged-difference features to a
    given dataset.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex and the columns 'deg_C',
        'absolute_humidity', 'relative_humidity' and 'sensor_1' .. 'sensor_5'.
    params : sequence of three iterables of int
        Shift periods for, in order: the temperature differences
        (``dt-<p>``), the humidity differences (``abshum-<p>``,
        ``relhum-<p>``) and the sensor differences (``s1-<p>`` .. ``s7-<p>``).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, with the new columns appended.
    """
    def lag_diff(column, periods):
        # Difference to the value `periods` rows earlier.  Rows without a
        # predecessor are compared against 0, i.e. they keep the raw value.
        series = df[column]
        return series - series.shift(periods=periods, fill_value=0)

    idx = df.index

    # Calendar features.
    df['year'] = idx.year
    df['month'] = idx.month
    # DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the same ISO week number.  It is cast to a plain
    # int64 array so the column is assigned by position with the old dtype.
    df['week'] = np.asarray(idx.isocalendar().week, dtype='int64')
    df['day'] = idx.day
    df['dayofweek'] = idx.dayofweek
    # Whole days elapsed since the earliest calendar date in the index.
    dates = idx.date
    df['time'] = [delta.days for delta in dates - dates.min()]
    df['hour'] = idx.hour

    # Hour-of-day / day-of-week indicator flags (0 or 1).
    df['working_hours'] = df['hour'].isin(np.arange(8, 21, 1)).astype('int')
    df['maximum_hours'] = df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
    df['is_weekend'] = (idx.dayofweek >= 5).astype('int')

    df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']

    # Relative differences between sensor pairs, stored as two extra "sensors".
    df['sensor_6'] = (df['sensor_2'] - df['sensor_5']) / df['sensor_5']
    df['sensor_7'] = (df['sensor_3'] - df['sensor_4']) / df['sensor_4']

    for periods in params[0]:
        df[f'dt-{periods}'] = lag_diff('deg_C', periods)

    for periods in params[1]:
        df[f'abshum-{periods}'] = lag_diff('absolute_humidity', periods)
        df[f'relhum-{periods}'] = lag_diff('relative_humidity', periods)

    for periods in params[2]:
        # sensor_1 .. sensor_7, in that order, so the column order is stable.
        for n in range(1, 8):
            df[f's{n}-{periods}'] = lag_diff(f'sensor_{n}', periods)

    return df

# PyCaret
---

In [None]:
def pycaret_model(train, test, config):
    """
    Runs the PyCaret regression workflow for one target and predicts `test`.

    Steps: setup -> compare_models -> (optional) tune_model -> blend_models
    -> finalize_model -> predict_model on `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Data passed to `setup`; must contain the column ``config['target']``.
    test : pandas.DataFrame
        Rows to predict with the finalized model.
    config : dict
        Reads the keys 'target', 'seed', 'normalize', 'fold_strategy',
        'optimize', 'n_select', 'fold' and 'tuning'.

    Returns
    -------
    tuple
        ``(predictions, final_model)`` where `predictions` is the 'Label'
        column of the frame returned by `predict_model` and `final_model` is
        the result of `finalize_model`.
    """
    print('Setup Your Data....')
    setup(
        data=train,
        target=config['target'],
        numeric_imputation='mean',
        session_id=config['seed'],
        normalize=config['normalize'],
        silent=True,
        fold_strategy=config['fold_strategy'],
    )

    print("Comparing Models....")
    best = compare_models(
        sort=config['optimize'],
        n_select=config['n_select'],
        fold=config['fold'],
        exclude=['xgboost'],
    )
    # With n_select == 1 a single estimator comes back instead of a list;
    # normalise so the list handling below works either way.
    if not isinstance(best, list):
        best = [best]

    estimators = best
    if config['tuning']:
        print("Tuning Models....")
        # Tune with the same fold count and metric as the compare/blend steps
        # instead of tune_model's own defaults.
        best_tuned = [
            tune_model(model, fold=config['fold'], optimize=config['optimize'])
            for model in best
        ]
        estimators = best + best_tuned

    print("Blending Models....")
    blended = blend_models(
        estimator_list=estimators,
        fold=config['fold'],
        optimize=config['optimize'],
    )

    # Called without data and only for its side effect; the result is unused.
    # NOTE(review): presumably this reports metrics on the hold-out split -- confirm.
    predict_model(blended)

    print(f"Finallizing Models....")
    final_model = finalize_model(blended)

    print('Done...!!!')
    pred = predict_model(final_model, test)
    re = pred['Label']

    return re, final_model

## for Carbon Monoxide

In [None]:
# 'target_carbon_monoxide'
# Fit on the training rows from months after August plus the pseudo-labelled
# test rows, then predict the test rows.
X = train.loc[train.index.month > 8].copy()

# Shift periods handed to make_features: [temperature, humidity, sensors].
params = [[3, 6], [3, 6], [1]]
all_df = make_features(pd.concat([X, test_carbon_monoxide], axis=0), params)

# The test rows are the trailing block of all_df, after the len(X) train rows.
X_train = all_df.copy()
X_test = all_df.iloc[len(X):].copy()

pred_CO, model_CO = pycaret_model(train=X_train, test=X_test, config=CFG['carbon_monoxide'])
# Targets were log1p-transformed before training, so map predictions back.
submission[CFG['carbon_monoxide']['target']] = np.expm1(pred_CO)

In [None]:
# Draw pycaret's 'error' plot for the finalized carbon monoxide model.
plot_model(model_CO, plot='error')

## for Benzene

In [None]:
# 'target_benzene'
# Fit on the training rows from months after August plus the pseudo-labelled
# test rows, then predict the test rows.
X = train.loc[train.index.month > 8].copy()

# Shift periods handed to make_features: [temperature, humidity, sensors].
params = [[3, 6], [3, 6], [1]]
all_df = make_features(pd.concat([X, test_benzene], axis=0), params)

# The test rows are the trailing block of all_df, after the len(X) train rows.
X_train = all_df.copy()
X_test = all_df.iloc[len(X):].copy()

pred_benzene, model_benzene = pycaret_model(train=X_train, test=X_test, config=CFG['benzene'])
# Targets were log1p-transformed before training, so map predictions back.
submission[CFG['benzene']['target']] = np.expm1(pred_benzene)

In [None]:
# Draw pycaret's 'error' plot for the finalized benzene model.
plot_model(model_benzene, plot='error')

## for Nitrogen Oxides

In [None]:
# 'target_nitrogen_oxides'
# Fit on the training rows from months after August plus the pseudo-labelled
# test rows, then predict the test rows.
X = train.loc[train.index.month > 8].copy()

# Shift periods handed to make_features: [temperature, humidity, sensors].
params = [[3, 6], [3, 6], [1]]
all_df = make_features(pd.concat([X, test_nitrogen_oxides], axis=0), params)

# The test rows are the trailing block of all_df, after the len(X) train rows.
X_train = all_df.copy()
X_test = all_df.iloc[len(X):].copy()

pred_NOx, model_NOx = pycaret_model(train=X_train, test=X_test, config=CFG['nitrogen_oxides'])
# Targets were log1p-transformed before training, so map predictions back.
submission[CFG['nitrogen_oxides']['target']] = np.expm1(pred_NOx)

In [None]:
# Draw pycaret's 'error' plot for the finalized nitrogen oxides model.
plot_model(model_NOx, plot='error')

# Submission
---

In [None]:
# Save the submission frame (its three target columns were filled in by the
# per-pollutant cells) to 'submission.csv' in the working directory.
submission.to_csv('submission.csv')
# Bare expression as the last statement of the cell.
# NOTE(review): presumably there so the notebook displays the frame -- confirm.
submission