## Thanks to @junhyeok99 https://www.kaggle.com/junhyeok99/automl-pycaret

In [None]:
!pip install -q pycaret

# Libraries
---

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import os
import gc

from pycaret.regression import setup, compare_models, tune_model, blend_models, finalize_model, predict_model, plot_model

from sklearn.model_selection import LeaveOneGroupOut
import statsmodels.api as sm

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

In [None]:
def _target_config(pollutant, **overrides):
    """Build the settings dict for one pollutant; keyword overrides win."""
    settings = {
        'target': f'target_{pollutant}',
        'seed': 2021,
        'n_select': 4,
        'tuning': False,
        'normalize': True,
        'optimize': 'RMSE',
    }
    settings.update(overrides)
    return settings


# One independent settings dict per pollutant, keyed by pollutant name.
# Later cells add extra keys to these dicts, so they must not be shared.
CFG = {
    pollutant: _target_config(pollutant)
    for pollutant in ('carbon_monoxide', 'benzene', 'nitrogen_oxides')
}

In [None]:
def seed_everything(seed=2021):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once up front with the default value.
seed_everything()

# Datasets
---

In [None]:
# All three competition CSVs sit in the same input directory.
DATA_DIR = "../input/tabular-playground-series-jul-2021"
train, test, submission = (
    pd.read_csv(f"{DATA_DIR}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

# Discard the training row stamped exactly '2011-01-01 00:00:00' and renumber the index.
train = train[train['date_time'] != '2011-01-01 00:00:00'].reset_index(drop=True)

In [None]:
# Stack train on top of test (row labels are kept as-is, not renumbered) and
# parse the timestamp strings into datetimes.
all_df = pd.concat([train, test]).assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'])
)

# Feature engineering
---

In [None]:
def make_features(df, params):
    """Add temperature, humidity and sensor difference features to `df`.

    The frame is modified in place (new columns added, 'deg_C' removed) and
    the same object is returned.

    params is indexed as:
        params[0] -- shift sizes for the 'dt-<n>' temperature differences
        params[1] -- shift sizes for the 'abshum-<n>' / 'relhum-<n>' differences
        params[2] -- shift sizes for the 's<k>-<n>' differences of sensors 1..7

    Each difference is "value minus the value `n` rows earlier"; rows with no
    earlier value are compared against 0.
    """
    def _delta(column, step):
        series = df[column]
        return series - series.shift(periods=step, fill_value=0)

    df['deg_K'] = df['deg_C'] + 273.15

    # Relative gap between two sensors, expressed as a fraction of the second.
    for derived, upper, lower in (('sensor_6', 'sensor_2', 'sensor_5'),
                                  ('sensor_7', 'sensor_3', 'sensor_4')):
        df[derived] = (df[upper] - df[lower]) / df[lower]

    for step in params[0]:
        df[f'dt-{step}'] = _delta('deg_C', step)

    for step in params[1]:
        df[f'abshum-{step}'] = _delta('absolute_humidity', step)
        df[f'relhum-{step}'] = _delta('relative_humidity', step)

    # Covers the five raw sensors plus the two derived ones created above.
    for sensor_no in range(1, 8):
        for step in params[2]:
            df[f's{sensor_no}-{step}'] = _delta(f'sensor_{sensor_no}', step)

    df.drop(columns='deg_C', inplace=True)

    return df

In [None]:
import math

def pb_add(X):
    """Add calendar, periodic and neighbouring-row difference features to `X`.

    `X` is modified in place and also returned. It must contain a datetime
    column 'date_time' plus every column listed in `sensor_features` below.

    Columns added, in order:
        day, is_odd
        f{i}s / f{i}c    for i in 1..4: sin/cos of whole days elapsed since
                         the earliest timestamp, with a period of 365*i days
        fh{i}s / fh{i}c  for i in 1..3: sin/cos of the seconds component of
                         the elapsed time, with a period of i days
        <feature>_{k}b   value k rows LATER minus the current value
        <feature>_{k}f   value k rows EARLIER minus the current value
                         (k in 1, 4, 24, 168; missing differences become 0)

    NOTE: the 'b'/'f' suffixes read as "back"/"forward" but are the opposite
    of the direction actually used. The names are kept unchanged so existing
    column names stay stable.
    """
    X['day'] = X['date_time'].dt.weekday
    X['is_odd'] = (X['sensor_4'] < 646) & (X['absolute_humidity'] < 0.238)

    # Series.min() is vectorised and skips missing timestamps; the built-in
    # min() walks the column in Python and returns NaT when row 0 is NaT.
    elapsed = X['date_time'] - X['date_time'].min()
    elapsed_days = elapsed.dt.days
    # `.dt.seconds` is the seconds component only (whole days excluded).
    elapsed_seconds = elapsed.dt.seconds

    for i in [1, 2, 3, 4]:
        angle = elapsed_days * 2 * math.pi / (365 * i)
        X[f'f{i}s'] = np.sin(angle)
        X[f'f{i}c'] = np.cos(angle)
    for i in [1, 2, 3]:
        angle = elapsed_seconds * 2 * math.pi / (3600 * 24 * i)
        X[f'fh{i}s'] = np.sin(angle)
        X[f'fh{i}c'] = np.cos(angle)

    sensor_features = [
        'deg_K',
        'relative_humidity', 'absolute_humidity',
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7',
    ]

    offsets = [1, 4, 24, 7 * 24]
    for sensor_feature in sensor_features:
        current = X[sensor_feature]
        # shift(-k) moves later rows up: difference to the value k rows ahead.
        for offset in offsets:
            X[f'{sensor_feature}_{offset}b'] = (current.shift(-offset) - current).fillna(0)
        # shift(k) moves earlier rows down: difference to the value k rows before.
        for offset in offsets:
            X[f'{sensor_feature}_{offset}f'] = (current.shift(offset) - current).fillna(0)

    return X

In [None]:
# Calendar month of every training row; later used as the CV grouping key.
stamp = all_df['date_time'].dt
months = stamp.month.iloc[:len(train)]

# Time-of-day and day-of-week indicators. The hour is only an intermediate
# value here and is never stored as a column.
hour_of_day = stamp.hour
weekday = stamp.dayofweek
all_df['seconds'] = hour_of_day * 3600 + stamp.minute * 60 + stamp.second
all_df['working_hours'] = hour_of_day.isin(np.arange(8, 21, 1)).astype('int')
all_df['maximum_hours'] = hour_of_day.isin([8, 9, 17, 18, 19, 20]).astype('int')
all_df['is_weekend'] = (weekday >= 5).astype('int')
all_df['is_saturday'] = (weekday == 5).astype('int')
all_df['SMC'] = (all_df['absolute_humidity'] * 100) / all_df['relative_humidity']

# Difference features first, then the calendar/periodic/neighbour features.
all_df = make_features(all_df.copy(), [[3, 6], [3, 6], [1]])
all_df = pb_add(all_df.copy())

# The raw timestamp is no longer needed once the features are derived.
all_df = all_df.drop(columns='date_time')

In [None]:
# Split the column names: anything containing 'target_' is a label, the rest are inputs.
column_names = all_df.columns.tolist()
features = [name for name in column_names if 'target_' not in name]
target = [name for name in column_names if 'target_' in name]

# Move the labels onto a log(1 + y) scale; predictions are mapped back with expm1.
all_df[target] = all_df[target].apply(np.log1p)
all_df

# PyCaret
---

In [None]:
def pycaret_model(train, test, config):
    """Fit a blended PyCaret regressor on `train` and predict `test`.

    Steps: set up the experiment, keep the best `n_select` models from
    `compare_models`, optionally tune each of them, blend the candidates,
    score the blend on PyCaret's hold-out split, refit it with
    `finalize_model` and predict `test`.

    Args:
        train: frame holding the feature columns and the target column.
        test: frame to predict.
        config: dict providing 'target', 'seed', 'fold_strategy',
            'fold_groups', 'normalize', 'optimize', 'n_select' and 'tuning'.

    Returns:
        (predictions, final_model), where predictions is the 'Label' column
        returned by `predict_model` for `test`.

    NOTE(review): `silent=` and the 'Label' output column look tied to the
    PyCaret 2.x API; confirm against the installed version.
    """
    print('Setup Your Data....')
    setup(
        data=train,
        target=config['target'],
        numeric_imputation='mean',
        session_id=config['seed'],
        fold_strategy=config['fold_strategy'],
        fold_groups=config['fold_groups'],
        normalize=config['normalize'],
        silent=True,
    )

    print("Comparing Models....")
    best = compare_models(sort=config['optimize'], n_select=config['n_select'], exclude=['xgboost'])
    # With n_select == 1 a single estimator comes back instead of a list;
    # everything below needs a list.
    if not isinstance(best, list):
        best = [best]

    candidates = list(best)
    if config['tuning']:
        print("Tuning Models....")
        # Tune towards the same metric used for ranking and blending rather
        # than tune_model's default.
        candidates += [tune_model(model, optimize=config['optimize']) for model in best]

    print("Blending Models....")
    blended = blend_models(estimator_list=candidates, optimize=config['optimize'])

    # Called for its side effect: scores the blend on the hold-out split.
    predict_model(blended)

    print("Finallizing Models....")
    final_model = finalize_model(blended)

    print('Done...!!!')
    pred = predict_model(final_model, test)
    re = pred['Label']

    return re, final_model

## for Carbon Monoxide

In [None]:
# Carbon monoxide ('target_carbon_monoxide'): cross-validate with one fold per
# calendar month, train, and write back-transformed predictions to the submission.
co_cfg = CFG['carbon_monoxide']
co_cfg.update(fold_strategy=LeaveOneGroupOut(), fold_groups=months)

co_columns = features + [co_cfg['target']]
n_train = len(train)
X_train = all_df[co_columns].iloc[:n_train].copy()
X_test = all_df[co_columns].iloc[n_train:].copy()

pred_CO, model_CO = pycaret_model(X_train, X_test, co_cfg)
submission[co_cfg['target']] = np.expm1(pred_CO)

In [None]:
# Draw PyCaret's 'error' plot for the finalized carbon monoxide model.
plot_model(model_CO, plot='error')

## for Benzene

In [None]:
# Benzene ('target_benzene'): cross-validate with one fold per calendar month,
# train, and write back-transformed predictions to the submission.
benzene_cfg = CFG['benzene']
benzene_cfg.update(fold_strategy=LeaveOneGroupOut(), fold_groups=months)

benzene_columns = features + [benzene_cfg['target']]
n_train = len(train)
X_train = all_df[benzene_columns].iloc[:n_train].copy()
X_test = all_df[benzene_columns].iloc[n_train:].copy()

pred_benzene, model_benzene = pycaret_model(X_train, X_test, benzene_cfg)
submission[benzene_cfg['target']] = np.expm1(pred_benzene)

In [None]:
# Draw PyCaret's 'error' plot for the finalized benzene model.
plot_model(model_benzene, plot='error')

## for Nitrogen Oxides

In [None]:
# Nitrogen oxides ('target_nitrogen_oxides'): cross-validate with one fold per
# calendar month, train, and write back-transformed predictions to the submission.
nox_cfg = CFG['nitrogen_oxides']
nox_cfg.update(fold_strategy=LeaveOneGroupOut(), fold_groups=months)

nox_columns = features + [nox_cfg['target']]
n_train = len(train)
X_train = all_df[nox_columns].iloc[:n_train].copy()
X_test = all_df[nox_columns].iloc[n_train:].copy()

pred_NOx, model_NOx = pycaret_model(X_train, X_test, nox_cfg)
submission[nox_cfg['target']] = np.expm1(pred_NOx)

In [None]:
# Draw PyCaret's 'error' plot for the finalized nitrogen oxides model.
plot_model(model_NOx, plot='error')

# Submission
---

In [None]:
# Save the submission frame (with the three prediction columns assigned in the
# cells above) as CSV without the index column, then display it as the cell output.
submission.to_csv('submission.csv', index=False)
submission