In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn defaults: 16x8 inch figures with fonts scaled by 1.3. The matplotlib
# style sheet is applied afterwards, so its settings win wherever they overlap.
sns.set(font_scale=1.3, rc={'figure.figsize': (16, 8)})
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split

import warnings
# Silence every warning category for the rest of the session
# (this also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [None]:
# All three competition files live in the same input directory; they are read
# in the order train, test, sample_submission.
input_dir = '../input/tabular-playground-series-jul-2021'
train, test, sub = (
    pd.read_csv(f'{input_dir}/{file_stem}.csv')
    for file_stem in ('train', 'test', 'sample_submission')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the training frame: column dtypes and non-null counts.
train.info()

In [None]:
# Exploratory data analysis (EDA)

In [None]:
def cal_time(df):
    """Add calendar feature columns derived from ``df['date_time']``.

    The frame is modified in place: ``date_time`` is converted to datetime and
    the columns ``year``, ``month``, ``week``, ``day``, ``dayofweek`` and
    ``hour`` are added (or overwritten).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # Every feature is read from the frame that was passed in, never from a
    # notebook-level variable, so the function works for any frame.
    df['date_time'] = pd.to_datetime(df['date_time'])
    stamps = df['date_time'].dt

    df['year'] = stamps.year
    df['month'] = stamps.month

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` gives the same ISO week number. It comes back as
    # a nullable UInt32, so cast to what ``.dt.week`` used to produce:
    # int64 when every timestamp parsed, float64 (NaN) otherwise.
    iso_week = stamps.isocalendar().week
    df['week'] = iso_week.astype('int64' if iso_week.notna().all() else 'float64')

    df['day'] = stamps.day
    df['dayofweek'] = stamps.dayofweek
    df['hour'] = stamps.hour
    return df

In [None]:
# Add the calendar feature columns to the training frame
# (cal_time assigns them onto the frame and returns it).
train = cal_time(train)

In [None]:
# Pairwise correlation of the numeric columns, annotated to two decimals.
# The frame also holds the `date_time` column, which is not numeric: older
# pandas dropped such columns from corr() silently, newer pandas raises on
# them, so the numeric columns are selected explicitly.
numeric_train = train.select_dtypes(include='number')

plt.figure(figsize=(15,15))
sns.heatmap(numeric_train.corr(), cmap=plt.cm.Blues, vmin=-1, vmax=1, cbar_kws={'shrink': .8}, square=True, 
            annot=True, fmt='.2f', linewidths=.8)
plt.show()

In [None]:
# Mean target level per month, day of month and hour.
# Grid rows are time units (each with its own line colour), columns are targets.
time_units = [('month', 'Month', 'r'), ('day', 'Day', 'b'), ('hour', 'Hour', 'y')]
pollutants = [
    ('target_carbon_monoxide', 'CO'),
    ('target_benzene', 'Benzene'),
    ('target_nitrogen_oxides', 'NOx'),
]

fig, ax = plt.subplots(3, 3, figsize = (18,10))

for row, (unit_col, unit_label, line_fmt) in enumerate(time_units):
    for col, (target_col, short_name) in enumerate(pollutants):
        panel = ax[row, col]
        panel.plot(train.groupby(unit_col)[target_col].mean(), line_fmt)
        panel.set_title(f'{unit_label}-{short_name}')

fig.tight_layout()
plt.show()

In [None]:
# Target column names, paired position-by-position with the labels used in plots.
targets = [
    "target_carbon_monoxide",
    "target_benzene",
    "target_nitrogen_oxides",
]
target_names = [
    "Carbon monoxide",
    "Benzene",
    "Nitrogen oxides",
]

In [None]:
# One 40-bin histogram per target, side by side, each with its own y scale.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 6), sharey=False)
fig.suptitle("Target values distribution", fontsize=20)

colors = ["mediumorchid", "lightseagreen", "cornflowerblue"]

panels = zip(axs, targets, target_names, colors)
for number, (panel, column, label, color) in enumerate(panels, start=1):
    panel.hist(train[column], bins=40, edgecolor="black", color=color)
    panel.set_title(f"{label} (target #{number})", fontsize=15, pad=5)
    panel.set_xlabel(f"{label} level", fontsize=13, labelpad=5)
    panel.set_ylabel("Amount of values", fontsize=13, labelpad=5)
    panel.grid(axis="y")

plt.show();

In [None]:
# One box plot per target, side by side, each with its own y scale.
fig, axs = plt.subplots(figsize=(15, 6), ncols=3, nrows=1, sharey=False)

fig.suptitle("Target values distribution", fontsize=20)

for i in [0, 1, 2]:
    axs[i].boxplot(train[targets[i]])
    axs[i].set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=5)
    # In a box plot the y axis carries the measured level (not a count), and
    # the x axis holds a single box, so only the y axis gets a label.
    axs[i].set_ylabel(f"{target_names[i]} level", fontsize=13, labelpad=5)
    axs[i].set_xticks([])
    axs[i].grid(axis="y")

plt.show();

In [None]:
pip install pycaret

In [None]:
from pycaret.regression import setup, compare_models, blend_models, finalize_model, predict_model, plot_model

In [None]:
# One modelling frame per pollutant: all of `train` except the two *other*
# target columns, so each frame carries exactly one target.
carbon = train.drop(['target_benzene', 'target_nitrogen_oxides'], axis='columns')
benzene = train.drop(['target_carbon_monoxide', 'target_nitrogen_oxides'], axis='columns')
nitrogen = train.drop(['target_carbon_monoxide', 'target_benzene'], axis='columns')

In [None]:
# 80/20 hold-out splits. A fixed seed makes them reproducible across
# "Restart & Run All"; the value itself is arbitrary.
# NOTE(review): each split is drawn from the full `train` frame, so every
# split still contains all three target columns. A model fitted on one of
# these frames sees the other two targets as features.
SPLIT_SEED = 42
train_carbon, test_carbon = train_test_split(train, test_size=.2, random_state=SPLIT_SEED)
train_benzene, test_benzene = train_test_split(train, test_size=.2, random_state=SPLIT_SEED)
train_nitrogen, test_nitrogen = train_test_split(train, test_size=.2, random_state=SPLIT_SEED)

In [None]:
def training(train, test, target, n_select, fold, opt):
    """Run the PyCaret regression workflow for one target and predict ``test``.

    Steps, in order: ``setup`` on ``train``, ``compare_models`` (xgboost
    excluded) keeping the ``n_select`` best by ``opt``, a feature plot of the
    top model, ``blend_models`` over the kept models, a ``predict_model`` call
    on the blend with no data argument, ``finalize_model``, and finally
    ``predict_model`` on ``test``.

    Parameters
    ----------
    train : frame passed to ``setup`` as ``data``; must contain ``target``.
    test : frame passed to the final ``predict_model`` call.
    target : name of the target column.
    n_select : number of models kept by ``compare_models``; the result is
        indexed and passed on as a list, so this presumably must be > 1.
    fold : fold count used for both comparison and blending.
    opt : metric name used to sort the comparison and optimise the blend.

    Returns
    -------
    The ``'Label'`` column of the prediction frame for ``test``.
    """
    setup(data=train, target=target, numeric_imputation='mean', silent=True)

    top_models = compare_models(sort=opt, n_select=n_select, fold=fold, exclude=['xgboost'])
    plot_model(estimator=top_models[0], plot='feature')

    ensemble = blend_models(estimator_list=top_models, fold=fold, optimize=opt)
    # Called without data; the returned frame is not used afterwards.
    predict_model(ensemble)

    fitted_ensemble = finalize_model(ensemble)
    return predict_model(fitted_ensemble, test)['Label']

In [None]:
# `sub` has one row per row of the competition `test` file, so predictions
# must be made on `test` itself, not on a hold-out slice of `train`.
# `test` needs the same calendar columns that `train` carries; they are
# derived here from `test`'s own timestamps, on a copy so the raw frame is
# left untouched.
test_features = test.copy()
test_features['date_time'] = pd.to_datetime(test_features['date_time'])
test_stamps = test_features['date_time'].dt
test_features['year'] = test_stamps.year
test_features['month'] = test_stamps.month
test_features['week'] = test_stamps.isocalendar().week.astype('int64')
test_features['day'] = test_stamps.day
test_features['dayofweek'] = test_stamps.dayofweek
test_features['hour'] = test_stamps.hour

# One model per pollutant, each fitted on the frame that holds only its own
# target column, so the other two targets cannot leak in as features.
target_frames = {
    'target_carbon_monoxide': carbon,
    'target_benzene': benzene,
    'target_nitrogen_oxides': nitrogen,
}

for target_col, target_frame in target_frames.items():
    predictions = training(target_frame, test_features, target_col, 5, 3, 'RMSLE')
    # The targets are modelled on their original scale (no log transform is
    # applied anywhere before fitting), so predictions are written as-is.
    # to_numpy() assigns by position; a length mismatch with `sub` raises
    # instead of silently producing NaNs through index alignment.
    sub[target_col] = predictions.to_numpy()