In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_log_error

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneGroupOut

import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Competition files, read in order: training data, test data, sample submission.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
        '../input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

# Basic information

In [None]:
# Preview the first three rows of the training data.
train.head(3)

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts.
train.info()

Good: there are no missing values. The data has 8 float features and 3 target columns.

# New features

In [None]:
# Remove the training row labelled 7110 before any feature is derived.
train = train.drop(index = [7110])

# Parse the timestamp column in both frames.
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'])

# Calendar month of every remaining training row (kept as a separate Series).
months = train["date_time"].dt.month

peak_hours = [8, 9, 17, 18, 19, 20]
work_hours = np.arange(8, 19, 1)
shift_steps = [1, 3, 6, 8, 12, 24]
# New column name -> source column for the 12-row backward differences.
lag12_sources = {'rel_hum-12': 'relative_humidity', 'deg-12': 'deg_C'}

for frame in (train, test):
    stamps = frame['date_time'].dt

    # Calendar features.
    frame['day_of_week'] = stamps.dayofweek
    frame['hour'] = stamps.hour
    frame['max_hours'] = frame['hour'].isin(peak_hours).astype('int')
    frame['working_hours'] = frame['hour'].isin(work_hours).astype('int')
    frame['is_weekend'] = (stamps.dayofweek >= 5).astype('int')

    # Difference between the current value and the value 12 rows earlier.
    # Rows without a 12-row history are compared against 0 (fill_value).
    for new_col, source_col in lag12_sources.items():
        frame[new_col] = frame[source_col] - frame[source_col].shift(periods = 12, fill_value = 0)

    # For each step and each sensor: difference to the value `step` rows back
    # ('-' columns) and `step` rows ahead ('+' columns); missing neighbours count as 0.
    for step in shift_steps:
        for sensor_no in range(1, 6):
            readings = frame[f'sensor_{sensor_no}']
            frame[f's{sensor_no}-{step}'] = readings - readings.shift(periods = step, fill_value = 0)
            frame[f's{sensor_no}+{step}'] = readings - readings.shift(periods = -step, fill_value = 0)

# EDA

**Target values**

In [None]:
# Cast 'date_time' to object dtype in both frames before plotting.
for frame in (train, test):
    frame['date_time'] = frame['date_time'].astype('object')

co_color = '#4cb5f5'

fig = plt.figure(figsize = (15, 15))

# Top panel: kernel density estimate of the carbon monoxide target.
plt.subplot(211)
sns.set_style("white")
plt.title('Carbon monoxide', size = 25, y = 1.06, fontname = 'monospace', color = '#1e1f26')
plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
density_ax = sns.kdeplot(train['target_carbon_monoxide'], color = co_color, shade = True,
                         label = 'Carbon monoxide', alpha = 0.8, linewidth = 1, edgecolor = 'black')
plt.ylabel('')
plt.xlabel('')
plt.xticks(fontname = 'monospace')
plt.yticks([])

# Keep only a slightly thicker bottom spine on the density panel.
for side in ('right', 'left', 'top'):
    density_ax.spines[side].set_visible(False)
density_ax.spines['bottom'].set_linewidth(1.1)

# Bottom panel: the target plotted against date_time for the training data.
plt.subplot(212)
trace_ax = sns.lineplot(x = "date_time", y = "target_carbon_monoxide", data = train,
                        color = co_color, linewidth = 0.5)
# Label text corrected: it previously read 'carbon moxide level'.
plt.ylabel('carbon monoxide level', size = 14, fontname = 'monospace')
plt.xlabel('train date time', size = 14, fontname = 'monospace', labelpad = 10)
plt.xticks([])
plt.yticks(size = 12, fontname = 'monospace')

for side in ('right', 'top'):
    trace_ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    trace_ax.spines[side].set_linewidth(1.3)

# Hand-placed captions under the trace; x ticks are hidden, so these name the period.
for caption_x, caption, align in (
    (0.16, 'START\n2010-03-10\n18:00:00', {}),
    (0.87, 'END\n2011-01-01\n00:00:00', {'ha': 'right'}),
):
    plt.figtext(caption_x, 0.08, caption, fontsize = 11, fontname = 'monospace', **align)

plt.show()

In [None]:
benzene_color = '#b7b8b6'

fig = plt.figure(figsize = (15, 15))

# Top panel: kernel density estimate of the benzene target.
plt.subplot(211)
sns.set_style("white")
plt.title('Benzene', size = 25, y = 1.06, fontname = 'monospace', color = '#1e1f26')
plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
density_ax = sns.kdeplot(train['target_benzene'], color = benzene_color, shade = True,
                         label = 'Benzene', alpha = 0.8, linewidth = 1, edgecolor = 'black')
plt.ylabel('')
plt.xlabel('')
plt.xticks(fontname = 'monospace')
plt.yticks([])

# Keep only a slightly thicker bottom spine on the density panel.
for side in ('right', 'left', 'top'):
    density_ax.spines[side].set_visible(False)
density_ax.spines['bottom'].set_linewidth(1.1)

# Bottom panel: the target plotted against date_time for the training data.
plt.subplot(212)
trace_ax = sns.lineplot(x = "date_time", y = "target_benzene", data = train,
                        color = benzene_color, linewidth = 0.5)
plt.ylabel('benzene level', size = 14, fontname = 'monospace')
plt.xlabel('train date time', size = 14, fontname = 'monospace', labelpad = 10)
plt.xticks([])
plt.yticks(size = 12, fontname = 'monospace')

for side in ('right', 'top'):
    trace_ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    trace_ax.spines[side].set_linewidth(1.3)

# Hand-placed captions under the trace; x ticks are hidden, so these name the period.
for caption_x, caption, align in (
    (0.16, 'START\n2010-03-10\n18:00:00', {}),
    (0.87, 'END\n2011-01-01\n00:00:00', {'ha': 'right'}),
):
    plt.figtext(caption_x, 0.08, caption, fontsize = 11, fontname = 'monospace', **align)

plt.show()

In [None]:
nox_color = '#34675c'

fig = plt.figure(figsize = (15, 15))

# Top panel: kernel density estimate of the nitrogen oxides target.
plt.subplot(211)
sns.set_style("white")
plt.title('Nitrogen oxides', size = 25, y = 1.06, fontname = 'monospace', color = '#1e1f26')
plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
density_ax = sns.kdeplot(train['target_nitrogen_oxides'], color = nox_color, shade = True,
                         label = 'Nitrogen oxides', alpha = 0.8, linewidth = 1, edgecolor = 'black')
plt.ylabel('')
plt.xlabel('')
plt.xticks(fontname = 'monospace')
plt.yticks([])

# Keep only a slightly thicker bottom spine on the density panel.
for side in ('right', 'left', 'top'):
    density_ax.spines[side].set_visible(False)
density_ax.spines['bottom'].set_linewidth(1.1)

# Bottom panel: the target plotted against date_time for the training data.
plt.subplot(212)
trace_ax = sns.lineplot(x = "date_time", y = "target_nitrogen_oxides", data = train,
                        color = nox_color, linewidth = 0.5)
plt.ylabel('nitrogen oxides level', size = 14, fontname = 'monospace')
plt.xlabel('train date time', size = 14, fontname = 'monospace', labelpad = 10)
plt.xticks([])
plt.yticks(size = 12, fontname = 'monospace')

for side in ('right', 'top'):
    trace_ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    trace_ax.spines[side].set_linewidth(1.3)

# Hand-placed captions under the trace; x ticks are hidden, so these name the period.
for caption_x, caption, align in (
    (0.16, 'START\n2010-03-10\n18:00:00', {}),
    (0.87, 'END\n2011-01-01\n00:00:00', {'ha': 'right'}),
):
    plt.figtext(caption_x, 0.08, caption, fontsize = 11, fontname = 'monospace', **align)

plt.show()

**Distribution of features in train and test data**

In [None]:
# Train vs. test density of every raw feature, drawn on a 4 x 2 grid.
# Each entry: (column name, panel title).
kde_panels = [
    ('deg_C', 'Degree C°'),
    ('relative_humidity', 'Relative humidity'),
    ('absolute_humidity', 'Absolute humidity'),
    ('sensor_1', 'Sensor 1'),
    ('sensor_2', 'Sensor 2'),
    ('sensor_3', 'Sensor 3'),
    ('sensor_4', 'Sensor 4'),
    ('sensor_5', 'Sensor 5'),
]

fig = plt.figure(figsize = (15, 18))

kde_axes = []
for position, (column, heading) in enumerate(kde_panels, start = 1):
    plt.subplot(4, 2, position)
    sns.set_style("white")
    plt.title(heading, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
    panel_ax = sns.kdeplot(train[column], color = '#b7b8b6', shade = True, label = 'TRAIN',
                           alpha = 0.7, linewidth = 1, edgecolor = 'black')
    sns.kdeplot(test[column], color = '#34675c', shade = True, label = 'TEST',
                alpha = 0.7, linewidth = 1, edgecolor = 'black')
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    if position == 1:
        # A single legend, attached to the first panel, serves the whole grid.
        plt.legend(['TRAIN', 'TEST'], bbox_to_anchor = (1.28, 1.2), ncol = 1, borderpad = 3,
                   frameon = False, fontsize = 11)
    kde_axes.append(panel_ax)

# Strip every spine except a thicker bottom one.
for panel_ax in kde_axes:
    for side in ('right', 'left', 'top'):
        panel_ax.spines[side].set_visible(False)
    panel_ax.spines['bottom'].set_linewidth(1.5)

fig.tight_layout(h_pad = 3)

plt.show()

**Dynamics of feature changes**

In [None]:
# Every raw feature plotted against date_time, train and test on the same axes,
# stacked as 8 rows in one tall figure.
train_color, test_color = '#b7b8b6', '#34675c'

# Each entry: (column, panel title, y-axis label, figure-relative y of the date captions).
trend_panels = [
    ('deg_C', 'Degree C°', 'degree C° level', 0.881),
    ('relative_humidity', 'Relative humidity', 'relative humidity level', 0.754),
    ('absolute_humidity', 'Absolute humidity', 'absolute humidity level', 0.627),
    ('sensor_1', 'Sensor 1', 'sensor 1 level', 0.5),
    ('sensor_2', 'Sensor 2', 'sensor 2 level', 0.373),
    ('sensor_3', 'Sensor 3', 'sensor 3 level', 0.246),
    ('sensor_4', 'Sensor 4', 'sensor 4 level', 0.12),
    ('sensor_5', 'Sensor 5', 'sensor 5 level', -0.007),
]

# Each entry: (figure-relative x, caption text, extra alignment kwargs, text colour).
span_captions = [
    (0.09, 'START TRAIN\n2010-03-10\n18:00:00', {}, train_color),
    (0.74, 'END TRAIN\n2011-01-01\n00:00:00', {'ha': 'right'}, train_color),
    (0.745, 'START TEST\n2011-01-01\n00:00:00', {}, test_color),
    (0.95, 'END TEST\n2011-04-04\n14:00:00', {'ha': 'right'}, test_color),
]

fig = plt.figure(figsize = (15, 45))

for position, (column, heading, y_label, caption_y) in enumerate(trend_panels, start = 1):
    plt.subplot(8, 1, position)
    plt.title(heading, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'y', alpha = 0.8, zorder = 0,  dashes = (1,7))
    panel_ax = sns.lineplot(x = "date_time", y = column, data = train, color = train_color, linewidth = 0.5)
    sns.lineplot(x = "date_time", y = column, data = test, color = test_color, linewidth = 0.5)
    plt.ylabel(y_label, size = 14, fontname = 'monospace')
    plt.xlabel('')
    plt.xticks([])
    plt.yticks(size = 12, fontname = 'monospace')

    for side in ('right', 'top'):
        panel_ax.spines[side].set_visible(False)
    for side in ('bottom', 'left'):
        panel_ax.spines[side].set_linewidth(1.3)

    # x ticks are hidden, so hand-placed captions name the start/end of each period.
    for caption_x, caption, align, text_color in span_captions:
        plt.figtext(caption_x, caption_y, caption, fontsize = 11, fontname = 'monospace',
                    color = text_color, **align)

fig.tight_layout(h_pad = 7)

plt.show()

**Changes in the mean target values by day of week and by hour**

In [None]:
# Mean of each target per day of week.
# All seven day_of_week groups are kept: the previous `.iloc[1:,]` slice dropped
# the first group (day_of_week 0) from every panel.
m_carbon = train.groupby('day_of_week').agg({'target_carbon_monoxide': 'mean'}).reset_index()
m_benzene = train.groupby('day_of_week').agg({'target_benzene': 'mean'}).reset_index()
m_nitrogen = train.groupby('day_of_week').agg({'target_nitrogen_oxides': 'mean'}).reset_index()

# Each entry: (aggregated frame, target column, panel title, colour).
weekday_panels = [
    (m_carbon, 'target_carbon_monoxide', 'Carbon monoxide', '#4cb5f5'),
    (m_benzene, 'target_benzene', 'Benzene', '#b7b8b6'),
    (m_nitrogen, 'target_nitrogen_oxides', 'Nitrogen oxides', '#34675c'),
]

fig = plt.figure(figsize = (15, 13))

weekday_axes = []
for position, (means, target_col, heading, color) in enumerate(weekday_panels, start = 1):
    plt.subplot(3, 1, position)
    if position == 1:
        sns.set_style("white")
    plt.title(heading, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'y', alpha = 0.8, zorder = 0,  dashes = (1,7))
    # Line for the trend, scatter on top to mark the individual day means.
    panel_ax = sns.lineplot(data = means, x = 'day_of_week', y = target_col, color = color, linewidth = 3)
    sns.scatterplot(data = means, x = 'day_of_week', y = target_col, color = color, s = 60)
    plt.xticks(fontname = 'monospace')
    plt.yticks(fontname = 'monospace')
    plt.ylabel('')
    plt.xlabel('')
    weekday_axes.append(panel_ax)

# Open-frame look: hide top/right spines, thicken left/bottom, enlarge tick labels.
for panel_ax in weekday_axes:
    for side in ('right', 'top'):
        panel_ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        panel_ax.spines[side].set_linewidth(1.5)
    panel_ax.tick_params(labelsize = 11)

plt.figtext(0.2, 1.05, 'Changes of target values mean by day of week', fontsize = 30, fontname = 'monospace')

fig.tight_layout(h_pad = 5)

plt.show()

In [None]:
# Mean of each target per hour of day.
h_carbon = train.groupby('hour').agg({'target_carbon_monoxide': 'mean'}).reset_index()
h_benzene = train.groupby('hour').agg({'target_benzene': 'mean'}).reset_index()
h_nitrogen = train.groupby('hour').agg({'target_nitrogen_oxides': 'mean'}).reset_index()

# Each entry: (aggregated frame, target column, panel title, colour).
hourly_panels = [
    (h_carbon, 'target_carbon_monoxide', 'Carbon monoxide', '#4cb5f5'),
    (h_benzene, 'target_benzene', 'Benzene', '#b7b8b6'),
    (h_nitrogen, 'target_nitrogen_oxides', 'Nitrogen oxides', '#34675c'),
]

fig = plt.figure(figsize = (15, 13))

hourly_axes = []
for position, (means, target_col, heading, color) in enumerate(hourly_panels, start = 1):
    plt.subplot(3, 1, position)
    if position == 1:
        sns.set_style("white")
    plt.title(heading, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'y', alpha = 0.8, zorder = 0,  dashes = (1,7))
    # Line for the trend, scatter on top to mark the individual hourly means.
    panel_ax = sns.lineplot(data = means, x = 'hour', y = target_col, color = color, linewidth = 3)
    sns.scatterplot(data = means, x = 'hour', y = target_col, color = color, s = 60)
    plt.xticks(range(0,24,1), fontname = 'monospace')
    plt.yticks(fontname = 'monospace')
    plt.ylabel('')
    plt.xlabel('')
    hourly_axes.append(panel_ax)

# Open-frame look: hide top/right spines, thicken left/bottom, enlarge tick labels.
for panel_ax in hourly_axes:
    for side in ('right', 'top'):
        panel_ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        panel_ax.spines[side].set_linewidth(1.5)

for panel_ax in hourly_axes:
    panel_ax.tick_params(labelsize = 11)

plt.figtext(0.17, 1.05, 'Changes of target values mean by hour', fontsize = 30, fontname = 'monospace')

fig.tight_layout(h_pad = 5)

plt.show()

# The raw 'hour' column is removed from both frames once the plot is drawn.
# NOTE: this makes the cell single-shot -- the groupby('hour') calls above fail on a re-run.
for frame in (train, test):
    frame.drop(columns = ['hour'], inplace = True)

**Correlation**

In [None]:
# Pairwise correlations of the numeric columns, computed once and reused below.
# Non-numeric columns are excluded explicitly because `date_time` is object dtype
# at this point; only older pandas versions drop such columns from corr() silently.
corr = train.select_dtypes(include = ['number', 'bool']).corr()

# Boolean mask hiding the upper triangle and the diagonal. It is built from ones,
# not from the correlation values, so a coefficient of exactly 0 is masked as well.
matrix = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (12, 10))
sns.heatmap(corr, annot = False, cmap = 'Blues', fmt=".2f", mask = matrix, vmin = -1, vmax = 1, linewidths = 0.1, linecolor = 'white', cbar = False)
plt.xticks(size = 8, fontname = 'monospace')
plt.yticks(size = 8, fontname = 'monospace')
# Title placed inside the empty (masked) upper-right area of the figure.
plt.figtext(0.88, 0.76, '''Correlation
map''', fontsize = 40, fontname = 'monospace', ha = 'right', color = '#4897d8')
plt.show()

# Preprocessing for modeling

**Since this is a regression task, we take the logarithm (`log1p`) of the target values to make their relationship with the features as linear as possible.**

In [None]:
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Replace each target column of `train` by log1p of itself.
# NOTE: this overwrites `train` in place, so running the cell twice applies log1p twice.
for target_col in target_cols:
    train[target_col] = np.log1p(train[target_col])

# Log-scale targets: carbon monoxide, benzene, nitrogen oxides (in that order).
y1, y2, y3 = (train[target_col] for target_col in target_cols)

# Feature matrix: every training column except the three targets.
X = train.drop(columns = target_cols)

# Turn the timestamps into a plain number: nanoseconds since the epoch divided by 10**9.
for frame in (X, test):
    frame['date_time'] = frame['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9

In [None]:
# Distribution of each log-transformed target, side by side.
# Each entry: (target Series, panel title, colour).
log_target_panels = [
    (y1, 'Carbon monoxide', '#4cb5f5'),
    (y2, 'Benzene', '#b7b8b6'),
    (y3, 'Nitrogen oxides', '#34675c'),
]

fig = plt.figure(figsize = (16, 7))

log_target_axes = []
for position, (log_target, heading, color) in enumerate(log_target_panels, start = 1):
    plt.subplot(1, 3, position)
    sns.set_style("white")
    plt.title(heading, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'x', alpha = 0.8, zorder = 0,  dashes = (1,7))
    log_target_axes.append(sns.distplot(log_target, color = color))
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])

# Strip every spine except a thicker bottom one.
for panel_ax in log_target_axes:
    for side in ('right', 'left', 'top'):
        panel_ax.spines[side].set_visible(False)
    panel_ax.spines['bottom'].set_linewidth(1.5)

plt.figtext(0.3, 1.05, 'Logarithm of the target values', fontsize = 30, fontname = 'monospace')

plt.show()

# XGB

In [None]:
# Optuna parameters for each target (pass a different `target`, or change its default, per run)

def objective(trial, data = X, target = y3):
    """Optuna objective: mean 5-fold validation RMSLE of an XGBRegressor.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.
    data : feature frame to split and fit on (defaults to the notebook's `X`).
    target : log1p-transformed target aligned with `data` (defaults to `y3`).

    Returns
    -------
    float
        Mean over the folds of the RMSLE between `expm1(target)` and the
        `expm1` of the validation predictions (negative values clipped to 0).

    The function previously ignored `data` and `target` and always read the
    globals `X` and `y3`; it now uses its arguments. With the defaults the
    behaviour is unchanged.
    """

    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 5000, 30000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 200),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 228,
        'use_label_encoder': False,
        'eval_metric': 'rmsle'
    }

    model = XGBRegressor(**params)
    val_scores = []
    folds = KFold(n_splits = 5, random_state = 228, shuffle = True)
    for i, (trn_idx, val_idx) in enumerate(folds.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # The validation fold doubles as the early-stopping set.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 200, verbose = False)

        # Score on the original scale; mean_squared_log_error rejects negative
        # values, so predictions are clipped at 0 after undoing the log1p.
        val_preds = np.clip(np.expm1(model.predict(X_val)), 0, None)
        val_score = np.sqrt(mean_squared_log_error(np.expm1(y_val), val_preds))
        val_scores.append(val_score)

        print(f"Fold {i} | RMSLE: {val_score}")

    return float(np.mean(val_scores))

# Search for the hyper-parameters that minimise the objective over 100 trials.
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 100)

# Report how many trials ran and the best configuration found.
study_report = (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_trial.params),
    ('Best value:', study.best_value),
)
for label, value in study_report:
    print(label, value)

In [None]:
# Settings shared by all three tuned configurations; they are appended after the
# tuned values so the key order of each dict stays the same.
_xgb_fixed = {
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'random_state': 123,
    'use_label_encoder': False,
    'eval_metric': 'rmsle',
}

# Carbon monoxide -- reported mean RMSLE on 5 folds with Optuna: 0.099
paramsXGB1 = {
    'max_depth': 3,
    'learning_rate': 0.06974270910763652,
    'n_estimators': 24813,
    'min_child_weight': 15,
    'gamma': 0.00015352935707382668,
    'alpha': 0.007300887912196733,
    'lambda': 0.00233147304187698,
    'colsample_bytree': 0.6706673656091967,
    'subsample': 0.32392556118811044,
    **_xgb_fixed,
}

# Benzene -- reported mean score on 5 folds with Optuna: 0.078
# (originally labelled "RMSE"; the Optuna objective in this notebook prints RMSLE)
paramsXGB2 = {
    'max_depth': 3,
    'learning_rate': 0.01630350395073977,
    'n_estimators': 20058,
    'min_child_weight': 11,
    'gamma': 0.0009762828881569192,
    'alpha': 0.001235465069634119,
    'lambda': 0.0005268383741494084,
    'colsample_bytree': 0.5100114916691317,
    'subsample': 0.31372256786444536,
    **_xgb_fixed,
}

# Nitrogen oxides -- reported mean RMSLE on 5 folds with Optuna: 0.201
paramsXGB3 = {
    'max_depth': 6,
    'learning_rate': 0.07398714527058703,
    'n_estimators': 15509,
    'min_child_weight': 1,
    'gamma': 0.0010264813784765508,
    'alpha': 0.002893496668661691,
    'lambda': 0.008742987610869259,
    'colsample_bytree': 0.6256322009147708,
    'subsample': 0.5955732014997671,
    **_xgb_fixed,
}

In [None]:
# Fit XGBoost for every target with leave-one-month-out cross-validation and write
# the averaged test predictions into the submission frame.
# Previously only carbon monoxide was handled and the cell had to be edited and
# re-run by hand for the other two targets; it now covers all three in one pass.
logo = LeaveOneGroupOut()
# LeaveOneGroupOut yields one split per distinct group, i.e. one per month.
n_splits = months.nunique()

# Each entry: (submission column, log1p target, tuned parameters).
xgb_runs = [
    ('target_carbon_monoxide', y1, paramsXGB1),
    ('target_benzene', y2, paramsXGB2),
    ('target_nitrogen_oxides', y3, paramsXGB3),
]

for target_name, y_log, xgb_params in xgb_runs:
    # Running average (on the log1p scale) of the per-fold test predictions.
    # After the loop `predictions` holds the values of the last target.
    predictions = np.zeros(len(test))

    for fold, (trn_idx, val_idx) in enumerate(logo.split(X, y_log, months)):

        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y_log.iloc[trn_idx], y_log.iloc[val_idx]

        model = XGBRegressor(**xgb_params)

        # The held-out month is used as the early-stopping set.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'rmsle', verbose = False, early_stopping_rounds = 200)

        predictions += model.predict(test) / n_splits

    # Undo the log1p transform before storing the predictions.
    ss[target_name] = np.expm1(predictions)

In [None]:
# Write the submission frame to 'xgb.csv' without the index column.
ss.to_csv('xgb.csv', index=False)

**Result - 0.20204**

# LGBM

In [None]:
# Cast the calendar flag columns to pandas 'category' dtype in both the training
# features and the test frame.
categorical_cols = ['day_of_week', 'max_hours', 'working_hours', 'is_weekend']
for frame in (X, test):
    for col in categorical_cols:
        frame[col] = frame[col].astype('category')

In [None]:
# Optuna hyperparameter search for LightGBM.
# Rerun once per target: only the `target` default below changes between runs.

def objective(trial, data = X, target = y1):
    """Optuna objective: mean 5-fold validation RMSLE of an LGBMRegressor.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters below.
    data
        Feature frame; rows are selected positionally with ``.iloc``.
    target
        Target series aligned with ``data``. Scoring applies ``np.expm1`` to it,
        so it is presumably log1p-transformed upstream (TODO confirm).

    Returns
    -------
    Mean validation RMSLE over the 5 folds (the study minimises this value).
    """

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'n_estimators': trial.suggest_int('n_estimators', 5000, 30000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
        'cat_smooth' : trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 1, 200),
        'cat_feature': ['day_of_week', 'max_hours', 'working_hours', 'is_weekend'],
        'device_type': 'gpu',
        'boosting_type': 'gbdt',
        'random_state': 228,
        'metric': 'rmse'
    }

    model = LGBMRegressor(**params)
    val_scores = []
    k = KFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split and index the *arguments*, not the notebook globals, so that
    # objective(trial, data=..., target=...) really scores what was passed in.
    for i, (trn_idx, val_idx) in enumerate(k.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # The validation fold doubles as the early-stopping set.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 200, verbose = False)

        # Score on the original scale; mean_squared_log_error rejects negative
        # values, so negative back-transformed predictions are floored at 0.
        val_preds = np.expm1(model.predict(X_val))
        val_preds[val_preds < 0] = 0
        val_score = np.sqrt(mean_squared_log_error(np.expm1(y_val), val_preds))

        val_scores.append(val_score)

        print(f"Fold {i} | RMSLE: {val_score}")

    return np.mean(val_scores)

# Run the search: 100 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 100)

# Summarise the outcome of the search.
finished_trials = study.trials
print('Number of finished trials:', len(finished_trials))
best_trial = study.best_trial
print('Best trial:', best_trial.params)
print('Best value:', study.best_value)

In [None]:
def _lgbm_fixed_params():
    """Return a fresh dict of the settings shared by all three LightGBM models."""
    return {
        'cat_feature': ['day_of_week', 'max_hours', 'working_hours', 'is_weekend'],
        'device_type': 'gpu',
        'boosting_type': 'gbdt',
        'random_state': 228,
        'metric': 'rmse',
    }


# Carbon monoxide
# Mean RMSLE on 5 folds with Optuna - 0.093
paramsLGBM1 = {
    'reg_alpha': 0.15427250456095098,
    'reg_lambda': 3.5336982866924536,
    'num_leaves': 316,
    'min_child_samples': 11,
    'max_depth': 6,
    'n_estimators': 8261,
    'learning_rate': 0.012655980151740909,
    'colsample_bytree': 0.57088321168468,
    'cat_smooth': 73,
    'cat_l2': 14,
    'min_data_per_group': 145,
    **_lgbm_fixed_params(),
}

# Benzene
# Mean RMSLE on 5 folds with Optuna - 0.078
paramsLGBM2 = {
    'reg_alpha': 0.20068505646131332,
    'reg_lambda': 8.4922551227129,
    'num_leaves': 363,
    'min_child_samples': 20,
    'max_depth': 3,
    'n_estimators': 11088,
    'learning_rate': 0.010393056328793684,
    'colsample_bytree': 0.4742585430027215,
    'cat_smooth': 96,
    'cat_l2': 6,
    'min_data_per_group': 27,
    **_lgbm_fixed_params(),
}

# Nitrogen oxides
# Mean RMSLE on 5 folds with Optuna - 0.195
paramsLGBM3 = {
    'reg_alpha': 0.35252677348886996,
    'reg_lambda': 2.474116211604083,
    'num_leaves': 322,
    'min_child_samples': 5,
    'max_depth': 5,
    'n_estimators': 24463,
    'learning_rate': 0.020175434042643575,
    'colsample_bytree': 0.46226355546207754,
    'cat_smooth': 37,
    'cat_l2': 8,
    'min_data_per_group': 159,
    **_lgbm_fixed_params(),
}

In [None]:
# Leave-one-month-out training of LightGBM for a single target.
# Between runs only the target changes: the target series, the params dict
# and the submission column written at the end.
predictions = np.zeros(len(test))

# Build the splitter and fold count in this cell too (as the CatBoost cell does),
# so the cell does not depend on the XGBoost cell having been executed first.
logo = LeaveOneGroupOut()
n_splits = months.nunique()  # one fold per distinct month, so the weights below sum to 1

for fold, (trn_idx, val_idx) in enumerate(logo.split(X, y1, months)):

    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y1.iloc[trn_idx], y1.iloc[val_idx]

    model = LGBMRegressor(**paramsLGBM1)

    # The held-out month is used only as the early-stopping set.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'rmse', verbose = False, early_stopping_rounds = 200)

    # Equal-weight average of the per-fold test predictions.
    predictions += model.predict(test) / n_splits

# Predictions are mapped back with expm1 before being stored in the submission frame.
ss['target_carbon_monoxide'] = np.expm1(predictions)

In [None]:
# Write the submission frame produced by the LightGBM cell, without the row index.
ss.to_csv(path_or_buf = 'lgbm.csv', index = False)

**Result - 0.19910**

# CatBoost

In [None]:
# Optuna hyperparameter search for CatBoost.
# Rerun once per target: only the `target` default below changes between runs.
# NOTE: this definition replaces the LightGBM `objective` defined earlier in the notebook.
def objective(trial, data = X, target = y3):
    """Optuna objective: mean 5-fold validation RMSLE of a CatBoostRegressor.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters below.
    data
        Feature frame; rows are selected positionally with ``.iloc``.
    target
        Target series aligned with ``data``. Scoring applies ``np.expm1`` to it,
        so it is presumably log1p-transformed upstream (TODO confirm).

    Returns
    -------
    Mean validation RMSLE over the 5 folds (the study minimises this value).
    """
    params = {
        'depth': trial.suggest_int('depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'iterations': trial.suggest_int('iterations', 5000, 30000),
        'max_bin': trial.suggest_int('max_bin', 1, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10.0),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'cat_features': ['day_of_week', 'max_hours', 'working_hours', 'is_weekend'],
        'random_seed': 228,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE'
    }

    model = CatBoostRegressor(**params)
    val_scores = []
    k = KFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split and index the *arguments*, not the notebook globals, so that
    # objective(trial, data=..., target=...) really scores what was passed in.
    for i, (trn_idx, val_idx) in enumerate(k.split(data)):

        X_train, X_val = data.iloc[trn_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

        # The validation fold doubles as the early-stopping set.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 200, verbose = False, use_best_model = True)

        # Score on the original scale; mean_squared_log_error rejects negative
        # values, so negative back-transformed predictions are floored at 0.
        val_preds = np.expm1(model.predict(X_val))
        val_preds[val_preds < 0] = 0
        val_score = np.sqrt(mean_squared_log_error(np.expm1(y_val), val_preds))

        val_scores.append(val_score)

        print(f"Fold {i} | RMSLE: {val_score}")

    return np.mean(val_scores)

# Run the search: 100 trials, minimising the value returned by `objective`.
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 100)

# Summarise the outcome of the search.
finished_trials = study.trials
print('Number of finished trials:', len(finished_trials))
best_trial = study.best_trial
print('Best trial:', best_trial.params)
print('Best value:', study.best_value)

In [None]:
def _catboost_fixed_params():
    """Return a fresh dict of the settings shared by all three CatBoost models.

    These are the same non-searched values the CatBoost Optuna objective uses,
    including the seed, so every final model is trained under the setup it was
    tuned with.
    """
    return {
        'random_seed': 228,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'cat_features': ['day_of_week', 'max_hours', 'working_hours', 'is_weekend'],
    }


# Carbon monoxide
# Mean RMSLE on 5 folds with Optuna - 0.091
paramsCB1 = {
    'depth': 5,
    'learning_rate': 0.010965529849139899,
    'iterations': 14648,
    'max_bin': 254,
    'min_data_in_leaf': 200,
    'l2_leaf_reg': 0.06534721701106948,
    'bagging_temperature': 9.204797744803724,
    'subsample': 0.49288891655151623,
    'grow_policy': 'SymmetricTree',
    'leaf_estimation_method': 'Newton',
    **_catboost_fixed_params(),
}

# Benzene
# Mean RMSLE on 5 folds with Optuna - 0.077
# 'random_seed' was missing from this dict only; it now comes from the shared
# settings, matching the other two targets and the search configuration.
paramsCB2 = {
    'depth': 3,
    'learning_rate': 0.014982832523377104,
    'iterations': 12631,
    'max_bin': 127,
    'min_data_in_leaf': 198,
    'l2_leaf_reg': 0.03581505752474062,
    'bagging_temperature': 7.849773202956249,
    'subsample': 0.5267753257701031,
    'grow_policy': 'SymmetricTree',
    'leaf_estimation_method': 'Gradient',
    **_catboost_fixed_params(),
}

# Nitrogen oxides
# Mean RMSLE on 5 folds with Optuna - 0.196
paramsCB3 = {
    'depth': 4,
    'learning_rate': 0.01584329468703163,
    'iterations': 25893,
    'max_bin': 155,
    'min_data_in_leaf': 283,
    'l2_leaf_reg': 0.007844089667882592,
    'bagging_temperature': 4.004631998790942,
    'subsample': 0.7778197035482992,
    'grow_policy': 'SymmetricTree',
    'leaf_estimation_method': 'Gradient',
    **_catboost_fixed_params(),
}

In [None]:
# Leave-one-month-out training of CatBoost for a single target
# (as written: nitrogen oxides, i.e. y3 / paramsCB3).
predictions = np.zeros(len(test))
logo = LeaveOneGroupOut()
n_splits = months.nunique()  # one fold per distinct month, so the weights below sum to 1

for fold, (fit_rows, holdout_rows) in enumerate(logo.split(X, y3, months)):

    X_train, y_train = X.iloc[fit_rows], y3.iloc[fit_rows]
    X_val, y_val = X.iloc[holdout_rows], y3.iloc[holdout_rows]

    model = CatBoostRegressor(**paramsCB3)
    # The held-out month is used only for early stopping / best-iteration selection.
    model.fit(
        X_train,
        y_train,
        eval_set = [(X_val, y_val)],
        verbose = False,
        early_stopping_rounds = 200,
        use_best_model = True,
    )

    fold_prediction = model.predict(test)
    predictions += fold_prediction / n_splits

# Predictions are mapped back with expm1 before being stored in the submission frame.
ss['target_nitrogen_oxides'] = np.expm1(predictions)

In [None]:
# Write the submission frame produced by the CatBoost cell, without the row index.
ss.to_csv(path_or_buf = 'cb.csv', index = False)

**Result - 0.19584**

# Visualizing CatBoost predictions

In [None]:
# Overlay the raw training series and the current submission frame, one panel per target.
# `ss` holds whatever the model cells above last wrote into it; target columns that
# were never assigned presumably still carry the sample-submission values (verify).
train_v = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')

PREDICTION_COLOR = '#1e434c'


def _draw_target_panel(train_frame, pred_frame, position, column, title, ylabel,
                       train_color, train_label_xy, pred_label_xy):
    """Draw one target panel on the current figure and return its axes.

    train_frame / pred_frame : frames with a 'date_time' column and `column`.
    position                 : subplot code passed to plt.subplot (e.g. 311).
    column                   : target column plotted from both frames.
    title / ylabel           : panel title and y-axis label.
    train_color              : line and caption colour for the training series.
    train_label_xy / pred_label_xy : figure coordinates of the two captions.
    """
    plt.subplot(position)
    plt.title(title, size = 17, y = 1.03, fontname = 'monospace')
    plt.grid(color = 'gray', linestyle = ':', axis = 'y', alpha = 0.8, zorder = 0,  dashes = (1,7))
    ax = sns.lineplot(x = "date_time", y = column, data = train_frame, color = train_color, linewidth = 0.5)
    sns.lineplot(x = "date_time", y = column, data = pred_frame, color = PREDICTION_COLOR, linewidth = 0.5)
    plt.ylabel(ylabel, size = 14, fontname = 'monospace')
    plt.xlabel('')
    plt.xticks([])
    plt.yticks(size = 12, fontname = 'monospace')

    # Keep only the left and bottom spines, slightly thickened.
    for side in ['right', 'top']:
        ax.spines[side].set_visible(False)
    for side in ['bottom', 'left']:
        ax.spines[side].set_linewidth(1.3)

    # Hand-placed captions stand in for a legend.
    plt.figtext(*train_label_xy, 'train', fontsize = 13, fontname = 'monospace', color = train_color)
    plt.figtext(*pred_label_xy, 'predictions', fontsize = 13, fontname = 'monospace', color = PREDICTION_COLOR)
    return ax


fig = plt.figure(figsize = (15, 18))

# (subplot, column, title, y-label, train colour, 'train' caption xy, 'predictions' caption xy)
panels = [
    (311, 'target_carbon_monoxide', 'Carbon monoxide', 'carbon monoxide level', '#4cb5f5', (0.42, 0.675), (0.8, 0.675)),
    (312, 'target_benzene', 'Benzene', 'benzene level', '#b7b8b6', (0.42, 0.335), (0.8, 0.335)),
    (313, 'target_nitrogen_oxides', 'Nitrogen oxides', 'nitrogen oxides level', '#34675c', (0.41, -0.005), (0.8, -0.005)),
]
for panel in panels:
    _draw_target_panel(train_v, ss, *panel)

fig.tight_layout(h_pad = 5)

plt.show()

# Conclusion

This is my first experience working with time series, and I can say with full confidence that the results of time-series models depend very heavily on data preprocessing. Adding or removing just a few features here can worsen the score to around 0.25 — a level I was stuck at for a very long time, until I found out about lag features.