In [None]:
pip install -U lightautoml

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
import time
import random
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.dataset.roles import CategoryRole, DatetimeRole

# Notebook display limits for pandas: more rows/columns, untruncated cells, 5-decimal floats
display_options = {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': lambda x: '%.5f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Data import**

In [None]:
# Read both competition files, then parse their timestamp column into datetimes
train = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv", low_memory=False)
test = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/test.csv", low_memory=False)
for frame in (train, test):
    frame["date_time"] = pd.to_datetime(frame["date_time"], format="%Y-%m-%d %H:%M:%S")

train.info(memory_usage="deep")

In [None]:
# Same summary for the test set (memory_usage="deep" requests the deep memory estimate)
test.info(memory_usage="deep")

In [None]:
# First ten rows of the training set
train.head(10)

# **EDA**

In [None]:
# Target column names, in the order used throughout the notebook
targets = [
    "target_carbon_monoxide",
    "target_benzene",
    "target_nitrogen_oxides",
]
# Human-readable names for plot titles, index-aligned with `targets`
target_names = [
    "Carbon monoxide",
    "Benzene",
    "Nitrogen oxides",
]

The idea of SMC feature below was taken from this [notebook](https://www.kaggle.com/junhyeok99/automl-pycaret).

Taking into account temperature changes was suggested by [@lukaszborecki](https://www.kaggle.com/lukaszborecki) [here](https://www.kaggle.com/c/tabular-playground-series-jul-2021/discussion/250931#1380107).

In [None]:
def add_new_plot_features(df):
    """
    Adds calendar-based columns used by the EDA plots.

    The columns are derived from ``df["date_time"]`` and written into ``df``
    itself: month, day_of_week, day_of_year, hour, quarter, week_of_year
    (ISO week), working_hours (1 for hours 8..20) and is_weekend (1 for
    Saturday/Sunday). The same frame is returned.
    """
    stamps = df["date_time"].dt
    calendar_parts = {
        "month": stamps.month,
        "day_of_week": stamps.dayofweek,
        "day_of_year": stamps.dayofyear,
        "hour": stamps.hour,
        "quarter": stamps.quarter,
        "week_of_year": stamps.isocalendar().week.astype("int"),
    }
    for column, values in calendar_parts.items():
        df[column] = values

    # Integer flags so they can be averaged and plotted directly
    df["working_hours"] = df["hour"].between(8, 20).astype("int")
    df["is_weekend"] = stamps.dayofweek.ge(5).astype("int")
    return df

def add_new_ml_features(df, i=3): # i=3 is for heatmap plot
    """
    Adds new features to a given dataset for training

    Parameters
    ----------
    df : pandas.DataFrame
        Needs "date_time", "absolute_humidity", "relative_humidity" and every
        column of the feature list selected by ``i``. The time flags and "SMC"
        are written into this frame in place.
    i : int
        Index of the feature list to difference: 0, 1, 2 are the lists used for
        the three targets (lists 1 and 2 need the ``*_preds`` columns of the
        previously trained targets), 3 is the list used for the heatmap plot.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns (minus the helper "hour" column),
        the flags, "SMC" and, per feature and shift, the columns
        "<feature>-<shift>abs_shift" (value minus the value ``shift`` rows
        earlier) and "<feature>+<shift>abs_shift" (value minus the value
        ``shift`` rows later). Rows with no such neighbour use 0 as neighbour.
    """
    # Features to be added to every target dataset
    df["hour"] = df["date_time"].dt.hour
    df["working_hours"] =  df["hour"].isin(np.arange(8, 21, 1)).astype("int")
    df["maximum_hours"] =  df["hour"].isin([8, 9, 17, 18, 19, 20]).astype("int")
    # Marking weekends because they usually have lower target values
    df["is_weekend"] = (df["date_time"].dt.dayofweek >= 5).astype("int")
    df["SMC"] = (df["absolute_humidity"] * 100) / df["relative_humidity"]

    # A list of features to generate shifted and lagged values
    shift_features = [["SMC", "absolute_humidity", "deg_C",
                      "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"],
                      ["SMC", "absolute_humidity", "target_carbon_monoxide_preds",
                      "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"],
                      ["SMC", "absolute_humidity", "target_carbon_monoxide_preds", "target_benzene_preds",
                      "sensor_1", "sensor_2", "sensor_3", "sensor_5"],
                      # Features for heatmap plot
                      ["SMC", "absolute_humidity", "deg_C",
                      "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]]

    # Amounts of hour shifts and lags
    shifts = [1, 2, 3, 4, 5, 6, 12, 24]

    # Collect the difference columns first and attach them with one concat:
    # inserting them one by one fragments the frame (pandas PerformanceWarning)
    shifted_columns = {}
    for feature in shift_features[i]:
        values = df[feature]
        for shift in shifts:
            shifted_columns[feature+"-"+str(shift)+"abs_shift"] = values - values.shift(periods=shift, fill_value=0)
            shifted_columns[feature+"+"+str(shift)+"abs_shift"] = values - values.shift(periods=-shift, fill_value=0)

    # Same-named columns already present are replaced, not duplicated
    stale = [name for name in shifted_columns if name in df.columns]
    base = df.drop(stale + ["hour"], axis=1)
    return pd.concat([base, pd.concat(shifted_columns, axis=1)], axis=1)

In [None]:
# Keep untouched copies of both datasets before the plotting columns are added;
# the copies are what the feature engineering and training cells start from
train_copy, test_copy = train.copy(), test.copy()
train, test = add_new_plot_features(train), add_new_plot_features(test)

The datasets have timestamps. Let's compare which dates are in each dataset.

In [None]:
# Relative sizes of the two datasets, drawn as one stacked horizontal bar.
# Sizes and date labels are read from the data rather than hard-coded.
n_train, n_test = len(train), len(test)
# Tick labels: first train date, first test date (the split point), last test date
date_ticks = [stamp.strftime("%Y-%m-%d") for stamp in
              (train["date_time"].min(), test["date_time"].min(), test["date_time"].max())]

fig, ax = plt.subplots(figsize=(16, 1.5))
# The full-length bar is drawn first so that the train bar paints over its left part
bar1 =  ax.barh(0, n_train + n_test, color="salmon", height=0.2)
bar2 =  ax.barh(0, n_train, color="teal", height=0.2)
ax.set_title("Train and test datasets size comparison", fontsize=20, pad=5)
ax.bar_label(bar1, ["Test dataset"], label_type="edge", padding=-170,
             fontsize=20, color="white", weight="bold")
ax.bar_label(bar2, ["Train dataset"], label_type="center",
             fontsize=20, color="white", weight="bold")
ax.set_xticks([0, n_train, n_train + n_test])
ax.set_xticklabels(date_ticks)
ax.set_yticks([])
plt.show();

The datasets also have three target columns that the model has to predict. Let's see how each target changes over time.

In [None]:
# One stacked panel per target, each showing the full series over the training period
colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)
plt.subplots_adjust(hspace = 0.3)

for i, (panel, column, name, color) in enumerate(zip(axs, targets, target_names, colors)):
    panel.plot(train["date_time"], train[column], color=color)
    panel.set_title(f"{name} (target #{i+1}) levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{name} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Date", fontsize=14, labelpad=5)
    panel.grid(axis="both")

plt.show();

Let's see mean target values per day of year.

In [None]:
colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

# Train data without its final row (index 7110), the only row that falls in January
df = train.drop([7110], axis=0).copy()
df["day"] = df["date_time"].dt.dayofyear
df["weekday"] = df["date_time"].dt.dayofweek

# Day-of-year numbers of all Mondays: tick positions that mark week starts
is_monday = df["weekday"] == 0
mondays = df.loc[is_monday]["day"].value_counts(sort=False).index
# Week-of-year numbers used as tick labels (the first week present is skipped)
weeks = df["date_time"].dt.isocalendar().week.unique()[1:]

fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)
plt.subplots_adjust(hspace = 0.3)

for i, panel in enumerate(axs):
    daily_mean = df.groupby("day")[targets[i]].mean()
    panel.plot(daily_mean.index, daily_mean.values, color=colors[i])
    panel.set_title(f"{target_names[i]} (target #{i+1}) mean levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{target_names[i]} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Week starts", fontsize=14, labelpad=5)
    panel.set_xticks(mondays)
    panel.set_xticklabels(weeks)
    panel.grid(axis="both")

plt.show();

As you can see, all target values usually go down at the end of each week (i.e. during weekends). 

Let's check targets distribution along each month.

In [None]:
# Carbon monoxide, one panel per month (months 3..12 fill the 5x2 grid row by row)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[0], fontsize=20)

for panel, month in zip(axs.flat, range(3, 13)):
    # The line carries a label so that legend() has an entry to show
    panel.plot(train.loc[train["month"]==month, targets[0]], color="steelblue",
               label=target_names[0])
    panel.set_title(f"Month #{month}", fontsize=15)
    panel.legend(fontsize=13)

In [None]:
# Benzene, one panel per month (months 3..12 fill the 5x2 grid row by row)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[1], fontsize=20)

for panel, month in zip(axs.flat, range(3, 13)):
    # The line carries a label so that legend() has an entry to show
    panel.plot(train.loc[train["month"]==month, targets[1]], color="palevioletred",
               label=target_names[1])
    panel.set_title(f"Month #{month}", fontsize=15)
    panel.legend(fontsize=13)

There are some near-zero flat areas in the plots for the 4th, 6th, 8th, and 12th months. We need to figure out what is so special about these days. They may also be garbage data which should be deleted before machine learning.

In [None]:
# Nitrogen oxides, one panel per month (months 3..12 fill the 5x2 grid row by row)
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)
fig.suptitle(target_names[2], fontsize=20)

for panel, month in zip(axs.flat, range(3, 13)):
    # The line carries a label so that legend() has an entry to show
    panel.plot(train.loc[train["month"]==month, targets[2]], color="goldenrod",
               label=target_names[2])
    panel.set_title(f"Month #{month}", fontsize=15)
    panel.legend(fontsize=13)

Let's check each target value distribution.

In [None]:
# Side-by-side histograms (60 bins each) of the three target columns
colors = ["mediumorchid", "lightseagreen", "cornflowerblue"]

fig, axs = plt.subplots(figsize=(15, 6), ncols=3, nrows=1, sharey=False)
fig.suptitle("Target values distribution", fontsize=20)

for i, (panel, column, name) in enumerate(zip(axs, targets, target_names)):
    panel.hist(train[column], bins=60, edgecolor="black", color=colors[i])
    panel.set_title(f"{name} (target #{i+1})", fontsize=15, pad=5)
    panel.set_ylabel("Amount of values", fontsize=13, labelpad=5)
    panel.set_xlabel(f"{name} level", fontsize=13, labelpad=5)
    panel.grid(axis="y")

plt.show();

Let's check how each target value changes depending on the time of day, day of week, and month.

In [None]:
# Mean and median of every target per hour of day, drawn as paired bars
fig, axs = plt.subplots(figsize=(16, 18), ncols=1, nrows=3, sharex=False)
plt.subplots_adjust(hspace = 0.3)

width=0.35
by_hour = train.groupby("hour")
# Bar positions: the hours present in the data
x = by_hour["target_carbon_monoxide"].mean().index

for i, panel in enumerate(axs):
    hourly = by_hour[targets[i]]
    panel.bar(x-width/2, hourly.mean(),
              width=width, edgecolor="black", label="Mean", color="cornflowerblue")
    panel.bar(x+width/2, hourly.median(),
              width=width, edgecolor="black", label="Median", color="palevioletred")
    panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
    panel.set_ylabel("Target value", fontsize=13, labelpad=5)
    panel.set_xlabel("Day hours", fontsize=13, labelpad=5)
    panel.set_xticks(x)
    panel.grid(axis="y")
    panel.legend(fontsize=13)

In [None]:
# Train data without its final row (index 7110), the only row that falls in January
df = train.drop([7110], axis=0).copy()

fig, axs = plt.subplots(figsize=(16, 19), ncols=2, nrows=3, sharex=False,
                        gridspec_kw={'width_ratios': [1, 1.5]})
fig.suptitle("Target values distribution per month and day of week", fontsize=20)
plt.subplots_adjust(hspace = 0.25)

width=0.35
# (grid column, grouping key, x-axis label, tick offset); weekdays are shown as 1..7
panels = [(0, "day_of_week", "Day of week", 1),
          (1, "month", "Month", 0)]

for col, key, xlabel, tick_offset in panels:
    grouped = df.groupby(key)
    x = grouped["target_carbon_monoxide"].mean().index + tick_offset
    for i in np.arange(3):
        panel = axs[i, col]
        panel.bar(x-width/2, grouped[targets[i]].mean(),
                  width=width, edgecolor="black", label="Mean", color="salmon")
        panel.bar(x+width/2, grouped[targets[i]].median(),
                  width=width, edgecolor="black", label="Median", color="teal")
        panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
        panel.set_ylabel("Target value", fontsize=13, labelpad=5)
        panel.set_xlabel(xlabel, fontsize=13, labelpad=5)
        panel.set_xticks(x)
        panel.grid(axis="y")
        panel.legend(fontsize=13)

In [None]:
# Day hours which will be used for plotting data, and the bar color of each hour
hours = [0, 5, 8, 14, 19]
hour_colors = ["salmon", "sandybrown", "teal", "palevioletred", "mediumslateblue"]
# Dataframe copy excluding the last row which is the only one representing January
df = train.loc[train["hour"].isin(hours)].drop([7110], axis=0).copy()

fig, axs = plt.subplots(figsize=(16, 18), ncols=2, nrows=3, sharex=False,
                        gridspec_kw={'width_ratios': [1, 1.5]})

fig.suptitle("Target values distribution per month and day of week at given hours", fontsize=20)

plt.subplots_adjust(hspace = 0.3)
width=0.15

# (grid column, grouping key, x-axis label, tick offset); weekdays are shown as 1..7
panels = [(0, "day_of_week", "Day of week", 1),
          (1, "month", "Month", 0)]

for col, key, xlabel, tick_offset in panels:
    # Sorted so that positions line up with the (sorted) groupby output
    x = np.sort(df[key].unique()) + tick_offset
    for i in np.arange(3):
        panel = axs[i, col]
        for slot, (hour, color) in enumerate(zip(hours, hour_colors)):
            # Bars of one group are spread symmetrically around the tick
            offset = width * (slot - (len(hours) - 1) / 2)
            panel.bar(x + offset, df.loc[df["hour"] == hour].groupby(key)[targets[i]].mean(),
                      width=width, edgecolor="black", label=f"{hour:02d}:00", color=color)
        panel.set_title(f"{target_names[i]} (target #{i+1})", fontsize=15, pad=10)
        panel.set_ylabel("Target value", fontsize=13, labelpad=5)
        panel.set_xlabel(xlabel, fontsize=13, labelpad=5)
        panel.set_xticks(x)
        panel.grid(axis="y")
        panel.legend(fontsize=10)

## Feature plots

In [None]:
# Lists of feature names to be used for plots below.
# Each *_names list holds the display labels, index-aligned with its column list.
weather_features = ["deg_C", "relative_humidity", "absolute_humidity"]
weather_feature_names = ["Temperature (deg. C)", "Relative humidity", "Absolute humidity"]

sensor_features = ["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]
sensor_feature_names = ["Sensor 1", "Sensor 2", "Sensor 3", "Sensor 4", "Sensor 5"]

# The combined lists are built from the two groups so they cannot drift apart
all_features = weather_features + sensor_features
all_feature_names = weather_feature_names + sensor_feature_names

Let's compare our train and test feature data.

In [None]:
# Every input feature over time: train period in color, test period in black
colors = ["lightcoral", "sandybrown", "darkorange", "mediumseagreen",
          "lightseagreen", "cornflowerblue", "mediumpurple", "palevioletred",
          "lightskyblue", "sandybrown", "yellowgreen", "indianred",
          "lightsteelblue", "mediumorchid", "deepskyblue"]

fig, axs = plt.subplots(figsize=(16, 30), ncols=1, nrows=8, sharex=False)
plt.subplots_adjust(hspace = 0.4)

for panel, feature, feature_name, color in zip(axs, all_features, all_feature_names, colors):
    panel.plot(train["date_time"], train[feature], color=color, label="Train data")
    panel.plot(test["date_time"], test[feature], color="black", label="Test data")
    panel.set_title(f"{feature_name} levels across time", fontsize=20, pad=5)
    panel.set_ylabel(f"{feature_name} level", fontsize=14, labelpad=5)
    panel.set_xlabel("Date", fontsize=14, labelpad=5)
    # Thick proxy lines are used as legend handles instead of the plotted lines
    legend_lines = [Line2D([0], [0], color=color, lw=10),
                    Line2D([0], [0], color="black", lw=10)]
    panel.legend(legend_lines, ["Train data", "Test data"], fontsize=12, loc=1)
    panel.grid(axis="both")

In [None]:
# Plot dataframe creation: train and test stacked into one continuous frame
df = pd.concat([train_copy, test_copy], axis=0)
df.reset_index(drop=True, inplace=True)
df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
df["day_of_year"] = df["date_time"].dt.dayofyear

fig, axs = plt.subplots(figsize=(16, 18), ncols=2, nrows=3, sharex=False)

plt.subplots_adjust(hspace = 0.4)

colors = ["palevioletred", "deepskyblue", "mediumseagreen"]

# (grid column, grouping key, first row of the new year, title word, x-axis label)
# New year days start from 7110th row.
# New year weeks start from 7159th row: because Jan 1st and 2nd from the test dataset
# are counted as 52nd week of 2010, the colored plotline contains some test data.
views = [(0, "day_of_year", 7110, "daily", "Day of year"),
         (1, "week_of_year", 7159, "weekly", "Week of year")]

for col, key, split, period, xlabel in views:
    for i in [0, 1, 2]:
        panel = axs[i, col]
        data = df.iloc[:split].groupby(key)[weather_features[i]].mean()
        panel.plot(data.index, data.values, color=colors[i], label="Train data")
        data = df.iloc[split:].groupby(key)[weather_features[i]].mean()
        panel.plot(data.index, data.values, color="black", alpha=0.7, label="Test data")
        panel.set_title(f"Mean {period} {weather_feature_names[i]} levels", fontsize=20, pad=5)
        panel.set_ylabel(f"{weather_feature_names[i]} level", fontsize=14, labelpad=5)
        panel.set_xlabel(xlabel, fontsize=14, labelpad=5)
        panel.grid(axis="both")
        panel.legend(fontsize=12)

plt.show();

In [None]:
# Plot dataframe creation: train and test stacked into one continuous frame
df = pd.concat([train_copy, test_copy], axis=0)
df.reset_index(drop=True, inplace=True)
df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
df["day_of_year"] = df["date_time"].dt.dayofyear

fig, axs = plt.subplots(figsize=(16, 30), ncols=2, nrows=5, sharex=False)

plt.subplots_adjust(hspace = 0.4)

colors = ["palevioletred", "deepskyblue", "mediumseagreen", "goldenrod", "indianred"]

# (grid column, grouping key, row where the colored line ends and the black one
# starts, title word, x-axis label). The weekly panels are titled "weekly";
# they were previously titled like the daily ones.
views = [(0, "day_of_year", 7110, "daily", "Day of year"),
         (1, "week_of_year", 7159, "weekly", "Week of year")]

for col, key, split, period, xlabel in views:
    for i in np.arange(5):
        panel = axs[i, col]
        data = df.iloc[:split].groupby(key)[sensor_features[i]].mean()
        panel.plot(data.index, data.values, color=colors[i], label="Train data")
        data = df.iloc[split:].groupby(key)[sensor_features[i]].mean()
        panel.plot(data.index, data.values, color="black", alpha=0.7, label="Test data")
        panel.set_title(f"Mean {period} {sensor_feature_names[i]} levels", fontsize=20, pad=5)
        panel.set_ylabel(f"{sensor_feature_names[i]} level", fontsize=14, labelpad=5)
        panel.set_xlabel(xlabel, fontsize=14, labelpad=5)
        panel.grid(axis="both")
        panel.legend(fontsize=12)

plt.show();

Let's check feature correlation.

In [None]:
# Plot dataframe: targets first, then the remaining columns
df = train_copy.copy()
df = pd.concat([df[targets], df.drop(targets, axis=1)], axis=1)
# Only numeric/bool columns are correlated; selecting them explicitly keeps
# date_time out regardless of how corr() treats non-numeric columns
df = df.select_dtypes(include=["number", "bool"]).corr().round(2)

# Boolean mask hiding the diagonal and the upper-right part of plot as it is a duplicate
mask = np.triu(np.ones(df.shape, dtype=bool))

# Making a plot
plt.figure(figsize=(12,12))
ax = sns.heatmap(df, annot=True, mask=mask, cmap="RdBu", linewidths=1,
                 annot_kws={"weight": "bold", "fontsize":13})
ax.set_title("Original dataset correlation heatmap", fontsize=17)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor", weight="bold")
plt.setp(ax.get_yticklabels(), weight="bold")
plt.show();

In [None]:
# Plot dataframe: engineered features added, targets first, then the remaining columns
df = add_new_ml_features(train_copy.copy())
df = pd.concat([df[targets], df.drop(targets, axis=1)], axis=1)
# Only numeric/bool columns are correlated; selecting them explicitly keeps
# date_time out regardless of how corr() treats non-numeric columns.
# `df` (the rounded correlation matrix) is read again by the next cell.
df = df.select_dtypes(include=["number", "bool"]).corr().round(2)

# Boolean mask hiding the diagonal and the upper-right part of plot as it is a duplicate
mask = np.triu(np.ones(df.shape, dtype=bool))

# Making a plot
plt.figure(figsize=(16,16))
ax = sns.heatmap(df, annot=False, mask=mask, cmap="RdBu", annot_kws={"weight": "bold", "fontsize": 7})
ax.set_title("Original and engineered features correlation heatmap", fontsize=17)
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor", weight="normal")
plt.setp(ax.get_yticklabels(), weight="normal")
plt.show();

In [None]:
# Checking the most correlated features: every pair whose rounded correlation
# has magnitude of at least 0.93, skipping values equal to exactly 1
for col in df.columns:
    for index, value in df[col].items():
        if value != 1 and abs(value) >= 0.93:
            print(f"Correlation of {index} and {col} is {value}")

# **Model training**

The datetime conversion shown below was found in this [notebook](https://www.kaggle.com/jarupula/eda-rf-model-tps-july-21). It gives a significant score boost.

In [None]:
def prepare_dataset(train_copy, test_copy, i):
    """
    Builds the model inputs for one target.

    Parameters
    ----------
    train_copy, test_copy : pandas.DataFrame
        Train and test frames; train must contain the three target columns.
        For i > 0 both must also hold the ``*_preds`` columns that
        ``add_new_ml_features`` uses for that ``i``.
    i : int
        Feature-list index forwarded to ``add_new_ml_features``.

    Returns
    -------
    X : pandas.DataFrame
        Train features, without the target columns.
    X_test : pandas.DataFrame
        Test features, one row per row of ``test_copy``.
    y : pandas.DataFrame
        ``log1p`` of the three target columns, row-aligned with ``X``.

    In ``X`` and ``X_test`` the "date_time" column is converted to seconds
    since the epoch.
    """
    target_columns = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]
    # Number of train rows put in front of the test set as context for the shifts
    n_context_rows = 24

    X = add_new_ml_features(train_copy.copy(), i)

    # Dropping the last row which is 2011-01-01 00:00:00
    if X.index[-1] == 7110:
        X.drop([7110], axis=0, inplace=True)

    # Resetting dataframe index
    X.reset_index(drop=True, inplace=True)

    # Adding the 24 train rows that precede the last one to the head of the test set
    # in order to get shifting feature values for the first test rows
    context_rows = train_copy.iloc[-(n_context_rows + 1):-1].drop(target_columns, axis=1)
    X_test_temp = pd.concat([context_rows, test_copy], axis=0)
    X_test_temp.reset_index(inplace=True, drop=True)
    X_test = add_new_ml_features(X_test_temp.copy(), i)
    # Deleting added train set rows
    X_test = X_test.iloc[n_context_rows:].reset_index(drop=True)

    y = np.log1p(X[target_columns])
    X.drop(target_columns, axis=1, inplace=True)

    # Timestamps become a plain numeric feature (seconds since the epoch)
    X['date_time'] = X['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9
    X_test['date_time'] = X_test['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9

    return X, X_test, y

In [None]:
%%time

# LightAutoML parameters
N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 2.9 * 3600 # Time in seconds for automl run

# Fixing parameters for better repeatability 
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Initializing and filling predictions dataframe before datetime conversion
preds = pd.DataFrame()
preds["date_time"] = test_copy["date_time"].copy()

train_preds_df = pd.DataFrame(index=np.arange(7110))
test_preds_df = pd.DataFrame(index=test.index)

feature_importances = []
for i, target in enumerate(targets):
    
    ROLES = {CategoryRole(force_input=True, ordinal=True): ["working_hours", "maximum_hours", "is_weekend"],
#              DatetimeRole(base_date=False, base_feats=True, seasonality=("d", "wd", "hour")): "date_time",
             "target": target}    
    X, X_test, y = prepare_dataset(pd.concat([train_copy, train_preds_df], axis=1), pd.concat([test_copy, test_preds_df], axis=1), i)
    display(X)
    
    model = TabularUtilizedAutoML(task = Task("reg", loss="rmsle", metric="rmsle"),
                                  verbose=1,
                                  timeout = TIMEOUT,
                                  cpu_limit = N_THREADS,
                                  reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
#                                   general_params = {'use_algos': [['lgb_tuned', 'cb_tuned'], ['lgb', 'cb_tuned']]}
                                  general_params = {'use_algos': [['lgb_tuned', 'cb_tuned', 'lgb', 'cb'], ['cb']]}
#                                   general_params = {'use_algos': [['cb']]}
                                 )

    oof_preds = model.fit_predict(pd.concat([X, y[target]], axis=1), roles = ROLES)

    print(f"{target} oof_score score is {np.sqrt(mean_squared_log_error(np.expm1(y[target].values), np.expm1(oof_preds.data)))}")
    preds[target] = np.expm1(model.predict(X_test).data)
    feature_importances.append(model.get_feature_scores('fast', silent=False))
    
    train_preds_df[target+"_preds"] = oof_preds.data
    test_preds_df[target+"_preds"] = np.log1p(preds[target])

# **Feature importances**

In [None]:
# Union of the feature names reported for each target, in case different datasets were used per target
feature_list = set()
for importance_table in feature_importances:
    feature_list |= set(importance_table["Feature"])
print(f"There are {len(feature_list)} unique features used for training: {feature_list}")

In [None]:
# Creating a sorted dataframe with all feature importances: one row per feature,
# one column per target. Columns start at 0.0, which stays for unused features.
fi_df = pd.DataFrame({"Feature": sorted(feature_list),
                      "Target1_imp": 0.0, "Target2_imp": 0.0, "Target3_imp": 0.0})
for i, fi in enumerate(feature_importances):
    # Share of each feature in the total importance reported for this target
    share = fi.set_index("Feature")["Importance"] / fi["Importance"].sum()
    fi_df["Target"+str(i+1)+"_imp"] = fi_df["Feature"].map(share).fillna(0)

fi_df["Overall_importance"] = fi_df["Target1_imp"] + fi_df["Target2_imp"] + fi_df["Target3_imp"]
fi_df = fi_df.sort_values("Overall_importance", ascending=False).reset_index(drop=True)

# Displaying original feature importance dataframes to quickly check target specific feature performance
for k in range(3):
    display(feature_importances[k].T)
    print("Importance sum", feature_importances[k]["Importance"].sum())

In [None]:
# Transposed view of the combined importance table (features as columns)
fi_df.T

In [None]:
# Displaying 10 least important features for each target
for k in range(3):
    display(feature_importances[k].tail(10).T)

In [None]:
# Horizontal bars: three bars (one per target) around every feature's tick
df = fi_df
x = np.arange(0, len(df["Feature"]))
height = 0.3

# (vertical offset from the tick, importance column, bar color, legend label)
bar_specs = [(-height, "Target1_imp", "cornflowerblue", target_names[0]),
             (0, "Target2_imp", "palevioletred", target_names[1]),
             (height, "Target3_imp", "mediumseagreen", target_names[2])]

fig, ax = plt.subplots(figsize=(12, 80))
for offset, column, color, label in bar_specs:
    ax.barh(x + offset, df[column], height=height,
            color=color,
            edgecolor="black",
            label=label)

ax.set_title("Feature importances", fontsize=20, pad=5)
ax.set_ylabel("Feature names", fontsize=15, labelpad=5)
ax.set_xlabel("Feature importance", fontsize=15, labelpad=5)
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=12)
ax.tick_params(axis="x", labelsize=12)
ax.grid(axis="x")
ax.legend(fontsize=13, loc="upper right", bbox_to_anchor=(0, 0, 1, 0.92))
plt.margins(0.04, 0.01)
# First row of the table is drawn at the top
plt.gca().invert_yaxis()

# **Predictions submission and comparison**

In [None]:
# Write the predictions (date_time plus one column per target) without the row index
preds.to_csv('submission.csv', index=False)
# Quick look at the first rows that were written
preds.head()

Let's compare predictions with the closest months from the train datasets.

In [None]:
# December of the train set against the January predictions, one panel per target
fig, axs = plt.subplots(ncols=1, nrows=3, figsize=(16, 8))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)

for i, target in enumerate(targets):
    december = train.loc[train["month"]==12, target].to_numpy()
    january = preds.loc[preds["date_time"].dt.month==1, target].to_numpy()
    # Both series are drawn against their own row count (hours since month start)
    axs[i].plot(np.arange(len(december)), december, label="Train, 12th month")
    axs[i].plot(np.arange(len(january)), january, label="Test, 1st month")
    axs[i].set_title(target_names[i], fontsize=15)
    axs[i].legend(fontsize=13)

In [None]:
# November of the train set against the January predictions, one panel per target
fig, axs = plt.subplots(ncols=1, nrows=3, figsize=(16, 8))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)

for i, target in enumerate(targets):
    november = train.loc[train["month"]==11, target].to_numpy()
    january = preds.loc[preds["date_time"].dt.month==1, target].to_numpy()
    # Both series are drawn against their own row count (hours since month start)
    axs[i].plot(np.arange(len(november)), november, label="Train, 11th month")
    axs[i].plot(np.arange(len(january)), january, label="Test, 1st month")
    axs[i].set_title(target_names[i], fontsize=15)
    axs[i].legend(fontsize=13)

In [None]:
# Start of the train period against the same calendar window one year later in
# the predictions, one panel per target
fig, axs = plt.subplots(ncols=1, nrows=3, figsize=(16, 8))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)

# The windows are cut by timestamp rather than by fixed row numbers, so that the
# same hour of the same calendar day sits at the same x position in both series
one_year = pd.DateOffset(years=1)
train_window = train.loc[train["date_time"] <= preds["date_time"].max() - one_year]
preds_window = preds.loc[preds["date_time"] >= train["date_time"].min() + one_year]

for i, target in enumerate(targets):
    axs[i].plot(np.arange(len(train_window)), train_window[target].to_numpy(),
                label="Train, from 10.3 to 4.4")
    axs[i].plot(np.arange(len(preds_window)), preds_window[target].to_numpy(),
                label="Test, from 10.3 to 4.4")
    axs[i].set_title(target_names[i], fontsize=15)
    axs[i].legend(fontsize=13)

As you can see, the predictions are the closest to the training set in the overlapping months (from March 10 to April 4). 