**Some ideas are from https://www.kaggle.com/andy6804tw/catboost-18feature-cross-validation
and https://www.kaggle.com/junhyeok99/automl-pycaret**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import IsolationForest
from sklearn.datasets import load_boston
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from numpy/pandas/sklearn.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to all figures created below.
plt.style.use('ggplot')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test tables.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# Report (rows, columns) of both tables.
for caption, frame in (('train shape:', train), ('test shape:', test)):
    print(caption, frame.shape)

In [None]:
# Column dtypes, non-null counts and memory usage of the training table.
# `info` is a method: without the call the cell only shows the bound-method repr.
train.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the test table.
# `info` is a method: without the call the cell only shows the bound-method repr.
test.info()

In [None]:
# Preview the first five training rows (5 is the default for head()).
train.head()

# Data Preprocessing
* There are only numeric columns

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept, so the test rows restart at 0.
all_data = pd.concat(objs=[train, test], axis=0)
all_data

Confirm if there are missing values

In [None]:
# Number of missing entries per column (the target columns are absent for test rows).
all_data.isna().sum()

In [None]:
# Parse the date_time column into pandas datetimes.
all_data['date_time'] = all_data['date_time'].pipe(pd.to_datetime)
# Candidate calendar features, currently disabled:
# all_data['year'] = all_data['date_time'].dt.year
# all_data['month'] = all_data['date_time'].dt.month
# all_data['week'] = all_data['date_time'].dt.week
# all_data['day'] = all_data['date_time'].dt.day
# all_data['dayofweek'] = all_data['date_time'].dt.dayofweek
# all_data["hour"] = all_data["date_time"].dt.hour
# all_data["working_hours"] =  all_data["hour"].isin(np.arange(8, 21, 1)).astype("int")
# all_data["is_weekend"] = (all_data["date_time"].dt.dayofweek >= 5).astype("int")

In [None]:
# Calendar-derived features, all computed from the parsed date_time column.
stamp = all_data["date_time"].dt
all_data["hour"] = stamp.hour
# 1 when the hour lies in 8..20 inclusive.
all_data["working_hours"] = all_data["hour"].between(8, 20).astype("int")
# dayofweek: Monday=0 ... Sunday=6, so >= 5 means Saturday or Sunday.
all_data["is_weekend"] = (stamp.dayofweek >= 5).astype("int")
# Minutes elapsed since midnight.
all_data['hr'] = stamp.hour * 60 + stamp.minute
# 1 on Saturdays only.
all_data['satday'] = (stamp.weekday == 5).astype("int")
# all_data["SMC"] = (all_data["absolute_humidity"] * 100) / all_data["relative_humidity"]

In [None]:
# add sensor shift
all_data["s1-6"] = all_data["sensor_1"] - all_data["sensor_1"].shift(periods=6, fill_value=0)
all_data["s2-6"] = all_data["sensor_2"] - all_data["sensor_2"].shift(periods=6, fill_value=0)
all_data["s3-6"] = all_data["sensor_3"] - all_data["sensor_3"].shift(periods=6, fill_value=0)
all_data["s4-6"] = all_data["sensor_4"] - all_data["sensor_4"].shift(periods=6, fill_value=0)
all_data["s5-6"] = all_data["sensor_5"] - all_data["sensor_5"].shift(periods=6, fill_value=0)
all_data.drop(columns = 'hour', inplace = True)

In [None]:
# Calendar month of every training row; used later as the CV group label.
months = all_data["date_time"].iloc[:len(train)].dt.month

In [None]:
# convert datetime to timestamp(s)
all_data['time'] = all_data['date_time'].astype(np.int64)//10**9
all_data.drop(columns = 'date_time', inplace = True)

In [None]:
# Preview the first ten engineered rows.
all_data.iloc[:10]

In [None]:
# Split the combined frame back into train/test parts and separate the targets.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
n_train = len(train)
train_part, test_part = all_data[:n_train], all_data[n_train:]

X = train_part.drop(columns=target_columns).values
y = train_part[target_columns]
# The models are fitted on log1p(target).
y_log = np.log1p(y)
X_test = test_part.drop(columns=target_columns).values

In [None]:
# Sanity check of the model input/output dimensions.
for caption, array in (('X_train shape:', X), ('y_train shape:', y), ('X_test shape:', X_test)):
    print(caption, array.shape)

# EDA

**Distribution of the features**
* The distribution of degrees Celsius shows a peak between 20 and 30 degrees. The distribution is similar to a Gaussian distribution.
* The relative humidity has a drop at 40% and two peaks at around 30% and 45%.
* The absolute humidity value shows a high value at 0.25 and peaks at around 1.
* The distribution of sensors_1,2,3 and 5 seems to be left skewed. While sensor-4 is normal and there are outliers with a value of 500.

In [None]:
# Histogram of each feature (train columns 1..8) on a 4x2 grid of axes,
# filled column by column: features 0-3 go left, features 4-7 go right.
cols = train.columns[1:9]
fig, ax = plt.subplots(4, 2, figsize=(12, 15))
for i, col in enumerate(cols):
    grid_col, grid_row = divmod(i, 4)
    sns.histplot(train[col], ax=ax[grid_row][grid_col]);

fig.tight_layout()
plt.show()

**Distribution of the targets**
* The distribution of all target values is left skewed.

In [None]:
# One histogram per target (train columns 9..11), stacked vertically.
fig, ax = plt.subplots(3, 1, figsize=(12, 15))
for axis, col in zip(ax, train.columns[9:12]):
    sns.histplot(train[col], ax=axis);

fig.tight_layout()
plt.show()

**Feature Heatmap**
* The heat map shows that sensors 1 to 5 are influential features.
* deg_c and relative_humidity have some relations.

In [None]:
# Correlation heatmap of the raw training table.
# Only numeric columns are correlated: `date_time` is still a string column at
# this point and would make DataFrame.corr() raise on recent pandas versions.
plt.figure(figsize=(12,12))
sns.heatmap(train.select_dtypes(include='number').corr());

In [None]:
# Correlation heatmap of the engineered train+test table.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(all_data.corr(), ax=ax);

**Trend anayisi**
* trend of the target across time

In [None]:
# Parse the timestamps of the raw tables so they can be used as a time axis.
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y-%m-%d %H:%M:%S')

targets = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]
target_names = ["Carbon monoxide", "Benzene", "Nitrogen oxides"]
colors = ['blue', 'red', 'orange']

# One panel per target: its value over the whole training period.
fig, ax = plt.subplots(3, 1, figsize=(18, 24))
for i, (axis, column, name, color) in enumerate(zip(ax, targets, target_names, colors)):
    axis.plot(train["date_time"], train[column], color=color)
    axis.set_title(f"{name} (target #{i+1}) trend across time", fontsize=20, pad=5)
    axis.set_ylabel(f"{name}", fontsize=14, labelpad=5)
    axis.set_xlabel("Date", fontsize=14, labelpad=5)
    axis.grid(axis="both")

Comparsion between train data and test data
* The feature data trend across time

In [None]:
all_features = ["deg_C", "relative_humidity", "absolute_humidity", "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]
all_feature_names = ["Temperature (deg. C)", "Relative humidity", "Absolute humidity", "Sensor 1", "Sensor_2", "Sensor 3", "Sensor 4", "Sensor 5"]

fig, ax = plt.subplots(8, 1, figsize=(16, 30))
plt.subplots_adjust(hspace=0.4)

# Thick proxy lines so the legend swatches are readable; identical for every panel.
legend_lines = [Line2D([0], [0], color='orange', lw=10),
                Line2D([0], [0], color="black", lw=10)]

# One panel per feature: train period in orange, test period in black.
for axis, column, pretty_name in zip(ax, all_features, all_feature_names):
    axis.plot(train["date_time"], train[column], color='orange', label="Train data")
    axis.plot(test["date_time"], test[column], color="black", label="Test data")
    axis.set_title(f"{pretty_name} levels across time", fontsize=20, pad=5)
    axis.set_ylabel(f"{pretty_name} level", fontsize=14, labelpad=5)
    axis.set_xlabel("Date", fontsize=14, labelpad=5)
    axis.legend(legend_lines, ["Train data", "Test data"], fontsize=12, loc=1)
    axis.grid(axis="both")

# Train Model

In [None]:
# Sets of hyperparameters optimized by Optuna for each target
cb_params = [
                {'learning_rate': 0.010169009412219588,
                 'l2_leaf_reg': 8.908337085912136,
                 'bagging_temperature': 8.384477224270551,
                 'random_strength': 1.950237493637981,
                 'depth': 6,
                 'grow_policy': 'Lossguide',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.166394867169309,
                 'l2_leaf_reg': 8.704675157564441,
                 'bagging_temperature': 3.340826164726799,
                 'random_strength': 1.538518016574368,
                 'depth': 3,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.028141156076957437,
                 'l2_leaf_reg': 3.116523267336638,
                 'bagging_temperature': 4.420661209459851,
                 'random_strength': 1.8011752694610028,
                 'depth': 6,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
            ]

In [None]:
# Start from the sample submission; its target columns are overwritten during training.
preds = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jul-2021/sample_submission.csv'
)

In [None]:
%%time
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import LeaveOneGroupOut
from catboost import CatBoostRegressor

all_fi = []
splits = 10
target_names=y_log.columns

for i, target in enumerate(target_names):
    print(f"\nTraining for {target}...")
    logo = LeaveOneGroupOut()
    oof_preds = np.zeros((X.shape[0],))
    model_preds = 0
    model_fi = 0
    for num, (train_idx, valid_idx) in enumerate(logo.split(X, y_log, months)):
        X_train, X_valid = X[[train_idx]], X[[valid_idx]]
        y_train, y_valid = y_log.loc[train_idx, target], y_log.loc[valid_idx, target]
        model = CatBoostRegressor(random_state=42,
                                 thread_count=4,
                                 verbose=False,
                                 loss_function='RMSE',
                                 eval_metric='RMSE',
                                 od_type="Iter",
                                 early_stopping_rounds=500,
                                 use_best_model=True,
                                 iterations=10000,
                                 task_type="CPU",
                                 **cb_params[i])
        
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  verbose=False)
        model_preds += np.expm1(model.predict(X_test)) / splits
        model_fi += model.feature_importances_
        oof_preds[valid_idx] = np.expm1(model.predict(X_valid))
        print(f"Fold {num} RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_valid), oof_preds[valid_idx]))}")
    print(f"\nOverall RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_log[target]), oof_preds))}")    
    preds[target] = model_preds
    all_fi.append(dict(zip(all_data.columns, model_fi)))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import LeaveOneGroupOut
from catboost import CatBoostRegressor

# Linear-regression baseline evaluated with leave-one-month-out cross-validation.
# It only prints scores. `all_fi` is deliberately NOT reset here, because the
# feature-importance section below plots the importances collected earlier.
target_names = y_log.columns

logo = LeaveOneGroupOut()
# Real number of folds: one per distinct value in `months`.
splits = logo.get_n_splits(groups=months)

for i, target in enumerate(target_names):
    print(f"\nTraining for {target}...")
    oof_preds = np.zeros((X.shape[0],))
    # Fold-averaged test prediction of the baseline (not written to `preds`).
    model_preds = 0
    for num, (train_idx, valid_idx) in enumerate(logo.split(X, y_log, months)):
        # The split yields positional indices: index rows with the arrays directly.
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y_log[target].iloc[train_idx], y_log[target].iloc[valid_idx]
        model1 = LinearRegression()
        model1.fit(X_train, y_train)
        model_preds += np.expm1(model1.predict(X_test)) / splits
        # RMSLE needs non-negative predictions, so clip at zero.
        oof_preds[valid_idx] = np.maximum(np.expm1(model1.predict(X_valid)), 0)
        print(f"Fold {num} RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_valid), oof_preds[valid_idx]))}")
    print(f"\nOverall RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_log[target]), oof_preds))}")    

# Feature importance

In [None]:
# Creating feature list from feature importance dictionaries
feature_list = set()
for i in np.arange(len(all_fi)):
    feature_list = set.union(feature_list, set(all_fi[i].keys()))
print(f"There are {len(feature_list)} unique features used for training: {feature_list}")

In [None]:
# Combining feature importances of different models into one dataframe
df = pd.DataFrame(columns=["Feature"])
df["Feature"] = list(feature_list)
for i in np.arange(len(all_fi)):
    for key in all_fi[i].keys():
        df.loc[df["Feature"] == key, "Importance_" + str(i+1)] = all_fi[i][key] / 1000
df.fillna(0, inplace=True)
df.sort_values("Importance_1", axis=0, ascending=False, inplace=True)

In [None]:
# Grouped horizontal bars: three bars per feature, one per target model.
x = np.arange(0, len(df["Feature"]))
height = 0.3

fig, ax = plt.subplots(figsize=(12, 9))
# (vertical offset, importance column, bar colour, legend label) for each target.
bar_specs = [
    (-height, "Importance_1", "cornflowerblue", target_names[0]),
    (0, "Importance_2", "palevioletred", target_names[1]),
    (height, "Importance_3", "mediumseagreen", target_names[2]),
]
bars1, bars2, bars3 = [
    ax.barh(x + offset, df[column], height=height,
            color=colour,
            edgecolor="black",
            label=label)
    for offset, column, colour, label in bar_specs
]
ax.set_title("Feature importances", fontsize=20, pad=5)
ax.set_ylabel("Feature names", fontsize=15, labelpad=5)
ax.set_xlabel("Feature importance", fontsize=15, labelpad=5)
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=12)
ax.tick_params(axis="x", labelsize=12)
ax.grid(axis="x")
ax.legend(fontsize=13, loc="lower right")
ax.margins(0.04, 0.01)
# Put the first row of df (largest Importance_1) at the top of the chart.
ax.invert_yaxis()

# Prediction

In [None]:
# Preview the first five rows of the submission table.
preds.head(5)

In [None]:
# Write the submission file to the working directory, without the row index.
preds.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Observed target values of the training period, for comparison with the predictions.
targets = train.loc[:, ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]]

In [None]:
# Parse the submission timestamps so predictions can be filtered by month.
preds['date_time'] = preds['date_time'].pipe(pd.to_datetime)

In [None]:
# Compare the last training month (December) with the predictions for the
# first test month (January), one panel per target.
fig, ax = plt.subplots(3,1, figsize=(16, 8))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace = 0.3)

is_train_december = train["date_time"].dt.month == 12
is_pred_january = preds["date_time"].dt.month == 1

for i, target in enumerate(targets.columns):
    observed = targets.loc[is_train_december, target].to_numpy()
    predicted = preds.loc[is_pred_january, target].to_numpy()
    # Size each x-axis from the data itself: a fixed length of 744 raises
    # whenever either month has a different number of rows.
    ax[i].plot(np.arange(len(observed)), observed, label="Train, 12th month")
    ax[i].plot(np.arange(len(predicted)), predicted,
                label="Preds, 1th month")
    ax[i].set_title(target_names[i], fontsize=15)
    ax[i].legend(fontsize=13)