In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the competition's training set, test set and sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
        '../input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

Before modelling, I look at summary statistics and plot the distribution of every feature and target, to see what kind of distributions we are dealing with and what scale each column is on:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns,
# used to compare the scales of the different features and targets.
train.describe()

In [None]:
# Distribution of the `deg_C` column: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['deg_C'], kde=True, ax=ax)
ax.set(xlabel='Degrees In Celsius', ylabel='Frequency')

In [None]:
# Distribution of the `relative_humidity` column: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['relative_humidity'], kde=True, ax=ax)
ax.set(xlabel='Relative Humidity', ylabel='Frequency')

In [None]:
# Distribution of the `absolute_humidity` column: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['absolute_humidity'], kde=True, ax=ax)
ax.set(xlabel='Absolute Humidity', ylabel='Frequency')

In [None]:
# Distribution of the `sensor_1` readings: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['sensor_1'], kde=True, ax=ax)
ax.set(xlabel='Sensor 1', ylabel='Frequency')

In [None]:
# Distribution of the `sensor_2` readings: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['sensor_2'], kde=True, ax=ax)
ax.set(xlabel='Sensor 2', ylabel='Frequency')

In [None]:
# Distribution of the `sensor_3` readings: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['sensor_3'], kde=True, ax=ax)
ax.set(xlabel='Sensor 3', ylabel='Frequency')

In [None]:
# Distribution of the `sensor_4` readings: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['sensor_4'], kde=True, ax=ax)
ax.set(xlabel='Sensor 4', ylabel='Frequency')

In [None]:
# Distribution of the `sensor_5` readings: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['sensor_5'], kde=True, ax=ax)
ax.set(xlabel='Sensor 5', ylabel='Frequency')

In [None]:
# Distribution of the `target_carbon_monoxide` target: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['target_carbon_monoxide'], kde=True, ax=ax)
ax.set(xlabel='Carbon Monoxide', ylabel='Frequency')

In [None]:
# Distribution of the `target_benzene` target: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['target_benzene'], kde=True, ax=ax)
ax.set(xlabel='Benzene', ylabel='Frequency')

In [None]:
# Distribution of the `target_nitrogen_oxides` target: histogram with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train['target_nitrogen_oxides'], kde=True, ax=ax)
ax.set(xlabel='Nitrogen Oxides', ylabel='Frequency')

The code below is adapted from [the seaborn docs](https://seaborn.pydata.org/examples/many_pairwise_correlations.html). It shows the pairwise correlations between the columns of the training set.

In [None]:
# Lower-triangle heatmap of the pairwise correlations in the training set.
# Only numeric columns are correlated: `train` still contains `date_time`, and
# recent pandas releases raise on non-numeric columns instead of skipping them.
corr = train.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
# This is the only figure created in the cell; an additional bare
# `plt.figure(...)` call would just leave an empty figure in the output.
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# NOTE(review): vmax=.3 caps the colour scale, so all correlations above 0.3
# are drawn in the same colour; raise it if stronger correlations must be told apart.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Based on the histogram plots of all of the features (including the targets), we can see that most of them are roughly bell-shaped, i.e. approximately Gaussian: the vast majority of the data (about 99.5%) falls within 3 standard deviations of the mean, against 99.7% for an exactly Gaussian distribution. However, it is important to note that the columns are on very different scales (e.g. the sensor features range from roughly 500 to 1200, while humidity lies between 0.25 and 2.25). Therefore, normalizing the data will be an important preprocessing step.

In [None]:
# The three pollutant columns are the prediction targets; `date_time` is not
# used as a feature. Everything else becomes the model input, as plain arrays.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
non_feature_columns = ['date_time'] + target_columns

train_features = train.drop(columns=non_feature_columns).to_numpy()
test_features = test.drop(columns='date_time').to_numpy()
targets = train[target_columns].to_numpy()

In [None]:
# Report the number of rows (samples) and columns (features) of both feature arrays.
shape_report = (
    f'The train set has {train_features.shape[0]} samples, and {train_features.shape[1]} features',
    f'The test set has {test_features.shape[0]} samples, and {test_features.shape[1]} features',
)
print(*shape_report, sep='\n')

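The competition metric is the column-wise average of the root mean squared logarithmic error (RMSLE) over the three targets:

$$\text{CRMSLE} = \frac{1}{3}\sum_{j=1}^{3}\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log(1+\hat{y}_{ij}) - \log(1+y_{ij})\right)^2}$$

The following is an implementation in TensorFlow. It can be used directly as the loss when compiling a neural network, e.g. `model.compile(optimizer='adam', loss=CRMSLE)`.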

In [None]:
def CRMSLE(truth, pred):
    """Column-wise mean of the root mean squared logarithmic error.

    The Keras ``MeanSquaredLogarithmicError`` loss is evaluated separately on
    each of the first three columns, its square root is taken, and the three
    per-column values are averaged.

    Args:
        truth: 2-D array/tensor of true values, indexable as ``truth[:, j]``
            for ``j`` in 0..2.
        pred: 2-D array/tensor of predictions with the same layout as ``truth``.

    Returns:
        The average of the three per-column RMSLE values, of whatever scalar
        type the Keras loss returns.
    """
    msle = tf.keras.losses.MeanSquaredLogarithmicError()
    # One RMSLE per target column, kept in column order.
    per_column = [msle(truth[:, col], pred[:, col]) ** 0.5 for col in range(3)]
    return sum(per_column) / 3

I don't believe LightGBM supports multi-output losses, so you can use scikit-learn's `MultiOutputRegressor` class, which wraps a LightGBM regressor and fits one copy of it per target. The following sets it up:

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import normalize
import lightgbm
import xgboost
import catboost
# LightGBM regressor with default settings, wrapped so it can be fitted on
# several target columns at once; n_jobs=-1 is passed to the wrapper.
base_regressor = lightgbm.LGBMRegressor()
lgb = MultiOutputRegressor(estimator=base_regressor, n_jobs=-1)

The following is the standard blending technique, using a decision tree, LightGBM, and a linear regression model as base learners. Before training I normalize the data (without normalizing, the LB score is 47.93549; with normalizing it is 45.15300). A ridge regressor is then fit on the blended predictions.

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Base learners (used together with the `lgb` LightGBM wrapper) and the Ridge
# meta-learner that blends their predictions.
# The tree is seeded so that re-running the cell reproduces the same model.
dt = DecisionTreeRegressor(random_state=2021)
lr = LinearRegression()
rd = Ridge()
kfold = KFold(n_splits=10, random_state=2021, shuffle=True)

# Per-fold predictions of every base model on the held-out rows and on the test set.
lgb_valid_preds = []
lgb_test_preds = []

dt_valid_preds = []
dt_test_preds = []

lr_valid_preds = []
lr_test_preds = []
# Blended test-set predictions, one array per meta-learner fit.
blend_preds = []

# NOTE(review): with its default arguments sklearn's `normalize` rescales each
# *row* (sample) to unit L2 norm; it does not bring the individual feature
# columns onto a common scale. Kept as-is; confirm this is the intended step.
train_features = pd.DataFrame(normalize(train_features))
test_features = pd.DataFrame(normalize(test_features))
targets = pd.DataFrame(targets)
y_all = targets.to_numpy()

# ---- Stage 1: out-of-fold predictions of the base models -------------------
# Every training row receives predictions from base models that were fitted
# without it, so the meta-learner is trained on honest inputs.
n_base_models = 3
blend_train = np.zeros((len(train_features), n_base_models * y_all.shape[1]))
test_blend_folds = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_features, targets)):

    print('*' * 15, f'Fold {fold+1}', '*' * 15)

    X_train, X_valid = train_features.iloc[train_idx], train_features.iloc[valid_idx]
    y_train = y_all[train_idx]

    lgb.fit(X_train, y_train)
    lgb_valid_preds.append(lgb.predict(X_valid))
    lgb_test_preds.append(lgb.predict(test_features))

    dt.fit(X_train, y_train)
    dt_valid_preds.append(dt.predict(X_valid))
    dt_test_preds.append(dt.predict(test_features))

    lr.fit(X_train, y_train)
    lr_valid_preds.append(lr.predict(X_valid))
    lr_test_preds.append(lr.predict(test_features))

    # Columns: LightGBM predictions, then decision tree, then linear regression.
    blend_train[valid_idx] = np.c_[lgb_valid_preds[-1], dt_valid_preds[-1], lr_valid_preds[-1]]
    test_blend_folds.append(np.c_[lgb_test_preds[-1], dt_test_preds[-1], lr_test_preds[-1]])

# Test-set inputs of the meta-learner: base-model predictions averaged over folds.
blend_test = np.mean(test_blend_folds, axis=0)

# ---- Stage 2: Ridge meta-learner --------------------------------------------
# The Ridge is fitted on the out-of-fold predictions of all folds but one and
# scored on the remaining fold, so the reported CRMSLE is always computed on
# rows the meta-learner was not fitted on.
for fold, (meta_train_idx, meta_valid_idx) in enumerate(kfold.split(blend_train)):
    rd.fit(blend_train[meta_train_idx], y_all[meta_train_idx])
    y_valid = y_all[meta_valid_idx]
    # CRMSLE takes (truth, pred); float() turns the returned scalar tensor
    # into a plain number so it prints cleanly.
    fold_score = float(CRMSLE(y_valid, rd.predict(blend_train[meta_valid_idx])))
    blend_preds.append(rd.predict(blend_test))
    print(f'Fold {fold+1} Ridge Blend CRMSLE: {fold_score:.5f}')

In [None]:
# Average the blended test-set predictions collected in `blend_preds`.
mean_blend = sum(blend_preds) / len(blend_preds)
# The blend is a linear model and can produce negative values, which are not
# valid under a log(1 + prediction) metric; since the targets are
# concentrations (non-negative), clipping at zero can only reduce the error.
submission[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']] = np.clip(mean_blend, 0, None)
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the submission frame as the cell output, as a final visual check of the
# predictions that were just written to `submission.csv`.
submission