## Hi everyone!

After checking out all of the top notebooks in this competition (as of July 2nd), I wanted to see whether a simple neural network could beat the best machine-learning model I found.

Check out maksymshkliarevskyi's kernel for a great EDA walkthrough and a very strong baseline model using XGBRegressor!

https://www.kaggle.com/maksymshkliarevskyi/tps-july-eda-baseline-analysis-xgbregressor

The artificial neural network could not beat maksymshkliarevskyi's model, but got somewhat close.

Don't forget to upvote!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


# Pick a seaborn-like style sheet that exists in the installed matplotlib.
# The bare 'seaborn' name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed, so fall back to the new name when the
# old one is not listed.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

# rcParams overrides applied to every figure drawn below.
figure = {'dpi': '200'}
font = {'family': 'fantasy'}
grid = {'linestyle': ':', 'alpha': .9}
axes = {'titlecolor': 'black', 'titlesize': 20, 'titleweight': 'bold',
        'labelsize': 12, 'labelweight': 'bold'}

plt.rc('font', **font)
plt.rc('figure', **figure)
plt.rc('grid', **grid)
plt.rc('axes', **axes)

In [None]:
# Locations of the competition's raw train / test tables.
TRAIN_CSV = '../input/tabular-playground-series-jul-2021/train.csv'
TEST_CSV = '../input/tabular-playground-series-jul-2021/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Convert the raw timestamp column to datetime64 so the `.dt` accessor works.
train = train.assign(date_time = pd.to_datetime(train['date_time']))

In [None]:
# (rows, columns) of the training frame at this point in the notebook.
train.shape

In [None]:
# Calendar features derived from the timestamp.
train['dayofweek'] = train['date_time'].dt.dayofweek
train['dayofyear'] = train['date_time'].dt.dayofyear
# `Series.dt.week` is deprecated and removed in pandas 2.0; the ISO calendar
# week is its replacement. Cast back to int64 because isocalendar() returns
# the nullable UInt32 dtype.
train['week'] = train['date_time'].dt.isocalendar().week.astype('int64')
train['month'] = train['date_time'].dt.month
train['year'] = train['date_time'].dt.year

In [None]:
# (rows, columns) again; compare with the earlier shape to see the added columns.
train.shape

In [None]:
# Move the timestamp into the index so it is no longer a feature column.
# NOTE: not idempotent - re-running this cell alone raises KeyError because
# 'date_time' is no longer a column.
train = train.set_index('date_time')

In [None]:
# Pairwise correlations of every column (features, targets, calendar fields).
corr = train.corr()
# Boolean upper triangle (diagonal included) - those cells are hidden so each
# pair is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype = bool))

fig, ax = plt.subplots(figsize = (10, 10))
ax.set_title('Correlation matrix')
sns.heatmap(corr, mask = mask, cmap = 'magma', linewidths = .5, ax = ax)
plt.show()

In [None]:
# One stacked subplot per column of `train`, plotted against the datetime index.
train.plot(figsize = (11, 25), subplots = True, linewidth = 0.8)
# Blank out the x-axis label on the current axes.
plt.xlabel('')
plt.show()

In [None]:
# Names of the three columns to predict; their order fixes the column order of `y`.
target_name = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [None]:
# Features are everything except the three targets; targets keep the order
# given by `target_name`.
X = train.drop(columns = target_name)
y = train.loc[:, target_name]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation. Shuffling is disabled
# (shuffle = False), so the rows are split without reordering.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size = 0.2,
                                                      random_state = 0,
                                                      shuffle = False)

## maksymshkliarevskyi's XGBRegressor

In [None]:
from xgboost import XGBRegressor

# The basic model: one shared hyper-parameter set for all three regressors.
params = dict(n_estimators = 400,
              subsample = 0.8,
              max_depth = 8,
              learning_rate = 0.05,
              n_jobs = -1,
              colsample_bytree = 0.8,
              reg_alpha = 0.1,
              reg_lambda = 0.1,
              random_state = 0)

# Fit an independent regressor per target column (positional order of y_train).
model1, model2, model3 = [
    XGBRegressor(**params).fit(X_train, y_train.iloc[:, i])
    for i in range(3)
]

In [None]:
from sklearn.metrics import mean_squared_log_error

In [None]:
# Validation predictions of the three XGB models, in target order.
y_pred1, y_pred2, y_pred3 = (m.predict(X_valid) for m in (model1, model2, model3))

# Report the root-mean-squared-log-error of each target, rounded to 4 decimals.
for i, preds in enumerate((y_pred1, y_pred2, y_pred3)):
    rmsle = np.sqrt(mean_squared_log_error(y_valid.iloc[:, i], preds))
    print('RMSLE ({}): {}'.format(target_name[i], round(rmsle, 4)))

## Neural network testing

In [None]:
import tensorflow as tf

In [None]:
# define baseline model
def baseline_ann_model(hidden_dim = 128, input_dim = None):
    """Build and compile the baseline fully connected regression network.

    Architecture: two ReLU hidden layers of `hidden_dim` units each (the
    first one He-uniform initialised) followed by a 3-unit linear output
    layer, one unit per target. The model is compiled with the Adam
    optimiser and the mean-squared-logarithmic-error loss.

    Parameters
    ----------
    hidden_dim : int, default 128
        Number of units in each of the two hidden layers.
    input_dim : int or None, default None
        Number of input features. When None, it falls back to the width of
        the notebook-level feature frame `X` (the original behaviour), so
        existing calls keep working.

    Returns
    -------
    tf.keras.models.Sequential
        The compiled, untrained model.
    """
    if input_dim is None:
        # Backward-compatible default: read the feature count from the global X.
        input_dim = X.shape[1]

    ann = tf.keras.models.Sequential()
    ann.add(tf.keras.layers.Dense(hidden_dim, input_dim = input_dim, kernel_initializer='he_uniform', activation='relu'))
    ann.add(tf.keras.layers.Dense(hidden_dim, activation='relu'))

    # Linear head: one unbounded output per target column.
    ann.add(tf.keras.layers.Dense(3, activation = 'linear'))
    ann.compile(optimizer = 'adam', loss = 'mean_squared_logarithmic_error')
    return ann

In [None]:
# Instantiate a small network (8 units per hidden layer) and print its layer table.
ann = baseline_ann_model(hidden_dim = 8)
ann.summary()

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the standardisation statistics from the training fold only, then apply them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Train for 200 epochs with batches of 16 rows; 30% of the supplied training
# rows are held out (validation_split) so `history` also records val_loss.
history = ann.fit(X_train_scaled, y_train, epochs = 200, batch_size = 16, verbose = 1, validation_split = 0.3)

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Apply the statistics already learned from X_train. Re-fitting the scaler on
# the validation fold (fit_transform) would standardise it with its own
# mean/variance, so the network would see inputs on a different scale from
# the one it was trained on.
X_valid_scaled = scaler.transform(X_valid)

In [None]:
# Predict all three targets at once and score them jointly.
y_pred = ann.predict(X_valid_scaled)
overall_rmsle = np.sqrt(mean_squared_log_error(y_valid, y_pred))
print('RMSLE baseline: {}'.format(round(overall_rmsle, 4)))

In [None]:
# Wrap the raw prediction array in a frame that shares y_valid's column names
# and index, so predictions can be addressed like the true targets.
y_pred = pd.DataFrame(y_pred, columns = y_valid.columns, index = y_valid.index)

In [None]:
# Per-target RMSLE of the network, matched by column position.
for idx, name in enumerate(target_name):
    score = np.sqrt(mean_squared_log_error(y_valid.iloc[:, idx], y_pred.iloc[:, idx]))
    print('RMSLE ({}): {}'.format(name, round(score, 4)))

In [None]:
# Calendar day (as 'YYYY/MM/DD' text) of every validation row.
date = X_valid.reset_index()['date_time'].dt.strftime('%Y/%m/%d')

# Assemble true values and predictions side by side, matched by column position.
true_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
pred_cols = ['preds_carbon_monoxide', 'preds_benzene', 'preds_nitrogen_oxides']

frame = {'date': date}
for pos, col in enumerate(true_cols):
    frame[col] = y_valid.iloc[:, pos].to_numpy()
for pos, col in enumerate(pred_cols):
    frame[col] = y_pred.iloc[:, pos].to_numpy()

# Average every column within each day.
valid_preds = pd.DataFrame(frame).groupby('date').mean()

In [None]:
# Daily mean carbon monoxide: ground truth (blue) against the network (red).
fig, ax = plt.subplots(figsize = (15, 5))
valid_preds['target_carbon_monoxide'].plot(ax = ax, color = 'blue', label = 'true')
valid_preds['preds_carbon_monoxide'].plot(ax = ax, color = 'red', label = 'preds')
ax.legend()
ax.set_xlabel('')
plt.show()

In [None]:
# Daily mean benzene: ground truth (blue) against the network (red).
fig, ax = plt.subplots(figsize = (15, 5))
valid_preds['target_benzene'].plot(ax = ax, color = 'blue', label = 'true')
valid_preds['preds_benzene'].plot(ax = ax, color = 'red', label = 'preds')
ax.legend()
ax.set_xlabel('')
plt.show()

In [None]:
# Daily mean nitrogen oxides: ground truth (blue) against the network (red).
fig, ax = plt.subplots(figsize = (15, 5))
valid_preds['target_nitrogen_oxides'].plot(ax = ax, color = 'blue', label = 'true')
valid_preds['preds_nitrogen_oxides'].plot(ax = ax, color = 'red', label = 'preds')
ax.legend()
ax.set_xlabel('')
plt.show()

In [None]:
# Convert the test timestamps to datetime64 so the `.dt` accessor works.
test = test.assign(date_time = pd.to_datetime(test['date_time']))

In [None]:
# Same calendar features as the training frame, derived from the timestamp.
test['dayofweek'] = test['date_time'].dt.dayofweek
test['dayofyear'] = test['date_time'].dt.dayofyear
# `Series.dt.week` is deprecated and removed in pandas 2.0; the ISO calendar
# week is its replacement. Cast back to int64 because isocalendar() returns
# the nullable UInt32 dtype.
test['week'] = test['date_time'].dt.isocalendar().week.astype('int64')
test['month'] = test['date_time'].dt.month
test['year'] = test['date_time'].dt.year

In [None]:
# The raw timestamp is not a model input; keep only the feature columns.
test = test.drop(columns = 'date_time')

In [None]:
# Load the submission template with its timestamp column as the index, leaving
# only the target columns as regular columns.
submission = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv',
                         index_col = 'date_time')

In [None]:
# Retrain a fresh network on the full labelled data for the submission.
ann = baseline_ann_model(hidden_dim = 8)
# Re-fit the scaler on all training rows, since the model now sees all of them.
X_scaled = scaler.fit_transform(X)
history = ann.fit(X_scaled, y, epochs = 200, batch_size = 16, verbose = 1)
# Only transform the test set: re-fitting the scaler here would standardise it
# with its own statistics, giving the network inputs on a different scale from
# the one it was trained on. Selecting X.columns keeps the feature order
# identical to training.
test_scaled = scaler.transform(test[X.columns])
predictions = ann.predict(test_scaled)
# Label the raw output with the submission template's columns and index.
predictions = pd.DataFrame(predictions, columns = submission.columns, index = submission.index)

In [None]:
# Turn the index back into a regular column so it is written to the CSV.
# NOTE: not idempotent - re-running this cell adds an extra 'index' column.
predictions = predictions.reset_index()
predictions.head()

In [None]:
# Write the submission file; the (now default) integer index is not written.
predictions.to_csv('submission.csv', index = False)