## **Tabular Playground Series - Jul 2021**
A Kaggle competition for predicting air pollution in a city from various sensor readings.

### **Reading Data**

In [None]:
import pandas as pd
import numpy as np
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Load the competition's train and test sets from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# Preview the last rows of the training set.
train.tail()

In [None]:
# Preview the first rows of the test set.
test.head()

### **Data Manipulation**

In [None]:
# Tag every row with its origin (0 = train, 1 = test) so the two sets can be
# engineered together and separated again afterwards.
train['is_test'], test['is_test'] = 0, 1

# Stack train on top of test. The index is not reset, so each part keeps the
# row labels it had before concatenation.
all_data = pd.concat([train, test])

### **Quick Data Analysis**

In [None]:
# Column dtypes and non-null counts of the combined train + test frame.
all_data.info()

In [None]:
# Display the combined train + test frame.
all_data

In [None]:
# Summary statistics for the columns of the combined frame.
all_data.describe()

In [None]:
# Pairwise correlations rendered as a colour-graded table.
# Only numeric columns are kept: `date_time` is still a string here, and
# recent pandas versions raise instead of silently skipping non-numeric
# columns in `corr()`.
all_data.select_dtypes(include='number').corr().style.background_gradient()

### **Feature Engineering**

In [None]:
# Derive day of week (0 = Monday ... 6 = Sunday) and hour of day from date_time.
# The column is parsed once and the parsed result is reused for both features.
parsed_date_time = pd.to_datetime(all_data['date_time'])
all_data['weekday'] = parsed_date_time.dt.dayofweek
all_data['hour'] = parsed_date_time.dt.hour

In [None]:
# Mean target_nitrogen_oxides for each hour of the day, drawn as a bar chart.
no_per_hour = (
    all_data
    .groupby(['hour'])['target_nitrogen_oxides']
    .agg(['mean'])
)
no_per_hour.plot.bar(figsize=(10, 4), title='Mean Nitrogen Oxides per hour');

In [None]:
# Group hours into three 0/1 indicator columns (quiet_hour, sleep_hour,
# rush_hour), based on target_nitrogen_oxides levels.
# `isin` is evaluated on the whole column at once instead of calling a Python
# lambda per row; the cast to int64 keeps the flags as 0/1 integers.

all_data['quiet_hour'] = all_data['hour'].isin([22, 23, 0, 1, 6]).astype('int64')

all_data['sleep_hour'] = all_data['hour'].isin([2, 3, 4, 5]).astype('int64')

# The raw hour column is overwritten with the rush-hour flag and then renamed,
# so the original hour values are no longer available after this cell.
all_data['hour'] = all_data['hour'].isin([8, 9, 10, 11, 17, 18, 19, 20]).astype('int64')

all_data = all_data.rename(columns={'hour': 'rush_hour'})

In [None]:
# Mean target_nitrogen_oxides per day-of-week code, laid out as one wide row.
(
    all_data
    .groupby(['weekday'])['target_nitrogen_oxides']
    .agg(['mean'])
    .T
)

In [None]:
# 0/1 indicator columns for specific days (weekday codes: 0 = Monday,
# 5 = Saturday, 6 = Sunday). Comparisons are vectorised rather than applied
# row by row; the cast to int64 keeps the flags as 0/1 integers.

all_data['saturday'] = all_data['weekday'].eq(5).astype('int64')

all_data['monday'] = all_data['weekday'].eq(0).astype('int64')

# The raw weekday column is overwritten with the Sunday flag and then renamed,
# so the original weekday codes are no longer available after this cell.
all_data['weekday'] = all_data['weekday'].eq(6).astype('int64')

all_data = all_data.rename(columns={'weekday': 'sunday'})

In [None]:
# Numeric version of date_time: seconds elapsed since the Unix epoch.
all_data['time_num'] = (
    all_data['date_time']
    .astype('datetime64[ns]')   # parse to nanosecond-resolution timestamps
    .astype(np.int64)           # integer nanoseconds since the epoch
    .div(10 ** 9)               # nanoseconds -> seconds
)

In [None]:
# smc: absolute humidity expressed as a percentage ratio of relative humidity.
# NOTE(review): the meaning of the abbreviation "smc" is not stated in this notebook.
all_data['smc'] = (
    all_data['absolute_humidity']
    .mul(100)
    .div(all_data['relative_humidity'])
)

In [None]:
# Difference features. `shift` moves values by a number of ROWS, not by
# calendar days, so each feature is "current value minus the value N rows
# earlier" in the combined (train followed by test) frame.

# change in absolute_humidity versus 3 rows earlier
all_data['abs_humidity_lag3'] = all_data['absolute_humidity'] - all_data['absolute_humidity'].shift(periods=3)
# change in deg_C versus 12 rows earlier
all_data['deg_lag12'] = all_data['deg_C'] - all_data['deg_C'].shift(periods=12)

# Drop the first 12 rows, for which the 12-row difference is undefined (NaN).
# `.copy()` makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
# Note: re-running this cell drops another 12 rows.
all_data = all_data.iloc[12:].copy()

In [None]:
# Replace each target column with log(1 + value); predictions made on this
# scale have to be mapped back before submission.
for target in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'):
    all_data[target] = all_data[target].apply(np.log1p)

In [None]:
# Model inputs: every column except the timestamp, the three targets and the
# train/test marker.
features = [
    name for name in all_data.columns
    if name not in {
        'date_time', 'target_carbon_monoxide', 'target_benzene',
        'target_nitrogen_oxides', 'is_test'}
]

# Columns kept for the test frame: everything except the three targets.
test_cols = [
    name for name in all_data.columns
    if name not in {
        'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'}
]

# Split the combined frame back into its two parts using the is_test marker.
train = all_data.loc[all_data['is_test'].eq(0)]
test = all_data.loc[all_data['is_test'].eq(1), test_cols]

### **H2O AutoML Model**

In [None]:
h2o.init()

# Upload the pandas frames to the H2O cluster.
h2o_train_co = h2o.H2OFrame(train)
h2o_train_be = h2o.H2OFrame(train)
# Nitrogen oxides is trained on a positional slice that skips the first 4182
# training rows. NOTE(review): the reason for this cutoff is not shown here.
h2o_train_no = h2o.H2OFrame(train[4182:])
h2o_test = h2o.H2OFrame(test)

# AutoML settings shared by all three targets.
# NOTE(review): the targets were log1p-transformed in an earlier cell, so RMSLE
# here takes a second log of them; confirm this is intended rather than RMSE.
param = dict(
    include_algos=['XGBoost', 'GBM', 'DeepLearning', 'StackedEnsemble'],
    max_models=100,
    max_runtime_secs=400,
    stopping_metric='RMSLE',
    sort_metric='RMSLE',
    seed=42)


def _run_automl(target, training_frame):
    """Run AutoML for one target and predict on the test frame.

    Trains an H2OAutoML instance configured with `param` on `training_frame`
    using the `features` columns, predicts `h2o_test` with the leader model,
    and prints the top three leaderboard rows.

    Returns:
        (automl, prediction): the fitted H2OAutoML object and the leader's
        prediction frame for `h2o_test`.
    """
    automl = H2OAutoML(**param)
    automl.train(x=features, y=target, training_frame=training_frame)
    prediction = automl.leader.predict(h2o_test)
    print(automl.leaderboard.head(3))
    return automl, prediction


# predicting target_carbon_monoxide
model_co, h2o_prediction_co = _run_automl('target_carbon_monoxide', h2o_train_co)

# predicting target_benzene
model_be, h2o_prediction_be = _run_automl('target_benzene', h2o_train_be)

# predicting target_nitrogen_oxides
model_no, h2o_prediction_no = _run_automl('target_nitrogen_oxides', h2o_train_no)

In [None]:
# Build the H2O submission file.
# Predictions are on the log1p scale, so `np.expm1` (the exact inverse of
# `np.log1p`) maps them back to the original units.
# Every column is passed as a plain array so rows are matched by position;
# otherwise pandas would align `test['date_time']` and the prediction columns
# on their index labels, which is correct only if the two indexes coincide.
h2o_submission = pd.DataFrame({
    'date_time': test['date_time'].to_numpy(),
    'target_carbon_monoxide': np.expm1(h2o_prediction_co.as_data_frame()['predict'].to_numpy()),
    'target_benzene': np.expm1(h2o_prediction_be.as_data_frame()['predict'].to_numpy()),
    'target_nitrogen_oxides': np.expm1(h2o_prediction_no.as_data_frame()['predict'].to_numpy())})

h2o_submission.to_csv('h2o_submission.csv', index=False)

### **Gradient Boosting Regressor Model**

In [None]:
# Shared configuration for the per-target grid searches below.

# Hyper-parameter combinations to evaluate (2 x 2 = 4 candidates).
param_grid = dict(subsample=[0.6, 0.8], max_depth=[5, 6])

# NOTE(review): the targets were log1p-transformed in an earlier cell, so this
# scorer takes a second log of them; confirm this is intended.
scoring = 'neg_mean_squared_log_error'

# Five consecutive, unshuffled folds.
cv = KFold(n_splits=5)

# Base estimator: up to 1000 trees, with early stopping once the score on a
# 20% internal validation split has not improved for 10 iterations.
estimator = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.02,
    n_iter_no_change=10,
    validation_fraction=0.2,
)

In [None]:
# Grid search for target_nitrogen_oxides.
# The global NumPy seed fixes the estimator's randomness for this search.
np.random.seed(42)
# Positional slice: only training rows from position 4182 onwards are used.
train_no = train.iloc[4182:]
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv,
)
grid.fit(train_no[features], train_no['target_nitrogen_oxides'])
print('Best: %f using %s' % (grid.best_score_, grid.best_params_))

In [None]:
# Refit on the nitrogen training rows with the best parameters from the
# preceding grid search (`grid` must still hold the nitrogen search result),
# then predict the test set.
# `random_state` is fixed so the stochastic fit (subsample < 1) is reproducible
# regardless of how much of the global NumPy random stream was consumed before.
model_no = GradientBoostingRegressor(
    **grid.best_params_, n_estimators=1000, learning_rate=0.02, random_state=42)
model_no.fit(train_no[features], train_no.target_nitrogen_oxides)
# Targets were fitted on the log1p scale; expm1 is the exact inverse.
gbr_prediction_no = np.expm1(model_no.predict(test[features]))

In [None]:
# Grid search for target_benzene, using every training row.
# The global NumPy seed fixes the estimator's randomness for this search.
np.random.seed(42)
train_be = train
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv,
)
grid.fit(train_be[features], train_be['target_benzene'])
print('Best: %f using %s' % (grid.best_score_, grid.best_params_))

In [None]:
# Refit on the benzene training rows with the best parameters from the
# preceding grid search (`grid` must still hold the benzene search result),
# then predict the test set.
# `random_state` is fixed so the stochastic fit (subsample < 1) is reproducible
# regardless of how much of the global NumPy random stream was consumed before.
model_be = GradientBoostingRegressor(
    **grid.best_params_, n_estimators=1000, learning_rate=0.02, random_state=42)
model_be.fit(train_be[features], train_be.target_benzene)
# Targets were fitted on the log1p scale; expm1 is the exact inverse.
gbr_prediction_be = np.expm1(model_be.predict(test[features]))

In [None]:
# Grid search for target_carbon_monoxide, using every training row.
# The global NumPy seed fixes the estimator's randomness for this search.
np.random.seed(42)
train_co = train
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv,
)
grid.fit(train_co[features], train_co['target_carbon_monoxide'])
print('Best: %f using %s' % (grid.best_score_, grid.best_params_))

In [None]:
# Refit on the carbon monoxide training rows with the best parameters from the
# preceding grid search (`grid` must still hold the carbon search result),
# then predict the test set.
# `random_state` is fixed so the stochastic fit (subsample < 1) is reproducible
# regardless of how much of the global NumPy random stream was consumed before.
model_co = GradientBoostingRegressor(
    **grid.best_params_, n_estimators=1000, learning_rate=0.02, random_state=42)
model_co.fit(train_co[features], train_co.target_carbon_monoxide)
# Targets were fitted on the log1p scale; expm1 is the exact inverse.
gbr_prediction_co = np.expm1(model_co.predict(test[features]))

In [None]:
# Build the gradient-boosting submission: the test timestamps plus one
# back-transformed prediction column per target, written without the index.
gbr_submission = test[['date_time']].assign(
    target_carbon_monoxide=gbr_prediction_co,
    target_benzene=gbr_prediction_be,
    target_nitrogen_oxides=gbr_prediction_no,
)

gbr_submission.to_csv('gbr_submission.csv', index=False)