In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import re
import optuna
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

RANDOM_STATE = 42

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load Data

In [None]:
# Training set; later cells add engineered feature columns to this frame in place.
data = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/train.csv')

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Per-column flag: True when the column contains at least one missing value.
data.isnull().any()

In [None]:
# Pairwise scatter plots / distributions of the columns in the training frame.
sns.pairplot(data)

In [None]:
# Correlation heatmap of the raw columns (lower triangle only).
# `date_time` is still a string column at this point; newer pandas no longer drops
# non-numeric columns from corr() silently, so select numeric/bool columns explicitly.
corr = data.select_dtypes(include=['number', 'bool']).corr()

# Boolean mask hiding the redundant upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    plt.figure(figsize = (20,16))
    sns.heatmap(corr, mask=mask, linewidths=.1, cmap="YlGnBu", annot=True, fmt=".2f")

# Feature Engineering

## Time

In [None]:
def get_time(data):
    """Add calendar features derived from ``date_time`` to ``data`` in place.

    ``date_time`` is parsed to a datetime column, then the following columns are
    added: year, month, hour, dayofyear, dayofmonth, dayofweek, weekofyear,
    quarter, the four season flags, ``working_hours`` (hour in 8..20) and
    ``is_weekend`` (Saturday/Sunday). All flags are stored as 0/1 integers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date_time`` column (strings or datetimes). Modified in place.

    Returns
    -------
    None
    """
    # pd.to_datetime instead of astype('datetime64'): the unit-less cast is
    # rejected by pandas >= 2.0.
    data['date_time'] = pd.to_datetime(data['date_time'])
    stamp = data['date_time'].dt

    data['year']       = stamp.year
    data['month']      = stamp.month
    data['hour']       = stamp.hour

    data['dayofyear']  = stamp.dayofyear
    data['dayofmonth'] = stamp.day
    data['dayofweek']  = stamp.dayofweek

    # isocalendar() yields a nullable UInt32 column, which would turn the whole
    # feature matrix into an object array; store a plain NumPy dtype instead
    # (float only if missing timestamps make an integer cast impossible).
    week = stamp.isocalendar().week
    data['weekofyear'] = week.astype('float64' if week.isna().any() else 'int64')
    data['quarter']    = stamp.quarter

    # Season flags as 0/1 integers, consistent with working_hours / is_weekend.
    data['is_winter']  = data['month'].isin([1, 2, 12]).astype("int")
    data['is_spring']  = data['month'].isin([3, 4, 5]).astype("int")
    data['is_summer']  = data['month'].isin([6, 7, 8]).astype("int")
    data['is_autumn']  = data['month'].isin([9, 10, 11]).astype("int")

    data['working_hours'] =  data['hour'].isin(np.arange(8, 21, 1)).astype("int")
    data['is_weekend']    = (stamp.dayofweek >= 5).astype("int")

## Heat Index
<div style='font-size:18px'>
    <span style='font-size:20px'>
        $$HI = c_1 + c_2T + c_3R + c_4TR + c_5T^2 + c_6R^2 + c_7T^2R + c_8TR^2 + c_9T^2R^2$$
        <br>
        <i> If $RH < 13$ and $TF$ is between 80 and 112, subtract the adjustment</i>
        $$ADJUSTMENT = \frac{13-RH}{4}   \cdot \sqrt{\frac{17-|TF-95|}{17}}$$
        <br>
        <i> If $RH > 85$ and $TF$ is between 80 and 87, add the adjustment</i>
        $$ADJUSTMENT = \frac{RH-85}{10}  \cdot \frac{87-TF}{5}$$
        <br>
        <i> If $TF < 80$ use simple formula</i>
        $$HI = 0.5 \cdot \left[TF + 61.0 + (TF-68.0) \cdot 1.2 + RH \cdot 0.094\right]$$
    </span>
    <br>
    <i>where</i>
    <br>
    <ul>
        <li>$R$ (also written $RH$) - relative humidity in percent</li>
        <li>$T$ (also written $TF$) - temperature in degrees Fahrenheit (converted from the Celsius column)</li>
        <li>$c_1-c_9$ - constants</li>
    </ul>
</div>

In [None]:
def get_HI(data):
    """Add the heat index column ``HI`` (degrees Fahrenheit) to ``data`` in place.

    The temperature is converted from Celsius to Fahrenheit (``TF``) and the
    value is chosen per row, first matching rule wins:

    * ``TF <= 40``                      -> ``TF`` itself
    * ``RH < 13`` and ``80 <= TF <= 112`` -> regression minus the low-humidity adjustment
    * ``RH > 85`` and ``80 <= TF <= 87``  -> regression plus the high-humidity adjustment
    * ``TF < 80``                       -> simple formula
    * otherwise                         -> unadjusted regression

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``deg_C`` and ``relative_humidity`` columns. Modified in place.

    Returns
    -------
    None
    """
    TF = np.asarray(data.deg_C * 9/5 + 32, dtype=float)
    RH = np.asarray(data.relative_humidity, dtype=float)

    conditions = [
        (TF <= 40.0),
        (RH < 13) & (TF >= 80) & (TF <= 112),
        (RH > 85) & (TF >= 80) & (TF <= 87),
        (TF < 80)
    ]

    HI = -42.379 + 2.04901523*TF + 10.14333127*RH - 0.22475541*TF*RH - 0.00683783*TF**2 - 0.05481717*RH**2 + 0.00122874*RH*TF**2 + 0.00085282*TF*RH**2 - 0.00000199*TF**2*RH**2
    HI_simple = 0.5 * (TF + 61.0 + ((TF-68.0)*1.2) + (RH*0.094))

    # The square root is only real for |TF - 95| <= 17, which is exactly the
    # range in which adjust1 is selected; NaNs elsewhere are never picked.
    with np.errstate(invalid='ignore'):
        adjust1 = ((13-RH)/4)  * np.sqrt((17-np.abs(TF-95))/17)
    adjust2 = ((RH-85)/10) * ((87-TF)/5)

    choices = [
        TF,
        HI-adjust1,
        HI+adjust2,
        HI_simple
    ]

    # default=HI: without it np.select fills every row matching none of the
    # conditions (TF >= 80 with no humidity adjustment) with 0.
    data['HI'] = np.select(conditions, choices, default=HI)

## Dew Point

<div style='font-size:18px'> 
    <span style='font-size:22px'>$$T_p = \frac{b\cdot(\frac{a \cdot T}{b+T}+\ln\frac{RH}{100})}{a-(\frac{a \cdot T}{b+T} + \ln\frac{RH}{100})}$$</span>
    <i>where</i>
    <br>
    <ul>
        <li>$a$ = 17.27</li>
        <li>$b$ = 237.7 °C</li>
        <li>$RH$ - relative humidity</li>
        <li>$T$ - temperature in Celsius</li>
    </ul>
</div>

In [None]:
def get_DP(data):
    """Add the dew point column ``DP`` to ``data`` in place.

    Uses ``deg_C`` as the temperature and ``relative_humidity`` divided by 100
    inside the logarithm, with the constants a = 17.27 and b = 237.7.
    Returns None.
    """
    coef_a, coef_b = 17.27, 237.7
    temp_c = data.deg_C
    humidity_term = np.log(data.relative_humidity/100)
    gamma = (coef_a*temp_c) / (coef_b+temp_c) + humidity_term
    data['DP'] = (coef_b * gamma) / (coef_a - gamma)

## Saturated vapor pressure
<div style='font-size:18px'>
    <br>
    <span style='font-size:24px'> 
        $$e_{s} = 6.112 \cdot e^{\frac{(17.62 \cdot T)}{(T+243.12)}}$$
    </span><i>where</i>
    <br>
    <ul>
        <li>$T$ - temperature in Celsius</li>
    </ul>
</div>

In [None]:
def get_pressure(data):
    """Add the ``saturated_pressure`` column, computed from ``deg_C``, to ``data`` in place."""
    temp_c = data.deg_C
    exponent = 17.62*temp_c / (temp_c+243.12)
    data['saturated_pressure'] = 6.112 * np.exp(exponent)

## SMC

<div style='font-size:18px'>
    <br>
    <span style='font-size:20px'> 
        $$SMC = \frac{AH \cdot 100}{RH}$$
    </span>
    <i>where</i>
    <br>
    <ul>
        <li>$AH$ - absolute humidity</li>
        <li>$RH$ - relative humidity</li>
    </ul>
</div>

In [None]:
def get_SMC(data):
    """Add the ``SMC`` column (absolute humidity * 100 / relative humidity) to ``data`` in place."""
    scaled_absolute = data.absolute_humidity * 100
    data['SMC'] = scaled_absolute / data.relative_humidity

In [None]:
def get_feauters(data):
    """Run every feature-engineering step on ``data``; each step adds its columns in place."""
    feature_steps = (get_time, get_HI, get_DP, get_pressure, get_SMC)
    for add_features in feature_steps:
        add_features(data)

In [None]:
# Adds the engineered columns to the training frame in place.
# Not idempotent with respect to displays above: earlier outputs of `data` are now stale.
get_feauters(data)

In [None]:
# Correlation heatmap including the engineered features (lower triangle only).
# Restrict to numeric/bool columns: `date_time` is not numeric and newer pandas
# no longer drops such columns from corr() silently.
corr = data.select_dtypes(include=['number', 'bool']).corr()

# Boolean mask hiding the redundant upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    plt.figure(figsize = (20,16))
    sns.heatmap(corr, mask=mask, linewidths=.1, cmap="YlGnBu", annot=True, fmt=".2f")

In [None]:
# Correlations among the calendar features only.
cols = [
    'year', 'month', 'hour',
    'dayofyear', 'dayofmonth', 'dayofweek',
    'weekofyear', 'quarter',
    'is_winter', 'is_spring', 'is_summer', 'is_autumn',
    'working_hours', 'is_weekend',
]
corr = data[cols].corr()

# Ones on and above the diagonal mark the cells to hide.
mask = np.triu(np.ones_like(corr))

with sns.axes_style("white"):
    plt.figure(figsize=(20, 16))
    sns.heatmap(
        corr,
        mask=mask,
        linewidths=.1,
        cmap="YlGnBu",
        annot=True,
        fmt=".2f",
    )

# XGBoost

In [None]:
# Feature matrix: every column except the timestamp and the three targets.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
X = data.drop(columns=['date_time'] + target_cols).to_numpy()
# Target matrix: one column per pollutant, in the order listed above.
y = data[target_cols].to_numpy()

In [None]:
# 70% of the rows go to X_train/y_train, the remaining 30% are the hold-out X_test/y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=RANDOM_STATE)
# X_train is split again: 80% to fit on (X_train_), 20% for validation (X_valid).
X_train_, X_valid, y_train_, y_valid = train_test_split(X_train, y_train, train_size=0.8, random_state=RANDOM_STATE)

In [None]:
def get_preds(model, data, y_test=None):
    """Predict with ``model`` and optionally print the RMSLE against ``y_test``.

    Parameters
    ----------
    model : object with a ``predict(data)`` method
    data : input accepted by ``model.predict``
    y_test : array-like, optional
        True values. When given and non-empty, the root mean squared log error
        of the predictions is printed.

    Returns
    -------
    numpy.ndarray
        Absolute values of the model predictions (the metric needs non-negative input).
    """
    preds = np.abs(model.predict(data))
    if y_test is not None and len(y_test):
        # `x**1/2` parses as `(x**1)/2`, i.e. half the MSLE; the root needs `** 0.5`.
        print(mean_squared_log_error(y_test, preds) ** 0.5)
    return preds

In [None]:
def get_model_rmse(params):
    """Train a booster with ``params`` and return its validation RMSE.

    Reads the module-level ``dtrain``, ``dvalid`` and ``y_valid_``, which must be
    bound to the target currently being tuned before this is called.

    Parameters
    ----------
    params : dict
        Parameters passed to ``xgb.train``.

    Returns
    -------
    float
        Root mean squared error of the absolute predictions on ``dvalid``.
    """
    model = xgb.train(params, dtrain, num_boost_round=150, evals=[(dvalid, 'eval')], early_stopping_rounds=30, verbose_eval=0)
    # `x**1/2` parses as `(x**1)/2`, i.e. half the MSE; the root needs `** 0.5`.
    results = mean_squared_error(y_valid_, np.abs(model.predict(dvalid))) ** 0.5
    return results

In [None]:
def objective(trial):
    """Optuna objective: sample XGBoost parameters and return the validation score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Value returned by ``get_model_rmse`` for the sampled parameters.
    """
    params = {
        # Only `eta` is tuned: `learning_rate` is an alias of the same XGBoost
        # parameter, so sampling both gave the booster two conflicting values
        # (one of them from a 0..10 range, far outside the valid [0, 1]).
        'eta':               trial.suggest_float('eta', 0.01, 0.3),
        'max_depth':         trial.suggest_int('max_depth', 3, 20),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 100),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.3, 0.5),
        'subsample':         trial.suggest_float('subsample', 0.1, 1),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'alpha':             trial.suggest_float('alpha', 0.00001, 50.0, log=True),
        'lambda':            trial.suggest_float('lambda', 0.00001, 10.0, log=True),
        'gamma':             trial.suggest_float('gamma', 1, 9, log=True),
        'tree_method':       'gpu_hist',
        # `n_estimators` and `use_label_encoder` belong to the scikit-learn
        # wrapper and are not xgb.train parameters (the number of rounds is
        # fixed by num_boost_round), so they are no longer sampled or passed.
    }

    return get_model_rmse(params)

In [None]:
# One train/valid/test DMatrix triple per target column of y
# (suffix 0, 1, 2 = column index; the same feature matrices are reused for all three).
dtrain0 = xgb.DMatrix(X_train_, y_train_[:, 0])
dvalid0 = xgb.DMatrix(X_valid,  y_valid[:, 0])
dtest0  = xgb.DMatrix(X_test,   y_test[:, 0])

dtrain1 = xgb.DMatrix(X_train_, y_train_[:, 1])
dvalid1 = xgb.DMatrix(X_valid,  y_valid[:, 1])
dtest1  = xgb.DMatrix(X_test,   y_test[:, 1])

dtrain2 = xgb.DMatrix(X_train_, y_train_[:, 2])
dvalid2 = xgb.DMatrix(X_valid,  y_valid[:, 2])
dtest2  = xgb.DMatrix(X_test,   y_test[:, 2])

In [None]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

# get_model_rmse() reads dtrain, dvalid and y_valid_ as globals, so they are
# rebound to the current target before each study.
# Each study gets a sampler seeded with RANDOM_STATE so the search is reproducible
# (TPESampler is Optuna's default sampler; only the seed is new).

# Target column 0
dtrain = dtrain0
dvalid = dvalid0
y_valid_ = y_valid[:, 0]
study0 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study0.optimize(objective, n_trials=100, show_progress_bar=True)

# Target column 1
dtrain = dtrain1
dvalid = dvalid1
y_valid_ = y_valid[:, 1]
study1 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study1.optimize(objective, n_trials=100, show_progress_bar=True)

# Target column 2
dtrain = dtrain2
dvalid = dvalid2
y_valid_ = y_valid[:, 2]
study2 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study2.optimize(objective, n_trials=100, show_progress_bar=True)

In [None]:
# Best parameter set found by each study.
# NOTE(review): best_params presumably holds only the values sampled through trial.suggest_*,
# so fixed entries set inside objective() (e.g. 'tree_method') would not be carried over — confirm.
params0 = study0.best_params
params1 = study1.best_params
params2 = study2.best_params

# One booster per target, trained on the fit split with early stopping on the validation split.
model0 = xgb.train(params0, dtrain0, num_boost_round=5000, evals=[(dvalid0, 'eval')], early_stopping_rounds=1000, verbose_eval=500)
model1 = xgb.train(params1, dtrain1, num_boost_round=5000, evals=[(dvalid1, 'eval')], early_stopping_rounds=1000, verbose_eval=500)
model2 = xgb.train(params2, dtrain2, num_boost_round=5000, evals=[(dvalid2, 'eval')], early_stopping_rounds=1000, verbose_eval=500)

In [None]:
# Print the hold-out metric of each target's model (get_preds prints when labels are passed).
get_preds(model0, dtest0, y_test[:, 0])
get_preds(model1, dtest1, y_test[:, 1])
get_preds(model2, dtest2, y_test[:, 2]);

In [None]:
# Rebuild the training matrices from the full X_train (fit + validation rows).
# This rebinds dtrain0..2, so re-running the tuning cells after this one uses different data.
dtrain0 = xgb.DMatrix(X_train, y_train[:, 0])
dtrain1 = xgb.DMatrix(X_train, y_train[:, 1])
dtrain2 = xgb.DMatrix(X_train, y_train[:, 2])


# Refit the three models with the tuned parameters.
# NOTE(review): early stopping now monitors the hold-out set (dtest*), so that set no longer
# gives an unbiased estimate for these final models — confirm this is intended.
model0 = xgb.train(params0, dtrain0, num_boost_round=7500, evals=[(dtest0, 'eval')], early_stopping_rounds=1000, verbose_eval=500)
model1 = xgb.train(params1, dtrain1, num_boost_round=7500, evals=[(dtest1, 'eval')], early_stopping_rounds=1000, verbose_eval=500)
model2 = xgb.train(params2, dtrain2, num_boost_round=7500, evals=[(dtest2, 'eval')], early_stopping_rounds=1000, verbose_eval=500)

# Make Predictions

In [None]:
# Competition test set and submission template.
# Same absolute input directory as the training file, instead of a path relative
# to the current working directory.
data_test = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')
output = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv')

In [None]:
# Add the same engineered columns to the test frame (in place), then drop the timestamp
# so the column layout matches the training feature matrix X.
get_feauters(data_test)
X_data_test = np.array(data_test.drop(['date_time'], axis=1))

In [None]:
# Predict each target for the competition test set; every model gets its own DMatrix
# built from the same feature matrix.
pred0_test, pred1_test, pred2_test = (
    get_preds(fitted_model, xgb.DMatrix(X_data_test))
    for fitted_model in (model0, model1, model2)
)

In [None]:
# Fill the submission template with the predictions of the three models.
# NOTE(review): attribute assignment only updates a column if it already exists in the
# template — presumably sample_submission.csv has these three columns; confirm.
output.target_carbon_monoxide = pred0_test
output.target_benzene = pred1_test
output.target_nitrogen_oxides = pred2_test

# Write the submission file and display it.
output.to_csv('outputfile.csv', index=False)
output