In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import KFold, TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_log_error
import sklearn.metrics as metrics

import optuna
from optuna import Trial, visualization

In [None]:
# Single place to change the input directory; the three competition files
# (training data, test features, submission template) all live under it.
DATA_DIR = '../input/tabular-playground-series-jul-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Use the timestamp as a DatetimeIndex so later cells can slice rows by
# date string (e.g. train.loc[:'2010-12-10']).
# The guard keeps this cell re-runnable: after the first run `date_time` is
# the index rather than a column, and the unguarded version raised KeyError.
if 'date_time' in train.columns:
    train['date_time'] = pd.to_datetime(train['date_time'])
    train = train.set_index('date_time')
train.head()

In [None]:
# Model inputs: every column of the test file except the timestamp.
feature_cols = [name for name in test.columns if name != 'date_time']

# Each target is kept as a one-element list so that indexing a frame with it
# yields a single-column DataFrame.
target1 = ['target_carbon_monoxide']
target2 = ['target_benzene']
target3 = ['target_nitrogen_oxides']

In [None]:
def fit_xgb(trial, xtr, ytr, xval, yval):
    """Train one XGBoost regressor with trial-sampled hyper-parameters.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample ``n_estimators``, ``subsample``,
        ``colsample_bytree``, ``eta``, ``max_depth`` and ``reg_alpha``.
    xtr, xval : array-like
        Training and validation feature matrices.
    ytr, yval : array-like
        Training and validation targets; flattened to 1-D before use, so a
        single-column 2-D array is accepted.

    Returns
    -------
    model : xgb.XGBRegressor
        The fitted regressor.
    log : dict
        ``{"train rmsle": float, "valid rmsle": float}``, computed on
        predictions floored at 0.1.
    """
    # suggest_float(step=...) / suggest_float(log=True) replace the deprecated
    # suggest_discrete_uniform / suggest_loguniform; `step` is passed by
    # keyword because positional use is deprecated in recent Optuna releases.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1200, step=100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1),
        "eta": trial.suggest_float("eta", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "reg_alpha": trial.suggest_int("reg_alpha", 1, 50),
    }

    # np.ravel accepts ndarrays as well as pandas objects, unlike .reshape.
    ytr_flat = np.ravel(ytr)
    yval_flat = np.ravel(yval)

    model = xgb.XGBRegressor(**params, random_state=42, eval_metric="rmsle")
    model.fit(xtr, ytr_flat)

    # Floor predictions at 0.1: mean_squared_log_error does not accept
    # negative values, and the regressor's output is not bounded below.
    y_tr_pred = np.clip(model.predict(xtr), 0.1, None)
    y_val_pred = np.clip(model.predict(xval), 0.1, None)

    log = {
        "train rmsle": np.sqrt(mean_squared_log_error(ytr_flat, y_tr_pred)),
        "valid rmsle": np.sqrt(mean_squared_log_error(yval_flat, y_val_pred)),
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: mean validation RMSLE across time-ordered CV folds.

    Reads the notebook-level ``X_train``, ``y_train`` and ``feature_cols``,
    so it scores whichever target's frames were assigned most recently.
    Only the ``feature_cols`` columns of ``X_train`` are passed to the model;
    any other columns present in ``X_train`` are ignored.

    Parameters
    ----------
    trial : optuna.Trial
        Forwarded to ``fit_xgb`` for hyper-parameter sampling.

    Returns
    -------
    float
        Average of the per-fold ``"valid rmsle"`` values.
    """
    n_splits = 9
    tscv = TimeSeriesSplit(n_splits=n_splits)

    fold_scores = []
    for trn_idx, val_idx in tscv.split(X_train):
        # Index features and targets with the same fold indices so the two
        # can never drift apart (the features were previously sliced by
        # length arithmetic, which only matches a gap-free split).
        xtr = X_train.iloc[trn_idx][feature_cols].values
        ytr = y_train.iloc[trn_idx].values
        xval = X_train.iloc[val_idx][feature_cols].values
        yval = y_train.iloc[val_idx].values

        _, log = fit_xgb(trial, xtr, ytr, xval, yval)
        fold_scores.append(log['valid rmsle'])

    # Averaging the collected scores keeps the result correct if n_splits
    # changes, instead of dividing by a separately hard-coded fold count.
    return float(np.mean(fold_scores))

In [None]:
# model 1: hyper-parameter search for target_carbon_monoxide
# .copy() gives an independent frame, so adding columns below does not
# trigger SettingWithCopyWarning on a slice of `train`.
new_df = train[target1].copy()
# Value 24 rows earlier and the change since then (one day back if the rows
# are hourly -- TODO confirm the sampling interval).
# NOTE(review): `objective` selects only `feature_cols`, so these two columns
# never reach the model. Keep it that way unless the leak is addressed:
# target == Yesterday + Yesterday_diff by construction.
new_df['Yesterday'] = new_df[target1[0]].shift(24)
new_df['Yesterday_diff'] = new_df[target1[0]].diff(24)
new_df = new_df.dropna()

new_df_full = new_df.join(train[feature_cols])

# Rows up to and including 2010-12-10 are used for the cross-validated search.
y_train = new_df_full.loc[:'2010-12-10', target1]
X_train = new_df_full.loc[:'2010-12-10'].drop(columns=target1)

# Later rows are split off here; no visible cell evaluates on them.
y_test = train.loc['2010-12-11':'2010-12-31', target1]
X_test = new_df_full.loc['2010-12-11':'2010-12-31'].drop(columns=target1)

# Seed the sampler so the search (and the resulting best_params) can be
# reproduced on a fresh kernel.
study = optuna.create_study(
    direction="minimize",
    study_name='Target 1 optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Refit on every training row with the best hyper-parameters of `study`.
# `study` is whichever study ran most recently, so run this cell directly
# after the target-1 search.
# eval_metric is set on the constructor (as fit_xgb does) because passing it
# to fit() is deprecated and rejected by xgboost 2.x; random_state matches
# the value used during the search.
clf1 = xgb.XGBRegressor(**study.best_params, random_state=42, eval_metric="rmsle")
clf1.fit(train[feature_cols], train[target1[0]])

# Floor predictions at 0.1, the same floor applied when scoring in fit_xgb.
preds1 = np.clip(clf1.predict(test[feature_cols]), 0.1, None)
# Assign by column name: the 1-D prediction array maps onto exactly one
# column, avoiding list-key assignment that newer pandas may reject.
sample[target1[0]] = preds1

In [None]:
# model 2: hyper-parameter search for target_benzene
# .copy() gives an independent frame, so adding columns below does not
# trigger SettingWithCopyWarning on a slice of `train`.
new_df = train[target2].copy()
# Value 24 rows earlier and the change since then (one day back if the rows
# are hourly -- TODO confirm the sampling interval).
# NOTE(review): `objective` selects only `feature_cols`, so these two columns
# never reach the model. Keep it that way unless the leak is addressed:
# target == Yesterday + Yesterday_diff by construction.
new_df['Yesterday'] = new_df[target2[0]].shift(24)
new_df['Yesterday_diff'] = new_df[target2[0]].diff(24)
new_df = new_df.dropna()

new_df_full = new_df.join(train[feature_cols])

# Rows up to and including 2010-12-10 are used for the cross-validated search.
y_train = new_df_full.loc[:'2010-12-10', target2]
X_train = new_df_full.loc[:'2010-12-10'].drop(columns=target2)

# Later rows are split off here; no visible cell evaluates on them.
y_test = train.loc['2010-12-11':'2010-12-31', target2]
X_test = new_df_full.loc['2010-12-11':'2010-12-31'].drop(columns=target2)

# Seed the sampler so the search (and the resulting best_params) can be
# reproduced on a fresh kernel.
study = optuna.create_study(
    direction="minimize",
    study_name='Target 2 optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Refit on every training row with the best hyper-parameters of `study`.
# `study` is whichever study ran most recently, so run this cell directly
# after the target-2 search. To review the search itself, inspect
# study.trials_dataframe() or study.best_params.
# eval_metric is set on the constructor (as fit_xgb does) because passing it
# to fit() is deprecated and rejected by xgboost 2.x; random_state matches
# the value used during the search.
clf2 = xgb.XGBRegressor(**study.best_params, random_state=42, eval_metric="rmsle")
clf2.fit(train[feature_cols], train[target2[0]])

# Floor predictions at 0.1, the same floor applied when scoring in fit_xgb.
preds2 = np.clip(clf2.predict(test[feature_cols]), 0.1, None)
# Assign by column name: the 1-D prediction array maps onto exactly one
# column, avoiding list-key assignment that newer pandas may reject.
sample[target2[0]] = preds2


In [None]:
# model 3: hyper-parameter search for target_nitrogen_oxides
# .copy() gives an independent frame, so adding columns below does not
# trigger SettingWithCopyWarning on a slice of `train`.
new_df = train[target3].copy()
# Value 24 rows earlier and the change since then (one day back if the rows
# are hourly -- TODO confirm the sampling interval).
# NOTE(review): `objective` selects only `feature_cols`, so these two columns
# never reach the model. Keep it that way unless the leak is addressed:
# target == Yesterday + Yesterday_diff by construction.
new_df['Yesterday'] = new_df[target3[0]].shift(24)
new_df['Yesterday_diff'] = new_df[target3[0]].diff(24)
new_df = new_df.dropna()

new_df_full = new_df.join(train[feature_cols])

# Rows up to and including 2010-12-10 are used for the cross-validated search.
y_train = new_df_full.loc[:'2010-12-10', target3]
X_train = new_df_full.loc[:'2010-12-10'].drop(columns=target3)

# Later rows are split off here; no visible cell evaluates on them.
y_test = train.loc['2010-12-11':'2010-12-31', target3]
X_test = new_df_full.loc['2010-12-11':'2010-12-31'].drop(columns=target3)

# Seed the sampler so the search (and the resulting best_params) can be
# reproduced on a fresh kernel.
study = optuna.create_study(
    direction="minimize",
    study_name='Target 3 optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Refit on every training row with the best hyper-parameters of `study`.
# `study` is whichever study ran most recently, so run this cell directly
# after the target-3 search.
# eval_metric is set on the constructor (as fit_xgb does) because passing it
# to fit() is deprecated and rejected by xgboost 2.x; random_state matches
# the value used during the search.
clf3 = xgb.XGBRegressor(**study.best_params, random_state=42, eval_metric="rmsle")
clf3.fit(train[feature_cols], train[target3[0]])

# Floor predictions at 0.1, the same floor applied when scoring in fit_xgb.
preds3 = np.clip(clf3.predict(test[feature_cols]), 0.1, None)
# Assign by column name: the 1-D prediction array maps onto exactly one
# column, avoiding list-key assignment that newer pandas may reject.
sample[target3[0]] = preds3

In [None]:
# Persist the submission frame (now holding the three prediction columns)
# without the positional row index.
submission_path = "submission.csv"
sample.to_csv(submission_path, index=False)