In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import sklearn.metrics as metrics

import optuna
from optuna import Trial, visualization

In [None]:
# Read the competition tables: training data, test data and the submission template.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
        '../input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

In [None]:
# Peek at the first rows of the test table.
test.head(n=5)

In [None]:
# Model inputs: every column of the test table except the timestamp column.
feature_cols = [name for name in test.columns.tolist() if name != 'date_time']

# Each target is kept as a one-element list of column names.
target1, target2, target3 = (
    ['target_carbon_monoxide'],
    ['target_benzene'],
    ['target_nitrogen_oxides'],
)

In [None]:
# Contiguous (unshuffled) 5-fold split. `random_state` is deliberately not
# passed: it only has meaning when shuffle=True, and recent scikit-learn
# releases raise a ValueError when it is combined with shuffle=False.
kf = KFold(n_splits = 5, shuffle = False)

# Record, for every row, the index of the fold in which it is held out.
# The ids are written by position into an integer array, so the result does
# not depend on the DataFrame's index labels and the column never passes
# through a float/NaN intermediate state.
fold_ids = np.empty(len(train), dtype = int)
for i, (_, val) in enumerate(kf.split(train)):
    fold_ids[val] = i
train['kfold'] = fold_ids

In [None]:
def fit_xgb(trial, xtr, ytr, xval, yval):
    """Train one XGBoost regressor with trial-suggested settings and score it.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyper-parameters.
    xtr, ytr : array-like
        Training features and target. The target may be 1-D or a single
        column of shape (n, 1); it is flattened before fitting.
    xval, yval : array-like
        Held-out features and target used for the validation score.

    Returns
    -------
    model
        The fitted ``xgb.XGBRegressor``.
    log : dict
        ``{"train rmsle": ..., "valid rmsle": ...}`` computed on clipped
        predictions.
    """
    # `step` / `log` are passed by keyword, and suggest_float replaces the
    # deprecated suggest_discrete_uniform / suggest_loguniform helpers while
    # sampling from the same ranges.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1200, step=100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1),
        "eta": trial.suggest_float("eta", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "reg_alpha": trial.suggest_int("reg_alpha", 1, 50),
    }

    # Accept ndarrays, lists or pandas objects; always work with flat targets.
    ytr = np.asarray(ytr).ravel()
    yval = np.asarray(yval).ravel()

    model = xgb.XGBRegressor(**params, random_state = 42, eval_metric="rmsle")
    model.fit(xtr, ytr)

    # Clip at 0.1 so predictions stay positive before the log-based metric.
    y_tr_pred = np.clip(model.predict(xtr), 0.1, None)
    y_val_pred = np.clip(model.predict(xval), 0.1, None)

    log = {
        "train rmsle": np.sqrt(mean_squared_log_error(ytr, y_tr_pred)),
        "valid rmsle": np.sqrt(mean_squared_log_error(yval, y_val_pred))
    }

    return model, log

In [None]:
def objective1(trial):
    """Optuna objective for ``target1``.

    For each of the 5 folds stored in ``train['kfold']``, fits a model on the
    other folds via ``fit_xgb`` and accumulates one fifth of the held-out
    fold's validation RMSLE. Returns the accumulated value.
    """
    n_folds = 5
    total = 0
    for fold_id in range(n_folds):
        held_out = train['kfold'] == fold_id
        fit_df = train.loc[~held_out, :]
        val_df = train.loc[held_out, :]

        _, scores = fit_xgb(
            trial,
            fit_df[feature_cols].values, fit_df[target1].values,
            val_df[feature_cols].values, val_df[target1].values,
        )
        total = total + scores['valid rmsle'] / n_folds

    return total

In [None]:
def objective2(trial):
    """Optuna objective for ``target2``.

    For each of the 5 folds stored in ``train['kfold']``, fits a model on the
    other folds via ``fit_xgb`` and accumulates one fifth of the held-out
    fold's validation RMSLE. Returns the accumulated value.
    """
    n_folds = 5
    total = 0
    for fold_id in range(n_folds):
        held_out = train['kfold'] == fold_id
        fit_df = train.loc[~held_out, :]
        val_df = train.loc[held_out, :]

        _, scores = fit_xgb(
            trial,
            fit_df[feature_cols].values, fit_df[target2].values,
            val_df[feature_cols].values, val_df[target2].values,
        )
        total = total + scores['valid rmsle'] / n_folds

    return total

In [None]:
def objective3(trial):
    """Optuna objective for ``target3``.

    For each of the 5 folds stored in ``train['kfold']``, fits a model on the
    other folds via ``fit_xgb`` and accumulates one fifth of the held-out
    fold's validation RMSLE. Returns the accumulated value.
    """
    n_folds = 5
    total = 0
    for fold_id in range(n_folds):
        held_out = train['kfold'] == fold_id
        fit_df = train.loc[~held_out, :]
        val_df = train.loc[held_out, :]

        _, scores = fit_xgb(
            trial,
            fit_df[feature_cols].values, fit_df[target3].values,
            val_df[feature_cols].values, val_df[target3].values,
        )
        total = total + scores['valid rmsle'] / n_folds

    return total

In [None]:
# Hyper-parameter search for target 1 (objective1), minimizing the CV RMSLE.
# The sampler is given a fixed seed so that a fresh "Restart & Run All" does
# not draw a different random sequence of suggestions each time. TPESampler
# is Optuna's default single-objective sampler, so only the seeding changes.
study = optuna.create_study(
    direction = "minimize",
    study_name = 'Target 1 optimization',
    sampler = optuna.samplers.TPESampler(seed = 42),
)
study.optimize(objective1, n_trials = 10)

In [None]:
# Trial log as a DataFrame, displayed with the lowest objective value first.
history = study.trials_dataframe()
history.sort_values("value")

In [None]:
# Display the hyper-parameters of the best trial in the current study.
study.best_params

In [None]:
# Final model for target 1: tuned hyper-parameters plus the same fixed
# settings (seed and eval metric) that fit_xgb used while scoring them, so the
# submitted model matches the configuration that was cross-validated.
clf1 = xgb.XGBRegressor(**study.best_params, random_state = 42, eval_metric = "rmsle")

In [None]:
# Fit on the full training set. `eval_metric` is not passed to fit(): that
# keyword was deprecated and later removed from XGBoost's sklearn fit(), and
# with no eval_set supplied it never influenced training anyway.
clf1.fit(train[feature_cols], train[target1])

In [None]:
# Predict target 1 on the test set; clip at 0.1 as done during validation.
preds1 = np.clip(clf1.predict(test[feature_cols]), 0.1, None)
# `target1` is a one-element list. Assign through the column label itself:
# setting a list-of-columns key from a 1-D array can fail in pandas with
# "Columns must be same length as key".
sample[target1[0]] = preds1

In [None]:
# Hyper-parameter search for target 2 (objective2), minimizing the CV RMSLE.
# The sampler is given a fixed seed so that a fresh "Restart & Run All" does
# not draw a different random sequence of suggestions each time. TPESampler
# is Optuna's default single-objective sampler, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    study_name='Target 2 optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective2, n_trials=10)

In [None]:
# Trial log as a DataFrame, displayed with the lowest objective value first.
history = study.trials_dataframe()
history.sort_values("value")

In [None]:
# Display the hyper-parameters of the best trial in the current study.
study.best_params

In [None]:
# Final model for target 2: tuned hyper-parameters plus the same fixed
# settings (seed and eval metric) that fit_xgb used while scoring them, so the
# submitted model matches the configuration that was cross-validated.
clf2 = xgb.XGBRegressor(**study.best_params, random_state=42, eval_metric="rmsle")

In [None]:
# Fit on the full training set. `eval_metric` is not passed to fit(): that
# keyword was deprecated and later removed from XGBoost's sklearn fit(), and
# with no eval_set supplied it never influenced training anyway.
clf2.fit(train[feature_cols], train[target2])

In [None]:
# Predict target 2 on the test set; clip at 0.1 as done during validation.
preds2 = np.clip(clf2.predict(test[feature_cols]), 0.1, None)
# `target2` is a one-element list. Assign through the column label itself:
# setting a list-of-columns key from a 1-D array can fail in pandas with
# "Columns must be same length as key".
sample[target2[0]] = preds2

In [None]:
# Hyper-parameter search for target 3 (objective3), minimizing the CV RMSLE.
# The sampler is given a fixed seed so that a fresh "Restart & Run All" does
# not draw a different random sequence of suggestions each time. TPESampler
# is Optuna's default single-objective sampler, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    study_name='Target 3 optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective3, n_trials=20)

In [None]:
# Trial log as a DataFrame, displayed with the lowest objective value first.
history = study.trials_dataframe()
history.sort_values("value")

In [None]:
# Display the hyper-parameters of the best trial in the current study.
study.best_params

In [None]:
# Final model for target 3: tuned hyper-parameters plus the same fixed
# settings (seed and eval metric) that fit_xgb used while scoring them, so the
# submitted model matches the configuration that was cross-validated.
clf3 = xgb.XGBRegressor(**study.best_params, random_state=42, eval_metric="rmsle")

In [None]:
# Fit on the full training set. `eval_metric` is not passed to fit(): that
# keyword was deprecated and later removed from XGBoost's sklearn fit(), and
# with no eval_set supplied it never influenced training anyway.
clf3.fit(train[feature_cols], train[target3])

In [None]:
# Predict target 3 on the test set; clip at 0.1 as done during validation.
preds3 = np.clip(clf3.predict(test[feature_cols]), 0.1, None)
# `target3` is a one-element list. Assign through the column label itself:
# setting a list-of-columns key from a 1-D array can fail in pandas with
# "Columns must be same length as key".
sample[target3[0]] = preds3

In [None]:
# Write the filled-in submission template without the index column.
sample.to_csv(path_or_buf="submission.csv", index=False)