# Load Dependencies

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

from catboost import CatBoostClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [None]:
# Competition inputs, read in this order: the labelled training table, the
# test table to predict, and the sample submission whose layout is reused
# when the prediction files are written.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Cast every column of both tables to pandas' `category` dtype. This covers
# all columns, including `id` and (for train) `target`; the conversion is
# performed column by column, so each column keeps its own category set.
train = train.astype('category')
test = test.astype('category')

In [None]:
# Feature matrices: everything except the row identifier (and, for train,
# the label column).
X_test = test.drop(['id'], axis=1)
X = train.drop(['id', 'target'], axis=1)

# Integer class label = first digit found in the `target` string
# (presumably labels look like "Class_3" -- confirm against the data).
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# NOTE: with the default expand=True, `str.extract` returns a one-column
# DataFrame, so `y` is a DataFrame (column label 0), not a Series.
y = train.target.str.extract(r"(\d)").astype(int)

In [None]:
def _read_lightautoml_probs(csv_path):
    """Read a LightAutoML prediction file as stacking features.

    Drops the `id` column and renames the remaining columns to
    `pred_lightautoml1` .. `pred_lightautoml9`.
    """
    probs = pd.read_csv(csv_path).drop('id', axis=1)
    probs.columns = ['pred_lightautoml' + str(k) for k in range(1, 10)]
    return probs


# Out-of-fold predictions extend the training features; the matching test-set
# predictions extend the test features.
oof_lightautoml = _read_lightautoml_probs('../input/tps-jun2021-lightautoml/oof_lightautoml.csv')
sub_lightautoml = _read_lightautoml_probs('../input/tps-jun2021-lightautoml/sub_lightautoml.csv')

# Column-wise concat aligns on the row index.
# NOTE: this cell is not idempotent -- re-running it without rebuilding X and
# X_test appends the prediction columns a second time.
X = pd.concat([X, oof_lightautoml], axis=1)
X_test = pd.concat([X_test, sub_lightautoml], axis=1)

In [None]:
K = 5        # number of stratified cross-validation folds
SEED = 314   # seed shared by CatBoost (`random_state`) and the fold splitter
ESR = 100    # value passed as `early_stopping_rounds` to every fit() call

# CatBoost settings shared by the baseline, every Optuna trial and the final
# model; tuned hyperparameters are merged on top of these.
fixed_params = dict(
    random_state=SEED,
    task_type="GPU",
    # Upper bound only: training is expected to stop early on the eval set.
    iterations=100000,
    # NOTE(review): every fit() call in this notebook also passes
    # early_stopping_rounds=ESR (100); confirm which patience value CatBoost
    # actually applies when both are given.
    od_wait=50,
    loss_function='MultiClass',
    use_best_model=True,
    eval_metric='MultiClass',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    boosting_type="Plain",
)

# Shuffled, seeded stratified splitter reused by every CV loop below, so the
# baseline, the Optuna trials and the final model are built from the same
# splitter configuration.
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)

# Baseline

In [None]:
# Baseline: K-fold CatBoost using only `fixed_params` (no tuning).
# cat_oof  -- out-of-fold class probabilities, one row per training sample
# cat_pred -- test-set probabilities averaged over the K fold models
cat_oof = np.zeros((X.shape[0], 9))
cat_pred = 0
cat_feature_names = ["feature_" + str(n) for n in range(75)]

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    start = time.time()

    model = CatBoostClassifier(cat_features=cat_feature_names, **fixed_params)
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=ESR,
        verbose=False,
    )

    # Fill this fold's slice of the OOF matrix and add this model's share of
    # the averaged test prediction.
    cat_oof[val_idx, :] = model.predict_proba(X_val)
    cat_pred += model.predict_proba(X_test) / K

    cat_logloss = log_loss(y_val, cat_oof[val_idx])
    print(f"score: {cat_logloss:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    # Drop the reference to the fitted model before the next fold.
    del model

# Overall score across all out-of-fold predictions.
cat_logloss = log_loss(y, cat_oof)
print(f"Final logloss score: {cat_logloss} ✔️ ")

In [None]:
# Baseline submission: overwrite every column after the first with the
# fold-averaged test probabilities.
sub.iloc[:, 1:] = cat_pred
sub.to_csv("sub_cat_default.csv", index=False)

# Baseline out-of-fold probabilities, keyed by the training `id`.
class_columns = ["Class_1", "Class_2", "Class_3",
                 "Class_4", "Class_5", "Class_6",
                 "Class_7", "Class_8", "Class_9"]
oof_frame = pd.DataFrame(cat_oof, columns=class_columns)
oof_cat = pd.concat([train.id, oof_frame], axis=1)
oof_cat.to_csv("oof_cat_default.csv", index=False)

# Optuna

In [None]:
def objective(trial):
    """Optuna objective: K-fold out-of-fold log-loss of a CatBoost classifier.

    Samples one hyperparameter configuration from ``trial``, merges it with
    the module-level ``fixed_params``, trains one model per fold of ``kf`` on
    the module-level ``X`` / ``y`` and scores the assembled out-of-fold
    probabilities.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Multi-class log-loss over all out-of-fold predictions (lower is
        better).

    Notes
    -----
    No intermediate value is reported through ``trial.report()``, so a pruner
    attached to the study has nothing to act on and cannot stop a trial
    early.
    """
    max_depth = trial.suggest_int('depth', 2, 6)
    # Upper bound for the Lossguide leaf count, derived from the sampled
    # depth; with depth <= 6 this is at most 63.
    max_num_leaves = (2 ** max_depth) - 1

    hyperparams = {
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform (log=True keeps the log-uniform prior).
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 50),
        'max_bin': trial.suggest_categorical('max_bin', [2, 3, 4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        "depth": max_depth,
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 100, log=True),
        # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise
        # and Lossguide policies only -- confirm how it is treated when
        # grow_policy is SymmetricTree.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
    }

    if hyperparams['grow_policy'] == "Lossguide":
        # The sampled value never exceeds 63 (see max_num_leaves), so the
        # former `>= 64` fallback could not be reached and was removed.
        # The trial parameter keeps the name 'num_leaves', which is the key
        # that appears in study.best_params.
        hyperparams["max_leaves"] = trial.suggest_int('num_leaves', 3, max_num_leaves)

    params = dict(**fixed_params, **hyperparams)
    cat_features = ["feature_" + str(n) for n in range(75)]
    cat_oof = np.zeros((X.shape[0], 9))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):

        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        model = CatBoostClassifier(**params, cat_features=cat_features)

        model.fit(X_train, y_train,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=ESR,
                  verbose=False
                 )

        cat_oof[val_idx, :] = model.predict_proba(X_val)

        # Drop the reference to the fitted model before the next fold, as the
        # other CV loops in this notebook do.
        del model

    return log_loss(y, cat_oof)

In [None]:
# Minimise the out-of-fold log-loss returned by `objective`.
#
# The sampler is made explicit and seeded so the sequence of suggested
# configurations is repeatable: TPESampler is what Optuna uses by default
# for single-objective studies, so only the seeding is new. The run is still
# bounded by wall-clock time, so the number of completed trials depends on
# the hardware.
#
# NOTE: `objective` does not call trial.report()/trial.should_prune(), so the
# Hyperband pruner currently has nothing to prune on.
# Previously commented-out alternatives, kept for reference:
#   pruner  = optuna.pruners.HyperbandPruner(min_resource=100, reduction_factor=4)
#   sampler = optuna.samplers.TPESampler(n_startup_trials=50, multivariate=True, seed=123)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=SEED),
                            pruner=optuna.pruners.HyperbandPruner(),
                           )

# Time-boxed search: 60*60*7.5 seconds = 7.5 hours, with no cap on the number
# of trials.
study.optimize(objective,
               timeout=60*60*7.5,
               n_trials=None,
               gc_after_trial=False)

In [None]:
# Best objective value found by the study: the study minimises the
# out-of-fold log-loss returned by `objective`, so lower is better.
study.best_value

In [None]:
# Objective value of each trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters against the
# objective value. Called through the fully qualified name because this
# plot function is not among the names imported at the top of the notebook.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
plot_param_importances(study)

In [None]:
# Settings for the final model: the shared fixed parameters plus the best
# trial's sampled values. dict(**a, **b) raises TypeError if the two mappings
# share a key, so this relies on the tuned names never overlapping with
# `fixed_params`.
# NOTE(review): when the best trial used grow_policy="Lossguide", best_params
# carries the leaf count under the trial name 'num_leaves' (the objective
# passed it to CatBoost as 'max_leaves') -- confirm CatBoost accepts
# 'num_leaves' as an alias.
final_params = dict(**fixed_params, **study.best_params)
final_params

# Final Model

In [None]:
# Final model: the same K-fold procedure as the baseline, now with the tuned
# `final_params`.
# cat_oof  -- out-of-fold class probabilities, one row per training sample
# cat_pred -- test-set probabilities averaged over the K fold models
cat_oof = np.zeros((X.shape[0], 9))
cat_pred = 0
cat_feature_names = ["feature_" + str(n) for n in range(75)]

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    start = time.time()

    model = CatBoostClassifier(cat_features=cat_feature_names, **final_params)
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=ESR,
        verbose=False,
    )

    # Fill this fold's slice of the OOF matrix and add this model's share of
    # the averaged test prediction.
    cat_oof[val_idx, :] = model.predict_proba(X_val)
    cat_pred += model.predict_proba(X_test) / K

    cat_logloss = log_loss(y_val, cat_oof[val_idx])
    print(f"score: {cat_logloss:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    # Drop the reference to the fitted model before the next fold.
    del model

# Overall score across all out-of-fold predictions.
cat_logloss = log_loss(y, cat_oof)
print(f"Final logloss score: {cat_logloss} ✔️ ")

# Submission

In [None]:
# Tuned-model submission: overwrite every column after the first with the
# fold-averaged test probabilities, then write the file without the index.
sub.iloc[:, 1:] = cat_pred
sub.to_csv("sub_cat_optuned.csv", index=False)

In [None]:
# Tuned-model out-of-fold probabilities, keyed by the training `id`.
class_columns = ["Class_1", "Class_2", "Class_3",
                 "Class_4", "Class_5", "Class_6",
                 "Class_7", "Class_8", "Class_9"]
oof_frame = pd.DataFrame(cat_oof, columns=class_columns)
oof_cat = pd.concat([train.id, oof_frame], axis=1)
oof_cat.to_csv("oof_cat_optuned.csv", index=False)