# CatBoostClassifier

Related notebooks:

https://www.kaggle.com/agorinenko/feb-2022-part1-eda

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier, Pool, metrics

# Load data from the EDA notebook

In [None]:
# Inputs are the CSV files produced by the EDA notebook.
# Train is indexed by row_id; test keeps row_id as a regular column
# because it is needed later when the submission is assembled.
test_df = pd.read_csv('../input/feb-2022-eda/test.csv')
train_df = pd.read_csv(
    '../input/feb-2022-eda/train.csv',
    index_col="row_id",
)

In [None]:
# Preview the first rows of the training frame
train_df.head(n=5)

Let's separate the target variable and the features.

In [None]:
# Every column that is neither an identifier nor a label is a model feature.
# 'target_num' (the encoded label added to train_df further down) is excluded too,
# so re-running this cell after the encoding cell cannot leak the label into the features.
non_feature_columns = {'row_id', 'target', 'target_num'}
features_columns = [e for e in train_df.columns if e not in non_feature_columns]

Encode the target variable.

In [None]:
# Name of the column that stores the integer-encoded label
target_col = 'target_num'

# Learn the label -> integer mapping, then store the encoded labels.
# `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
le.fit(train_df['target'])
train_df[target_col] = le.transform(train_df['target'])

train_df.head(n=5)

In [None]:
# Model inputs: feature matrices and the encoded label, all cast to float64
float_dtype = np.float64

X_train = train_df.loc[:, features_columns].astype(float_dtype)
X_test = test_df.loc[:, features_columns].astype(float_dtype)

y_train = train_df.loc[:, target_col].astype(float_dtype)

# Tuning hyperparameters with Optuna

In [None]:
!pip install optuna -q

In [None]:
import optuna
from sklearn.metrics import accuracy_score

In [None]:
def objective(trial, test_size=0.3, split_seed=42):
    """Optuna objective: fit one CatBoost candidate and return its hold-out accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    test_size : float, default 0.3
        Fraction of ``X_train`` / ``y_train`` held out for validation.
    split_seed : int, default 42
        Seed of the train/validation split. Keeping it fixed means every
        trial is scored on the same rows, so trial values are comparable.

    Returns
    -------
    float
        Accuracy of the fitted model on the hold-out part.
    """
    # Fixed, stratified split: without a seed each trial would be evaluated on
    # a different random subset and the "best" trial could simply be a lucky split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train,
        y_train,
        test_size=test_size,
        random_state=split_seed,
        stratify=y_train,
    )

    param = {
        "task_type": 'GPU',
        "loss_function": 'MultiClass',
        "random_seed": 42,
        "logging_level": 'Silent',
        "eval_metric": "Accuracy",
        'iterations': trial.suggest_int('iterations', 1000, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.36, step=0.05),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 5, 20, step=0.5),
        # NOTE(review): CatBoost appears to document min_child_samples as an integer
        # option; sampling a float in [0.1, 1] looks unintended - confirm.
        'min_child_samples': trial.suggest_float('min_child_samples', 0.1, 1, step=0.1),
        "depth": trial.suggest_int("depth", 1, 12, step=1),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"]
        )
    }

    # Each bootstrap type has its own extra knob, sampled only when relevant
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**param)
    # The hold-out part doubles as the early-stopping set
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    # The caller-side code treats predictions as 2-D, so flatten before scoring
    preds = gbm.predict(valid_x).flatten()

    return accuracy_score(valid_y, preds)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# (GPU training itself may still add some run-to-run noise).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
# `trial` is reused below to build the final model
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Validate

In [None]:
# Fixed settings (same values as in the tuning objective) plus the tuned
# hyperparameters of the best trial
fixed_params = {
    'task_type': 'GPU',
    'loss_function': 'MultiClass',
    'random_seed': 42,
    'logging_level': 'Silent',
}
model = CatBoostClassifier(**fixed_params, **trial.params)

In [None]:
# 5-fold cross-validation of the tuned model, scored by accuracy
cv_options = {'cv': 5, 'scoring': 'accuracy'}
scores = cross_validate(model, X_train, y_train, **cv_options)

In [None]:
# Per-fold accuracies returned by cross_validate, averaged for reporting
fold_accuracy = scores["test_score"]
print(f'Mean validation accuracy score: {fold_accuracy.mean()}')

# Train model

In [None]:
%%time
model.fit(X_train, y_train)

# Predict

In [None]:
# Predicted class labels (encoded) for the test features
y_pred = model.predict(X_test, prediction_type='Class')

# Submission

In [None]:
def save_submission(y_pred, test_frame=None, encoder=None, path="submission.csv"):
    """Decode predicted class indices and write the submission CSV.

    Parameters
    ----------
    y_pred : array-like
        Predicted encoded labels, one per test row. Any shape is accepted
        and flattened (e.g. an ``(n, 1)`` array).
    test_frame : pandas.DataFrame, optional
        Frame providing the ``row_id`` column. Defaults to the notebook-level
        ``test_df``.
    encoder : object with ``inverse_transform``, optional
        Maps integer codes back to the original labels. Defaults to the
        notebook-level ``le``.
    path : str, default "submission.csv"
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Two-column frame (``row_id``, ``target``) that was written to ``path``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    # Fall back to the notebook globals so existing calls keep working
    if test_frame is None:
        test_frame = test_df
    if encoder is None:
        encoder = le

    y_pred = np.asarray(y_pred).ravel()

    # Validate before anything is assigned or written
    if len(y_pred) != len(test_frame):
        raise ValueError(
            f"Got {len(y_pred)} predictions for {len(test_frame)} test rows"
        )

    # Round before casting: a plain int cast would truncate e.g. 1.9999999 to 1
    class_codes = np.rint(y_pred).astype(np.int64)
    y_pred_class = encoder.inverse_transform(class_codes)

    submission = test_frame[['row_id']].copy()
    submission["target"] = y_pred_class

    submission.to_csv(path, index=False)
    return submission

In [None]:
# Write submission.csv and preview the first rows of what was saved
submission_preview = save_submission(y_pred.ravel())
submission_preview.head()