### In this notebook, I try out `CatBoost` extensively, using Optuna to search for good hyperparameters.
### Please comment if you see anything that can be improved, or have suggestions for what to try next.

In [None]:
# imports
import os
from pathlib import Path
import warnings

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display and plotting configuration.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)
plt.style.use('ggplot')
# NOTE: this silences every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')

## Data load

In [None]:
# Read the three competition CSV files from the Kaggle input directory.
path = Path('/kaggle/input/tabular-playground-series-jun-2021/')
train_df, test_df, sample_submission = (
    pd.read_csv(path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the submission template.
sample_submission.head()

In [None]:
# (rows, columns) of the train and test frames as loaded.
train_df.shape, test_df.shape

In [None]:
# Pull the `id` column out of both frames, keeping the ids around
# in case they are needed later.
train_ids = train_df.pop('id')
test_ids = test_df.pop('id')

In [None]:
# Separate the label column from the features (removes it from train_df in place).
target = train_df.pop('target')

In [None]:
# Remove duplicated feature rows (first occurrence is kept) and keep the
# target restricted to the surviving rows.
is_first_occurrence = ~train_df.duplicated(keep='first')
train_df = train_df[is_first_occurrence]
target = target[is_first_occurrence]

train_df.shape, target.shape

In [None]:
# Convert the target values to integer codes with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
label_enc.fit(target)
encoded_target = label_enc.transform(target)
# NOTE: the new Series gets a fresh 0..n-1 index; it is not index-aligned with
# train_df once duplicate rows have been removed, only position-aligned.
target_labels = pd.Series(encoded_target, name='target')
target_labels.iloc[:5]

In [None]:
# Re-check the frame shapes after the column drops and duplicate removal.
train_df.shape, test_df.shape

In [None]:
# Null value check: total number of missing cells in each frame
# (summed over columns, then over the per-column counts).
train_df.isnull().sum().sum(), test_df.isnull().sum().sum()

*There appear to be no null values in either dataset.*

## EDA

In [None]:
# Bar chart of the class counts, each bar annotated with its share of the training rows.
target_value_counts = target.value_counts()
target_value_percent = target.value_counts()*100/len(train_df)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(target_value_counts.keys(), target_value_counts.values)
annotations = zip(
    target_value_counts.index,
    target_value_counts.values,
    target_value_percent.values,
)
for label, value, percent in annotations:
    # Place the percentage label slightly above the top of the bar.
    ax.text(label, value+1000, f'{percent:.1f}%', ha='center')
ax.set_ylim(0, 60000)
ax.set_title('Target class counts')
ax.set_xlabel('Target class')
ax.set_ylabel('Count')
ax.grid(False)
fig.tight_layout()
plt.show()

> #### Some of the target classes are heavily under-represented, i.e. the class distribution is imbalanced

In [None]:
def get_num_unique(x):
    """Return how many distinct values ``x`` holds.

    ``x`` is expected to offer a pandas-style ``unique()`` method (e.g. a
    Series); the count is simply the length of what that method returns.
    """
    distinct_values = x.unique()
    return len(distinct_values)

# Per-feature summary table: descriptive statistics, number of distinct
# values, and the percentage of rows in which the feature equals zero.
feature_unique_df = train_df.apply(get_num_unique, axis=0).to_frame('n_unique')

# Compare against 0 directly instead of using `value_counts()[0]`: that lookup
# raises KeyError for a feature that never takes the value 0, and `[0]` is
# ambiguous between "label 0" and "first position". Here such a feature
# simply gets 0.0 percent.
zero_percent_df = ((train_df == 0).sum() * 100 / len(train_df)).to_frame('per_zero')

train_describe_df = pd.concat([
    # `count` is dropped from the describe() output; the rest is kept as-is.
    train_df.describe(percentiles=[0.25, 0.5, 0.75, 0.95]).T.drop('count', axis=1),
    feature_unique_df,
    zero_percent_df
], axis=1)
train_describe_df.head()

In [None]:
# Show the summary ordered by `std` then `n_unique`, with in-cell bars for
# `mean`/`per_zero` and a colour gradient on `n_unique`/`std`.
(
    train_describe_df
    .sort_values(by=['std', 'n_unique'])
    .style
    .bar(subset=['mean', 'per_zero'])
    .background_gradient(subset=['n_unique', 'std'])
)

## Data prep

In [None]:
# Optional feature pruning, switched off by default (flag is 0).
# NOTE(review): the listed columns are presumably the mostly-zero features
# picked from the summary table; confirm before enabling.
drop_the_zero_features = 0
zero_feature_columns = ['feature_15', 'feature_17', 'feature_22', 'feature_36', 'feature_47', 'feature_49', 'feature_66', 'feature_74']
if drop_the_zero_features:
    # Drop the same columns from train and test so both keep identical features.
    for frame in (train_df, test_df):
        frame.drop(columns=zero_feature_columns, inplace=True)

train_df.shape, test_df.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 85/15 hold-out split with a fixed seed.
split_options = dict(
    test_size=0.15,
    random_state=13,
    shuffle=True,
    stratify=target_labels,
)
X_train, X_test, y_train, y_test = train_test_split(train_df, target_labels, **split_options)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## CatBoost

In [None]:
import catboost
import optuna
from sklearn.metrics import log_loss

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost model and return its hold-out log loss.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``reg_lambda``, ``subsample``,
        ``random_strength``, ``min_data_in_leaf`` and ``depth``.

    Returns
    -------
    float
        Multi-class log loss of the fitted model on ``(X_test, y_test)``.
        Lower is better.

    Notes
    -----
    ``(X_test, y_test)`` is used both for early stopping and for scoring, so
    the returned value is an optimistic estimate.
    """
    param = {
        "task_type": "GPU",
        "loss_function": 'MultiClass',
        "eval_metric": 'MultiClass',
        # Same seed as the final model so tuning runs match that configuration.
        "random_state": 13,
        # suggest_float replaces the deprecated suggest_uniform.
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 1),
        "leaf_estimation_method": "Newton",
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 100),
        # A Bernoulli subsample rate must be strictly positive; the original
        # lower bound of 0 let trials draw rates at or near zero, which leaves
        # no usable bagging sample.
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "random_strength": trial.suggest_float("random_strength", 10, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 30),
        "depth": trial.suggest_int("depth", 1, 12),
        "bootstrap_type": "Bernoulli",
    }
    gbm = catboost.CatBoostClassifier(**param)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=25)
    preds = gbm.predict_proba(X_test)
    return log_loss(y_test, preds)

In [None]:
# Search for the hyperparameters that minimise the value returned by
# `objective`. The search is capped at 50 trials and a timeout of 600.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, timeout=600)

In [None]:
# Sampled hyperparameters of the best trial in the study.
study.best_params

In [None]:
# Objective value (hold-out log loss) reached by the best trial.
study.best_value

In [None]:
# Final model configuration: the tuned values from the best trial plus the
# fixed settings that were not part of the search space.
cat_params = study.best_trial.params
cat_params.update({
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': 13,
    'task_type': 'GPU',
})

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold CV: train one model per fold with the tuned parameters,
# report its out-of-fold log loss, and average the per-fold test-set
# probabilities into `test_preds`.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)
# Convert to arrays once instead of on every fold.
features = train_df.values
labels = target.values

test_preds = None
for fold, (train_index, test_index) in enumerate(skf.split(features, labels)):
    print(f'Fold: {fold+1}')
    X_train_sub, X_test_sub = features[train_index], features[test_index]
    y_train_sub, y_test_sub = labels[train_index], labels[test_index]
    eval_set = [(X_test_sub, y_test_sub)]
    model = catboost.CatBoostClassifier(**cat_params)
    model.fit(X_train_sub, y_train_sub, eval_set=eval_set, verbose=False)
    print(f'log loss: {log_loss(y_test_sub, model.predict_proba(X_test_sub))}')

    fold_test_preds = model.predict_proba(test_df)
    if test_preds is None:
        test_preds = fold_test_preds
    else:
        # Accumulate across folds. The original assigned here, so only the
        # last fold survived and the later division shrank its probabilities
        # to a tenth of their value.
        test_preds += fold_test_preds

# Turn the accumulated sum into a mean over however many folds were run.
test_preds /= skf.get_n_splits()

In [None]:
# Copy the nine probability columns into the submission frame:
# column i of `test_preds` becomes `Class_{i+1}`.
for class_idx in range(9):
    sample_submission[f'Class_{class_idx + 1}'] = test_preds[:, class_idx]
sample_submission.head()

In [None]:
# Save the submission to the working directory, leaving out the DataFrame index.
sample_submission.to_csv('./catboost.csv', index=False)