# Imports and Configs

In [1]:
!pip install -q ucimlrepo

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.base import clone
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo 
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import pickle
import gc

warnings.filterwarnings('ignore')

In [3]:
# --- Data and cross-validation ---
SEED = 6            # random_state for the fold shuffling and for the model
N_FOLDS = 5         # default number of stratified CV splits used by Trainer
TARGET = 'class'    # label column in train.csv (values 'e' / 'p')

# --- Early stopping and logging (forwarded to model.fit) ---
ES_LOG_STEPS = 100  # `verbose` value passed to fit
ES_ROUNDS = 100     # `early_stopping_rounds` value passed to fit

# --- Decision rule ---
THRESHOLD = 0.5     # class-1 probability >= THRESHOLD is predicted as 'p'

# Loading and Processing Data

In [4]:
# Competition files; the `id` column becomes the index so it is never used as a feature.
train = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', index_col='id')

# UCI repository dataset 848, used below as the reference for valid category values.
# NOTE(review): presumably the dataset the competition data was generated from - confirm.
original = fetch_ucirepo(id=848)['data']['original']

# Quick sanity check of the three frame shapes.
tuple(frame.shape for frame in (train, test, original))

((3116945, 21), (2077964, 20), (61069, 21))

In [5]:
cat_features = list(test.select_dtypes('object').columns)

# Reference: https://www.kaggle.com/code/ambrosm/pss4e8-eda-which-makes-sense
# For every categorical feature, keep only the values that also occur in the original
# dataset and collapse everything else (unknown values and genuine gaps) into one
# explicit 'NaN' string level, so train and test share the same set of levels.
for feature in cat_features:
    categories = sorted(original[feature].dropna().unique())
    dtype = pd.CategoricalDtype(categories=categories, ordered=False)

    for frame in (train, test):
        # Casting to the fixed categorical dtype turns every value that is not in
        # `categories` into a missing value.
        cleaned = frame[feature].astype(dtype)

        # Fill BEFORE the string cast: once the column has been cast with astype(str),
        # missing values can already be the literal text 'nan', and a later
        # fillna('NaN') has nothing left to fill.
        frame[feature] = cleaned.astype(object).fillna('NaN').astype(str)

In [6]:
# Features are every training column except the label.
X = train.drop(columns=[TARGET])
# Binary encoding of the label: 'e' -> 0, 'p' -> 1 (any other value maps to NaN).
y = train[TARGET].map(dict(e=0, p=1))
# Note: this is the same object as `test`, not a copy.
X_test = test

In [7]:
def reduce_mem_usage(dataframe):
    """Downcast numeric columns to the smallest float type whose range fits them.

    Each numeric column is cast to float16 when its minimum and maximum lie strictly
    inside the float16 range, otherwise to float32 by the same rule, otherwise to
    float64. Boolean columns and all non-numeric columns (object, category, string,
    datetime, timedelta, ...) are left untouched.

    Only the value *range* is checked, not the precision: float16 keeps roughly three
    significant decimal digits and cannot represent every integer above 2048 exactly.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, after downcasting.

    Side effects
    ------------
    Prints the memory usage (in MB) before and after, and the relative saving.
    """
    print('Reducing memory usage')
    initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

    for col in dataframe.columns:
        col_type = dataframe[col].dtype

        # Select by dtype kind rather than by dtype name: comparing e.g. timestamps or
        # strings with float limits raises TypeError, and casting booleans to float16
        # would double their size.
        is_numeric = pd.api.types.is_numeric_dtype(col_type)
        if not is_numeric or pd.api.types.is_bool_dtype(col_type):
            continue

        c_min = dataframe[col].min()
        c_max = dataframe[col].max()

        # Pick the narrowest float type whose open range contains [c_min, c_max].
        # For an all-NaN column both comparisons are False, so float64 is used.
        for float_type in (np.float16, np.float32):
            limits = np.finfo(float_type)
            if c_min > limits.min and c_max < limits.max:
                break
        else:
            float_type = np.float64

        dataframe[col] = dataframe[col].astype(float_type)

    final_mem_usage = dataframe.memory_usage().sum() / 1024**2

    # Guard the ratio so a frame reporting zero bytes does not produce NaN.
    if initial_mem_usage:
        saved_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage
    else:
        saved_pct = 0.0

    print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
    print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
    print('--- Decreased memory usage by {:.1f}%\n'.format(saved_pct))

    return dataframe

In [8]:
# Downcast the numeric columns of both feature frames; one report is printed per
# frame, training features first.
X, X_test = (reduce_mem_usage(frame) for frame in (X, X_test))

Reducing memory usage
--- Memory usage before: 499.39 MB
--- Memory usage after: 445.88 MB
--- Decreased memory usage by 10.7%

Reducing memory usage
--- Memory usage before: 332.93 MB
--- Memory usage after: 297.26 MB
--- Decreased memory usage by 10.7%



# Training

In [9]:
class Trainer:
    """Stratified K-fold training loop that also persists predictions.

    A fresh clone of ``model`` is fitted on every fold. The estimator must accept
    ``eval_set``, ``verbose``, ``early_stopping_rounds`` and ``use_best_model`` in
    ``fit`` and expose ``predict_proba``.
    """

    def __init__(self, model, n_folds=N_FOLDS):
        """Store the unfitted estimator template and the number of CV folds."""
        self.model = model
        self.n_folds = n_folds

    def fit_predict(self, X, y, X_test):
        """Cross-validate the model and write OOF/test predictions and a submission.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (rows are selected with ``.iloc``).
        y : pandas.Series or array-like
            Training labels, aligned row-by-row with ``X``.
        X_test : pandas.DataFrame
            Test features; predicted by every fold model, the fold probabilities
            are averaged.

        Returns
        -------
        None
            Results are written to disk: two pickles with the out-of-fold and the
            averaged test probabilities, and a submission CSV. The per-fold and the
            average MCC are printed.
        """
        print(f'Training {self.model.__class__.__name__}\n')

        n_classes = len(np.unique(y))
        scores = []
        oof_pred_probs = np.zeros((X.shape[0], n_classes))
        test_pred_probs = np.zeros((X_test.shape[0], n_classes))

        # The splitter yields *positional* row numbers. Plain `y[idx]` on a Series is
        # a label lookup, which only agrees with positions when the index happens to
        # be 0..N-1, so slice by position whenever `y` supports it.
        y_by_position = y.iloc if hasattr(y, 'iloc') else y

        skf = StratifiedKFold(n_splits=self.n_folds, random_state=SEED, shuffle=True)
        for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y_by_position[train_idx], y_by_position[val_idx]

            # Clone so every fold starts from the same unfitted configuration.
            model = clone(self.model)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                verbose=ES_LOG_STEPS,
                early_stopping_rounds=ES_ROUNDS,
                use_best_model=True
            )

            y_pred_probs = model.predict_proba(X_val)
            oof_pred_probs[val_idx] = y_pred_probs
            y_preds = y_pred_probs[:, 1] >= THRESHOLD

            # Each fold contributes 1/n_folds of the final test probabilities.
            temp_test_pred_probs = model.predict_proba(X_test)
            test_pred_probs += temp_test_pred_probs / self.n_folds

            score = matthews_corrcoef(y_val, y_preds)
            scores.append(score)

            # Release the fold's large objects before the next fold is built.
            del model, X_train, y_train, X_val, y_val, y_pred_probs, temp_test_pred_probs, y_preds
            gc.collect()

            print(f'\n--- Fold {fold_idx + 1} - MCC: {score:.6f}\n\n')

        mean_score = np.mean(scores)
        self._save_pred_probs(oof_pred_probs, mean_score, 'oof')
        self._save_pred_probs(test_pred_probs, mean_score, 'test')
        self._save_submission(test_pred_probs, mean_score)

        print(f'------ Average MCC: {mean_score:.6f} ± {np.std(scores):.6f}\n\n')

    def _model_name(self):
        """Return the lower-cased model class name with 'classifier' removed."""
        return self.model.__class__.__name__.lower().replace('classifier', '')

    def _save_pred_probs(self, pred_probs, cv_score, name):
        """Pickle ``pred_probs`` to '<model>_<name>_pred_probs_<cv_score>.pkl'."""
        model_name = self._model_name()
        with open(f'{model_name}_{name}_pred_probs_{cv_score:.6f}.pkl', 'wb') as f:
            pickle.dump(pred_probs, f)

    def _save_submission(self, test_pred_probs, score):
        """Threshold the class-1 test probabilities and write 'sub_<model>_<score>.csv'.

        The sample submission supplies the row ids; its rows are assumed to be in the
        same order as the rows of the test features that produced ``test_pred_probs``.
        """
        name = self._model_name()
        sub = pd.read_csv('/kaggle/input/playground-series-s4e8/sample_submission.csv')
        sub[TARGET] = test_pred_probs[:, 1] >= THRESHOLD
        sub[TARGET] = sub[TARGET].map({False: 'e', True: 'p'})
        sub.to_csv(f'sub_{name}_{score:.6f}.csv', index=False)

In [10]:
# CatBoost hyperparameters, grouped by purpose. The values are fixed constants here;
# how they were chosen is not shown in this notebook.
params = dict(
    # Objective and the metric reported / used for early stopping
    loss_function="Logloss",
    eval_metric="MCC",

    # Boosting schedule (upper bound; fit is called with early stopping in Trainer)
    iterations=5000,
    learning_rate=0.028124773056202795,

    # Tree shape and regularisation
    depth=12,
    border_count=277,
    min_data_in_leaf=223,
    l2_leaf_reg=0.8547747497093361,

    # Randomisation and sampling
    random_state=SEED,
    random_strength=0.4703879895777699,
    subsample=0.5101541154482044,
    colsample_bylevel=0.10259634252627849,

    # Runtime; Trainer passes its own `verbose` to fit (logs every 100 iterations above)
    thread_count=-1,
    verbose=False,

    # Columns to be treated as categorical
    cat_features=cat_features,
)

model = CatBoostClassifier(**params)
trainer = Trainer(model)
trainer.fit_predict(X, y, X_test)

Training CatBoostClassifier

0:	learn: 0.8104832	test: 0.8108990	best: 0.8108990 (0)	total: 5.98s	remaining: 8h 18m 34s
100:	learn: 0.9812387	test: 0.9805204	best: 0.9805204 (100)	total: 5m 44s	remaining: 4h 38m 9s
200:	learn: 0.9830731	test: 0.9823701	best: 0.9823701 (200)	total: 12m 12s	remaining: 4h 51m 19s
300:	learn: 0.9839098	test: 0.9830145	best: 0.9830145 (300)	total: 18m 12s	remaining: 4h 44m 17s
400:	learn: 0.9844609	test: 0.9834256	best: 0.9834321 (394)	total: 24m 24s	remaining: 4h 39m 57s
500:	learn: 0.9848082	test: 0.9836425	best: 0.9836425 (500)	total: 30m 22s	remaining: 4h 32m 47s
600:	learn: 0.9850987	test: 0.9837786	best: 0.9837786 (599)	total: 36m 52s	remaining: 4h 29m 57s
700:	learn: 0.9853189	test: 0.9839404	best: 0.9839404 (700)	total: 43m 29s	remaining: 4h 26m 41s
800:	learn: 0.9855300	test: 0.9840150	best: 0.9840247 (784)	total: 50m 8s	remaining: 4h 22m 52s
900:	learn: 0.9856943	test: 0.9840670	best: 0.9840670 (900)	total: 56m 8s	remaining: 4h 15m 26s
1000:	learn