# Setup

In [1]:
# Standard library.
import gc
import warnings
import subprocess

# Make sure automatic garbage collection is switched on.
gc.enable()
# NOTE: this silences every warning for the rest of the session, including
# deprecation and convergence warnings raised by the libraries imported below.
warnings.filterwarnings('ignore')

# Numerical / tabular stack.
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Metric and splitter used by the cross-validation helper.
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# Gradient boosting library used for all experiments.
import catboost as cb

# Single seed shared by the CV splitter default and the CatBoost parameters.
SEED = 55

In [2]:
# Guard: stop immediately unless the installed CatBoost is exactly 1.2.2,
# the version the assertion message attributes to the original notebook.
assert cb.__version__ == '1.2.2', 'CatBoost version differs from original notebook.'

In [3]:
# Check GPU availability: assume a GPU and fall back to CPU if `nvidia-smi`
# cannot be run successfully (missing binary, non-zero exit status, ...).
DEVICE = 'GPU'
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'CPU'

print(f'Available device: {DEVICE}')

Available device: CPU


In [4]:
DATA_DIR = '/kaggle/input/bitgrit-ai-generated-text-classification/ai-text-competition'

# Read the three competition CSVs in one pass: the training set, the test set,
# and the submission template (in that order).
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('training_set', 'test_set', 'solution_format')
)

In [5]:
# Remove the row identifier from both frames. `train`/`test` are rebound in
# place, so `errors='ignore'` keeps this cell safe to re-run: a second
# execution no longer raises KeyError because 'ID' is already gone.
train = train.drop(columns='ID', errors='ignore')
test = test.drop(columns='ID', errors='ignore')

# Model inputs: two named columns plus the 768 columns feature_0 .. feature_767.
features = ['word_count', 'punc_num'] + [f'feature_{i}' for i in range(768)]
# Name of the label column in `train`.
TARGET = 'ind'

# Modeling framework

In [6]:
def probs_to_labels(probs, threshold=0.5):
    """Binarise probabilities: 1 where ``probs >= threshold``, else 0.

    ``probs`` must support element-wise ``>=`` and ``.astype`` (e.g. a NumPy
    array or a pandas Series); the result has the same container type.
    """
    is_positive = probs >= threshold
    return is_positive.astype('int')

In [7]:
def get_best_threshold(y_true, y_probs):
    """Return the cut-off in [0, 1) (step 0.0005) that maximises F1.

    Every candidate threshold is applied to ``y_probs`` via ``probs_to_labels``
    and scored against ``y_true`` with ``f1_score``. On ties the smallest
    threshold wins, because ``argmax`` returns the first maximum.
    """
    grid = np.arange(0, 1, 0.0005)
    f1_per_cut = []
    for cut in grid:
        f1_per_cut.append(f1_score(y_true, probs_to_labels(y_probs, cut)))
    return grid[np.argmax(f1_per_cut)]

In [8]:
def custom_cv(features, model, folds=7, seed=SEED):
    """Stratified K-fold CV with a per-fold F1-optimal decision threshold.

    Works on the module-level ``train`` and ``test`` frames. In each fold,
    ``model.fit`` is called on the training rows with the held-out rows passed
    as ``eval_set``. The threshold that maximises F1 on the held-out
    probabilities is then used to label both the held-out rows and the test set.

    Args:
        features: column names used as model inputs.
        model: estimator exposing ``fit(..., eval_set=, verbose=)``,
            ``predict_proba`` and ``best_iteration_``; the same object is
            fitted again in every fold.
        folds: number of stratified splits.
        seed: ``random_state`` of the shuffled splitter.

    Returns:
        Tuple ``(oof_probs, oof_preds, test_probs, test_preds)``:
        ``oof_probs`` / ``oof_preds`` are Series keyed by row position in
        ``train``; ``test_probs`` has one column per fold plus ``'mean'``;
        ``test_preds`` has one column per fold plus ``'mode'``.
    """
    X = train[features]
    y = train[TARGET]
    test_X = test[features]

    prob_by_row, label_by_row = {}, {}
    fold_test_probs, fold_test_labels = {}, {}
    scores = []

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        fit_X, fit_y = X.iloc[fit_rows], y.iloc[fit_rows]
        holdout_X, holdout_y = X.iloc[holdout_rows], y.iloc[holdout_rows]

        model.fit(fit_X, fit_y, eval_set=[(holdout_X, holdout_y)], verbose=False)

        # Column 1 of predict_proba is used as the score that gets thresholded.
        holdout_probs = model.predict_proba(holdout_X)[:, 1]
        for row, prob in zip(holdout_rows, holdout_probs):
            prob_by_row[row] = prob

        # The threshold is tuned on the same held-out rows it is scored on.
        best_threshold = get_best_threshold(holdout_y, holdout_probs)
        holdout_labels = probs_to_labels(holdout_probs, best_threshold)
        for row, label in zip(holdout_rows, holdout_labels):
            label_by_row[row] = label

        column = f'fold{fold}'
        fold_test_probs[column] = model.predict_proba(test_X)[:, 1]
        fold_test_labels[column] = probs_to_labels(fold_test_probs[column], best_threshold)

        score = f1_score(holdout_y, holdout_labels)
        scores.append(score)
        print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>5} rounds, best threshold: {best_threshold:.4f})')
        gc.collect()

    # Mean of the fold-wise test probabilities.
    test_probs = pd.DataFrame.from_dict(fold_test_probs)
    test_probs['mean'] = test_probs.mean(axis=1)

    # Mode (majority vote) of the fold-wise test labels; [0] takes the first mode.
    test_preds = pd.DataFrame.from_dict(fold_test_labels)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')

    # Re-order the out-of-fold values by row position so they line up with y.
    oof_probs = pd.Series(prob_by_row).sort_index()
    oof_preds = pd.Series(label_by_row).sort_index()
    print(f'\nAvg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {f1_score(y, oof_preds):.5f}\n')

    return oof_probs, oof_preds, test_probs, test_preds

In [9]:
def oof_thresholding(oof_probs, test_probs):
    """Pick one threshold from all out-of-fold probabilities and apply it.

    Unlike ``custom_cv``, which tunes a threshold per fold, this searches a
    single F1-maximising threshold over the full OOF probabilities against
    ``train[TARGET]``, prints it together with the resulting OOF F1, and uses
    it to label the ``'mean'`` column of ``test_probs``.

    Returns:
        Tuple ``(oof_preds, test_preds)`` of thresholded labels.
    """
    y_true = train[TARGET]
    threshold = get_best_threshold(y_true, oof_probs)
    print(f'OOF best threshold: {threshold}')

    oof_preds = probs_to_labels(oof_probs, threshold)
    oof_f1 = f1_score(y_true, oof_preds)
    print(f'OOF score: {oof_f1:.5f}\n')

    return oof_preds, probs_to_labels(test_probs['mean'], threshold)

# Experiments

In [10]:
# CatBoost settings shared by every experiment; each experiment cell adds its
# own `eval_metric` (and optionally class-weighting) on top of these.
base_params = dict(
    random_seed=SEED,
    loss_function='Logloss',
    iterations=10000,            # upper bound on boosting rounds
    learning_rate=0.01,
    early_stopping_rounds=200,
    use_best_model=True,
    task_type=DEVICE,            # 'GPU' or 'CPU', as detected earlier
    thread_count=-1,
)

In [11]:
# Results keyed by experiment number:
#   fold_preds[expt] -> output of custom_cv (fold-wise thresholds)
#   ot_preds[expt]   -> output of oof_thresholding (single OOF threshold)
fold_preds, ot_preds = {}, {}

### Expt 1 - eval_metric='F1'

In [12]:
%%time
expt = 1

# Experiment 1: F1 as CatBoost's eval_metric.
params = dict(eval_metric='F1')
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.59438 ( 1028 rounds, best threshold: 0.2755)
Fold # 1: 0.60188 ( 1427 rounds, best threshold: 0.1875)
Fold # 2: 0.61806 (  933 rounds, best threshold: 0.2065)
Fold # 3: 0.68132 ( 1304 rounds, best threshold: 0.2420)
Fold # 4: 0.63230 ( 1174 rounds, best threshold: 0.2185)
Fold # 5: 0.61481 (  804 rounds, best threshold: 0.2390)
Fold # 6: 0.68667 ( 2192 rounds, best threshold: 0.2365)

Avg. score: 0.63277 +/- 0.03429
OOF score: 0.63317

OOF best threshold: 0.2185
OOF score: 0.62117

CPU times: user 1h 18min 3s, sys: 1min 5s, total: 1h 19min 8s
Wall time: 21min 10s


### Expt 2 - eval_metric='PRAUC'

In [13]:
%%time
expt = 2

# Experiment 2: classic PR-AUC as CatBoost's eval_metric.
params = dict(eval_metric='PRAUC:type=Classic')
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.63241 ( 2397 rounds, best threshold: 0.3040)
Fold # 1: 0.63768 ( 6044 rounds, best threshold: 0.1825)
Fold # 2: 0.66897 ( 2754 rounds, best threshold: 0.1990)
Fold # 3: 0.70508 ( 3589 rounds, best threshold: 0.1885)
Fold # 4: 0.65563 ( 3757 rounds, best threshold: 0.1845)
Fold # 5: 0.64964 ( 1895 rounds, best threshold: 0.2175)
Fold # 6: 0.69935 ( 6216 rounds, best threshold: 0.1600)

Avg. score: 0.66411 +/- 0.02654
OOF score: 0.66533

OOF best threshold: 0.1845
OOF score: 0.65634

CPU times: user 3h 29min 54s, sys: 2min 55s, total: 3h 32min 50s
Wall time: 56min 9s


### Expt 3 - eval_metric='Logloss'

In [14]:
%%time
expt = 3

# Experiment 3: Logloss as CatBoost's eval_metric (same as the loss function).
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.63241 ( 2398 rounds, best threshold: 0.3045)
Fold # 1: 0.62745 ( 2474 rounds, best threshold: 0.2745)
Fold # 2: 0.66897 ( 2757 rounds, best threshold: 0.1990)
Fold # 3: 0.70548 ( 3572 rounds, best threshold: 0.1905)
Fold # 4: 0.65278 ( 2712 rounds, best threshold: 0.2200)
Fold # 5: 0.65480 ( 2713 rounds, best threshold: 0.1970)
Fold # 6: 0.69625 ( 3534 rounds, best threshold: 0.2380)

Avg. score: 0.66259 +/- 0.02756
OOF score: 0.66393

OOF best threshold: 0.199
OOF score: 0.65012

CPU times: user 2h 41min 38s, sys: 2min 15s, total: 2h 43min 53s
Wall time: 43min


### Expt 4 - eval_metric='PRAUC', class imbalance: auto_class_weights='Balanced'

In [23]:
%%time
expt = 4

# Experiment 4: PR-AUC eval_metric plus 'Balanced' automatic class weights.
params = dict(
    eval_metric='PRAUC:type=Classic',
    auto_class_weights='Balanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.61484 ( 2244 rounds, best threshold: 0.4880)
Fold # 1: 0.64567 ( 3744 rounds, best threshold: 0.5850)
Fold # 2: 0.66920 ( 3029 rounds, best threshold: 0.5010)
Fold # 3: 0.69935 ( 3143 rounds, best threshold: 0.3685)
Fold # 4: 0.63950 ( 1880 rounds, best threshold: 0.4360)
Fold # 5: 0.64748 ( 2753 rounds, best threshold: 0.4880)
Fold # 6: 0.66917 ( 2251 rounds, best threshold: 0.6395)

Avg. score: 0.65503 +/- 0.02497
OOF score: 0.65515

OOF best threshold: 0.459
OOF score: 0.64201

CPU times: user 2h 37min 30s, sys: 2min 9s, total: 2h 39min 39s
Wall time: 42min 15s


### Expt 5 - eval_metric='PRAUC', class imbalance: auto_class_weights='SqrtBalanced'

In [24]:
%%time
expt = 5

# Experiment 5: PR-AUC eval_metric plus 'SqrtBalanced' automatic class weights.
params = dict(
    eval_metric='PRAUC:type=Classic',
    auto_class_weights='SqrtBalanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.63241 ( 3908 rounds, best threshold: 0.3630)
Fold # 1: 0.64906 ( 4564 rounds, best threshold: 0.3040)
Fold # 2: 0.68942 ( 3736 rounds, best threshold: 0.2355)
Fold # 3: 0.70946 ( 2423 rounds, best threshold: 0.3030)
Fold # 4: 0.64207 ( 2352 rounds, best threshold: 0.3640)
Fold # 5: 0.67143 ( 2921 rounds, best threshold: 0.3095)
Fold # 6: 0.68635 ( 3618 rounds, best threshold: 0.3775)

Avg. score: 0.66860 +/- 0.02624
OOF score: 0.66978

OOF best threshold: 0.307
OOF score: 0.65310

CPU times: user 3h 9min 44s, sys: 2min 36s, total: 3h 12min 21s
Wall time: 50min 45s


### Expt 6 - eval_metric='PRAUC', class imbalance: scale_pos_weight=class_ratio

In [25]:
# Ratio of label-0 rows to label-1 rows in the training target.
target_counts = train[TARGET].value_counts()
class_ratio = target_counts[0] / target_counts[1]

In [26]:
%%time
expt = 6

# Experiment 6: PR-AUC eval_metric; positive class weighted by class_ratio.
params = dict(
    eval_metric='PRAUC:type=Classic',
    scale_pos_weight=class_ratio,
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.61538 ( 1661 rounds, best threshold: 0.4820)
Fold # 1: 0.64822 ( 4724 rounds, best threshold: 0.5620)
Fold # 2: 0.70270 ( 3389 rounds, best threshold: 0.3520)
Fold # 3: 0.68401 ( 2488 rounds, best threshold: 0.5135)
Fold # 4: 0.63023 ( 2066 rounds, best threshold: 0.4370)
Fold # 5: 0.67128 ( 2888 rounds, best threshold: 0.4145)
Fold # 6: 0.65306 ( 2361 rounds, best threshold: 0.5085)

Avg. score: 0.65784 +/- 0.02818
OOF score: 0.65738

OOF best threshold: 0.47400000000000003
OOF score: 0.64253

CPU times: user 2h 42min 7s, sys: 2min 14s, total: 2h 44min 21s
Wall time: 43min 30s


### Expt 7 - best config, 5 folds

In [28]:
%%time
expt = 7

# Experiment 7: PR-AUC + 'SqrtBalanced' class weights, evaluated with 5 folds.
params = dict(
    eval_metric='PRAUC:type=Classic',
    auto_class_weights='SqrtBalanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model, folds=5)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.62050 ( 2764 rounds, best threshold: 0.3490)
Fold # 1: 0.67513 ( 3211 rounds, best threshold: 0.2955)
Fold # 2: 0.66387 ( 2451 rounds, best threshold: 0.2130)
Fold # 3: 0.67153 ( 3528 rounds, best threshold: 0.2565)
Fold # 4: 0.63840 ( 2927 rounds, best threshold: 0.3165)

Avg. score: 0.65389 +/- 0.02106
OOF score: 0.65492

OOF best threshold: 0.299
OOF score: 0.64816

CPU times: user 2h 11s, sys: 1min 40s, total: 2h 1min 51s
Wall time: 32min 17s


### Expt 8 - best config, 10 folds

In [29]:
%%time
expt = 8

# Experiment 8: PR-AUC + 'SqrtBalanced' class weights, evaluated with 10 folds.
params = dict(
    eval_metric='PRAUC:type=Classic',
    auto_class_weights='SqrtBalanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model, folds=10)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.66304 ( 3352 rounds, best threshold: 0.3715)
Fold # 1: 0.61321 ( 3253 rounds, best threshold: 0.2165)
Fold # 2: 0.67403 ( 4447 rounds, best threshold: 0.3695)
Fold # 3: 0.73430 ( 2326 rounds, best threshold: 0.3005)
Fold # 4: 0.62376 ( 1954 rounds, best threshold: 0.3385)
Fold # 5: 0.68783 ( 2972 rounds, best threshold: 0.3510)
Fold # 6: 0.66341 ( 1652 rounds, best threshold: 0.3875)
Fold # 7: 0.68367 ( 2312 rounds, best threshold: 0.3630)
Fold # 8: 0.62245 ( 3224 rounds, best threshold: 0.3190)
Fold # 9: 0.71429 ( 2911 rounds, best threshold: 0.3355)

Avg. score: 0.66800 +/- 0.03781
OOF score: 0.66768

OOF best threshold: 0.3375
OOF score: 0.65611

CPU times: user 3h 57min 30s, sys: 3min 35s, total: 4h 1min 5s
Wall time: 1h 3min 14s


### Expt 9 - best config, 15 folds

In [30]:
%%time
expt = 9

# Experiment 9: PR-AUC + 'SqrtBalanced' class weights, evaluated with 15 folds.
params = dict(
    eval_metric='PRAUC:type=Classic',
    auto_class_weights='SqrtBalanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# custom_cv returns (oof_probs, oof_preds, test_probs, test_preds).
cv_outputs = custom_cv(features, model, folds=15)
fold_preds[expt] = cv_outputs

# Re-threshold using the OOF probabilities and the fold-wise test probabilities.
ot_preds[expt] = oof_thresholding(cv_outputs[0], cv_outputs[2])

Fold # 0: 0.67717 ( 1460 rounds, best threshold: 0.4450)
Fold # 1: 0.66207 ( 2394 rounds, best threshold: 0.2655)
Fold # 2: 0.54167 ( 2088 rounds, best threshold: 0.2715)
Fold # 3: 0.63248 ( 1408 rounds, best threshold: 0.4760)
Fold # 4: 0.71074 ( 2824 rounds, best threshold: 0.3815)
Fold # 5: 0.76316 ( 2543 rounds, best threshold: 0.2360)
Fold # 6: 0.61538 ( 1996 rounds, best threshold: 0.3005)
Fold # 7: 0.74809 ( 1703 rounds, best threshold: 0.3980)
Fold # 8: 0.64062 ( 1248 rounds, best threshold: 0.3895)
Fold # 9: 0.65854 ( 2021 rounds, best threshold: 0.2760)
Fold #10: 0.67143 ( 1734 rounds, best threshold: 0.3695)
Fold #11: 0.70504 ( 2472 rounds, best threshold: 0.2805)
Fold #12: 0.62774 ( 2628 rounds, best threshold: 0.3200)
Fold #13: 0.69504 ( 2512 rounds, best threshold: 0.3570)
Fold #14: 0.68148 ( 2271 rounds, best threshold: 0.3260)

Avg. score: 0.66871 +/- 0.05301
OOF score: 0.66860

OOF best threshold: 0.3605
OOF score: 0.64232

CPU times: user 4h 30min 44s, sys: 4min 5s, t

# Submission files

In [42]:
def create_submission_files(preds, config):
    """Write ``preds`` as the TARGET column of the submission template.

    A copy of the module-level ``sample_sub`` frame gets its ``TARGET`` column
    set to ``preds`` and is saved, without the index, as ``'<config>.csv'`` in
    the current working directory.
    """
    submission = sample_sub.assign(**{TARGET: preds})
    submission.to_csv(f'{config}.csv', index=False)

In [43]:
# Two submission files per experiment (1..9).
for expt in range(1, 10):
    # Fold-wise thresholds: the 'mode' column of custom_cv's test_preds frame.
    fold_vote = fold_preds[expt][-1]['mode']
    create_submission_files(fold_vote, f'expt{expt}_fold')

    # Single OOF threshold: the test labels returned by oof_thresholding.
    ot_test_labels = ot_preds[expt][1]
    create_submission_files(ot_test_labels, f'expt{expt}_ot')