# Setup

In [1]:
import gc
import warnings
import subprocess

# Make sure automatic garbage collection is enabled (custom_cv also calls gc.collect() per fold).
gc.enable()
# Silence every warning for the rest of the notebook run.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import catboost as cb
# import optuna

# Shared seed: default `seed` of custom_cv and `random_seed` in base_params.
SEED = 55

In [2]:
# Fail fast when the installed CatBoost release is not the pinned one.
assert cb.__version__ == '1.2.2', 'CatBoost version differs from original notebook.'

In [3]:
# Select the training device: GPU when `nvidia-smi` can be executed
# successfully, CPU for any failure (missing binary, non-zero exit, ...).
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'CPU'
else:
    DEVICE = 'GPU'

print(f'Available device: {DEVICE}')

Available device: GPU


# Data preparation

In [4]:
DATA_DIR = '/kaggle/input/bitgrit-ai-generated-text-classification/ai-text-competition'

# Read the training set, the test set and the submission template, in that order.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{csv_name}')
    for csv_name in ('training_set.csv', 'test_set.csv', 'solution_format.csv')
)

In [5]:
# Remove the ID column from both frames.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

# Name of the label column in `train`.
TARGET = 'ind'
# Column labels of `test` at this point, before any engineered column is added.
og_features = test.columns

In [6]:
# Two engineered columns relating word count to punctuation count:
# their ratio (rounded to 4 decimals) and their difference.
train = train.assign(
    word_punc_ratio=(train['word_count'] / train['punc_num']).round(4),
    word_punc_diff=train['word_count'] - train['punc_num'],
)
test = test.assign(
    word_punc_ratio=(test['word_count'] / test['punc_num']).round(4),
    word_punc_diff=test['word_count'] - test['punc_num'],
)

In [7]:
# Names of the 768 numeric columns feature_0 ... feature_767.
embeddings = [f'feature_{i}' for i in range(768)]

# Pack those columns into one array-valued column per row, so it can be
# passed to CatBoost through `embedding_features`.
train['embedding_vector'] = [row for row in train[embeddings].to_numpy()]
test['embedding_vector'] = [row for row in test[embeddings].to_numpy()]

In [8]:
# Every column of `test` (engineered columns included) is the default feature set.
features = test.columns
# Display the last six column labels.
features[-6:]

Index(['feature_767', 'word_count', 'punc_num', 'word_punc_ratio',
       'word_punc_diff', 'embedding_vector'],
      dtype='object')

# Modeling framework

In [9]:
def probs_to_labels(probs, threshold=0.5):
    """Binarise probabilities: 1 where `probs >= threshold`, otherwise 0.

    `probs` must support elementwise comparison and `.astype` (NumPy array or
    pandas Series); the result has the same container type with integer dtype.
    """
    is_positive = probs >= threshold
    return is_positive.astype('int')

In [10]:
def get_best_threshold(y_true, y_probs, step=0.0005):
    """Grid-search the decision threshold that maximises F1.

    Args:
        y_true: true binary labels.
        y_probs: predicted positive-class probabilities, aligned with `y_true`.
        step: spacing of the candidate grid; candidates run from 0 (inclusive)
            towards 1 (exclusive). The default reproduces the original grid.

    Returns:
        The candidate threshold with the highest F1 score. When several
        candidates tie, the lowest one is returned (np.argmax keeps the first
        maximum).
    """
    candidates = np.arange(0, 1, step)
    # One F1 evaluation per candidate threshold.
    scores = [f1_score(y_true, probs_to_labels(y_probs, t)) for t in candidates]
    return candidates[np.argmax(scores)]

In [11]:
def custom_cv(features, model, folds=7, seed=SEED):
    """Cross-validate `model` on the global `train` frame and predict `test`.

    For every stratified fold the model is re-fitted with the validation fold
    as `eval_set`, the F1-optimal threshold is searched on that same
    validation fold, and the fitted fold model scores the whole test set.

    Note: the validation fold is used as eval_set, for threshold tuning and
    for scoring, so the printed fold / OOF F1 values are optimistic.

    Args:
        features: column labels (list or pandas Index) selected from
            `train` and `test`.
        model: CatBoost classifier; the same object is re-fitted on each fold.
        folds: number of StratifiedKFold splits.
        seed: `random_state` of the shuffled StratifiedKFold.

    Returns:
        Tuple `(oof_probs, oof_preds, test_probs, test_preds)`:
        - oof_probs: Series of out-of-fold positive-class probabilities,
          indexed by row position in `train`.
        - oof_preds: Series of out-of-fold labels, each produced with the
          threshold of its own fold.
        - test_probs: DataFrame with one `fold{k}` column per fold plus their
          row-wise `mean`.
        - test_preds: DataFrame with one `fold{k}` label column per fold plus
          the row-wise `mode` (first mode returned by pandas when folds tie;
          presumably the smaller label - verify for even fold counts).
    """
    X, y = train[features], train[TARGET]
    X_test = test[features]

    # Declare the embedding column to CatBoost only when it is part of the
    # requested feature set, so embedding-free feature sets can be used too.
    embedding_cols = ['embedding_vector'] if 'embedding_vector' in features else None

    # Out-of-fold results are written by row position; every row belongs to
    # exactly one validation fold, so both arrays end up fully populated.
    n_rows = len(X)
    oof_probs = np.full(n_rows, np.nan)
    oof_preds = np.zeros(n_rows, dtype='int')
    test_probs = {}
    test_preds = {}
    scores = []

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            embedding_features=embedding_cols,
            verbose=False)

        val_probs = model.predict_proba(X_val)[:, 1]
        oof_probs[val_ids] = val_probs

        # Threshold tuned on this fold's validation data, then reused for test.
        best_threshold = get_best_threshold(y_val, val_probs)
        val_preds = probs_to_labels(val_probs, best_threshold)
        oof_preds[val_ids] = val_preds

        fold_key = f'fold{fold}'
        test_probs[fold_key] = model.predict_proba(X_test)[:, 1]
        test_preds[fold_key] = probs_to_labels(test_probs[fold_key], best_threshold)

        score = f1_score(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds, best threshold: {best_threshold:.4f})')
        gc.collect()

    test_probs = pd.DataFrame.from_dict(test_probs)
    test_probs['mean'] = test_probs.mean(axis=1) # mean of fold-wise probabilities

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int') # mode of fold-wise predictions

    oof_probs = pd.Series(oof_probs)
    oof_preds = pd.Series(oof_preds)
    print(f'\nAvg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {f1_score(y, oof_preds):.5f}\n')

    return oof_probs, oof_preds, test_probs, test_preds

In [12]:
def oof_thresholding(oof_probs, test_probs):
    """Threshold predictions with one cut-off tuned on all OOF probabilities.

    Unlike custom_cv, which tunes a threshold per fold, a single threshold is
    searched against the full training target and then applied to both the
    OOF probabilities and the fold-averaged test probabilities
    (`test_probs['mean']`).

    Returns:
        Tuple `(oof_preds, test_preds)` of binary labels.
    """
    y_true = train[TARGET]

    threshold = get_best_threshold(y_true, oof_probs)
    print(f'OOF best threshold: {threshold}')

    oof_preds = probs_to_labels(oof_probs, threshold)
    print(f'OOF score: {f1_score(y_true, oof_preds):.5f}\n')

    return oof_preds, probs_to_labels(test_probs['mean'], threshold)

# Experiments

In [13]:
# CatBoost settings shared by every experiment; each experiment adds its own
# `params` on top of these.
base_params = dict(
    random_seed=SEED,
    loss_function='Logloss',
    iterations=10000,
    learning_rate=0.01,
    early_stopping_rounds=200,
    use_best_model=True,
    task_type=DEVICE,
    thread_count=-1,
)

In [14]:
# Result stores keyed by experiment id:
#   fold_preds -> output tuple of custom_cv (fold-wise thresholds)
#   ot_preds   -> output tuple of oof_thresholding (single OOF threshold)
fold_preds, ot_preds = {}, {}

### Expt 1

In [15]:
%%time
# Experiment 1: shared settings with PR-AUC as the CatBoost eval metric.
expt = '1'
params = dict(eval_metric='PRAUC:type=Classic')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 0: 0.65600 (3202 rounds, best threshold: 0.5185)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 1: 0.64057 (1935 rounds, best threshold: 0.2190)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 2: 0.71587 (2785 rounds, best threshold: 0.3250)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 3: 0.72727 (2532 rounds, best threshold: 0.1880)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 4: 0.65714 (2259 rounds, best threshold: 0.2205)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 5: 0.65134 (1119 rounds, best threshold: 0.2950)


Default metric period is 5 because PRAUC is/are not implemented for GPU
Metric PRAUC:type=Classic is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 6: 0.70470 (5281 rounds, best threshold: 0.1315)

Avg. score: 0.67898 +/- 0.03295
OOF score: 0.68035

OOF best threshold: 0.221
OOF score: 0.66467

CPU times: user 48min 52s, sys: 1min 43s, total: 50min 36s
Wall time: 27min 41s


### Expt 2 - eval_metric

In [16]:
%%time
# Experiment 2_1: F1 as the CatBoost eval metric.
expt = '2_1'
params = dict(eval_metric='F1')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.63281 ( 676 rounds, best threshold: 0.4980)
Fold # 1: 0.64234 ( 346 rounds, best threshold: 0.2210)
Fold # 2: 0.69531 ( 662 rounds, best threshold: 0.4155)
Fold # 3: 0.70552 ( 208 rounds, best threshold: 0.1430)
Fold # 4: 0.63004 (   1 rounds, best threshold: 0.4935)
Fold # 5: 0.64615 ( 420 rounds, best threshold: 0.2670)
Fold # 6: 0.66463 ( 418 rounds, best threshold: 0.0925)

Avg. score: 0.65954 +/- 0.02798
OOF score: 0.66092

OOF best threshold: 0.4935
OOF score: 0.60468

CPU times: user 20min 17s, sys: 25.6 s, total: 20min 42s
Wall time: 13min 53s


In [17]:
%%time
# Experiment 2_2: Logloss as the CatBoost eval metric.
expt = '2_2'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.65600 (3048 rounds, best threshold: 0.5260)
Fold # 1: 0.64286 (1710 rounds, best threshold: 0.2385)
Fold # 2: 0.71380 (2427 rounds, best threshold: 0.2090)
Fold # 3: 0.73139 (2698 rounds, best threshold: 0.1850)
Fold # 4: 0.65714 (2269 rounds, best threshold: 0.2200)
Fold # 5: 0.65152 (1963 rounds, best threshold: 0.2840)
Fold # 6: 0.70130 (2783 rounds, best threshold: 0.1265)

Avg. score: 0.67914 +/- 0.03278
OOF score: 0.68109

OOF best threshold: 0.2265
OOF score: 0.66600

CPU times: user 45min, sys: 1min 37s, total: 46min 37s
Wall time: 25min 54s


### Expt 3 - class balancing

In [18]:
%%time
# Experiment 3_1: Logloss eval metric with auto_class_weights='Balanced'.
expt = '3_1'
params = dict(
    eval_metric='Logloss',
    auto_class_weights='Balanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.64174 ( 787 rounds, best threshold: 0.6165)
Fold # 1: 0.61433 ( 512 rounds, best threshold: 0.6735)
Fold # 2: 0.68864 ( 696 rounds, best threshold: 0.7990)
Fold # 3: 0.71622 (1046 rounds, best threshold: 0.6355)
Fold # 4: 0.65068 ( 693 rounds, best threshold: 0.6725)
Fold # 5: 0.64567 ( 864 rounds, best threshold: 0.7840)
Fold # 6: 0.67361 ( 534 rounds, best threshold: 0.6265)

Avg. score: 0.66156 +/- 0.03130
OOF score: 0.66138

OOF best threshold: 0.6775
OOF score: 0.65141

CPU times: user 24min 24s, sys: 36.1 s, total: 25min
Wall time: 15min 52s


In [19]:
%%time
# Experiment 3_2: Logloss eval metric with auto_class_weights='SqrtBalanced'.
expt = '3_2'
params = dict(
    eval_metric='Logloss',
    auto_class_weights='SqrtBalanced',
)
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.64495 (1504 rounds, best threshold: 0.3900)
Fold # 1: 0.63345 ( 765 rounds, best threshold: 0.4310)
Fold # 2: 0.70139 (1439 rounds, best threshold: 0.4355)
Fold # 3: 0.71034 (1920 rounds, best threshold: 0.4335)
Fold # 4: 0.65993 (1212 rounds, best threshold: 0.3665)
Fold # 5: 0.65672 (1289 rounds, best threshold: 0.5130)
Fold # 6: 0.67636 (1192 rounds, best threshold: 0.4020)

Avg. score: 0.66902 +/- 0.02642
OOF score: 0.66899

OOF best threshold: 0.447
OOF score: 0.65684

CPU times: user 31min 42s, sys: 58 s, total: 32min 40s
Wall time: 19min 24s


In [20]:
%%time
# Experiment 3_3: Logloss eval metric with an explicit positive-class weight.
expt = '3_3'

# Count of label 0 divided by count of label 1 over the full training target.
class_ratio = train[TARGET].value_counts().pipe(lambda counts: counts[0] / counts[1])
params = dict(
    eval_metric='Logloss',
    scale_pos_weight=class_ratio,
)
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.64539 ( 865 rounds, best threshold: 0.7685)
Fold # 1: 0.61484 ( 490 rounds, best threshold: 0.7125)
Fold # 2: 0.70629 ( 806 rounds, best threshold: 0.7375)
Fold # 3: 0.71186 ( 900 rounds, best threshold: 0.6525)
Fold # 4: 0.65052 ( 692 rounds, best threshold: 0.6770)
Fold # 5: 0.64286 ( 747 rounds, best threshold: 0.7105)
Fold # 6: 0.66667 ( 579 rounds, best threshold: 0.6270)

Avg. score: 0.66263 +/- 0.03266
OOF score: 0.66300

OOF best threshold: 0.6685
OOF score: 0.65112

CPU times: user 24min 15s, sys: 35.6 s, total: 24min 50s
Wall time: 15min 44s


### Expt 4 - folds

In [21]:
%%time
# Experiment 4_1: Logloss eval metric, 5 CV folds.
expt = '4_1'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model, folds=5)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.62234 (1868 rounds, best threshold: 0.1465)
Fold # 1: 0.69588 (3590 rounds, best threshold: 0.4365)
Fold # 2: 0.71000 (2550 rounds, best threshold: 0.2415)
Fold # 3: 0.66109 (2501 rounds, best threshold: 0.1130)
Fold # 4: 0.64230 (2687 rounds, best threshold: 0.2605)

Avg. score: 0.66632 +/- 0.03262
OOF score: 0.66667

OOF best threshold: 0.2705
OOF score: 0.65779

CPU times: user 33min 49s, sys: 1min 14s, total: 35min 4s
Wall time: 19min 12s


In [22]:
%%time
# Experiment 4_2: Logloss eval metric, 10 CV folds.
expt = '4_2'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model, folds=10)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.65556 (2778 rounds, best threshold: 0.6240)
Fold # 1: 0.62439 (2241 rounds, best threshold: 0.1435)
Fold # 2: 0.67391 (2130 rounds, best threshold: 0.2230)
Fold # 3: 0.73096 (2480 rounds, best threshold: 0.2810)
Fold # 4: 0.67327 (2284 rounds, best threshold: 0.2265)
Fold # 5: 0.72727 (2184 rounds, best threshold: 0.3315)
Fold # 6: 0.66667 (2014 rounds, best threshold: 0.2600)
Fold # 7: 0.68085 (1894 rounds, best threshold: 0.2540)
Fold # 8: 0.61376 (2662 rounds, best threshold: 0.3355)
Fold # 9: 0.71296 (3530 rounds, best threshold: 0.2580)

Avg. score: 0.67596 +/- 0.03754
OOF score: 0.67677

OOF best threshold: 0.258
OOF score: 0.66567

CPU times: user 1h 3min 52s, sys: 2min 16s, total: 1h 6min 8s
Wall time: 36min 42s


In [23]:
%%time
# Experiment 4_3: Logloss eval metric, 15 CV folds.
expt = '4_3'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(features=features, model=model, folds=15)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.68293 (2520 rounds, best threshold: 0.3835)
Fold # 1: 0.68852 (2585 rounds, best threshold: 0.3730)
Fold # 2: 0.54098 ( 208 rounds, best threshold: 0.3055)
Fold # 3: 0.65517 ( 608 rounds, best threshold: 0.4545)
Fold # 4: 0.71429 (2977 rounds, best threshold: 0.2610)
Fold # 5: 0.78322 (5030 rounds, best threshold: 0.2290)
Fold # 6: 0.68613 (2305 rounds, best threshold: 0.2365)
Fold # 7: 0.74809 (2886 rounds, best threshold: 0.2185)
Fold # 8: 0.67797 (1997 rounds, best threshold: 0.4720)
Fold # 9: 0.68966 (2386 rounds, best threshold: 0.1215)
Fold #10: 0.64430 (2737 rounds, best threshold: 0.1335)
Fold #11: 0.71533 (2061 rounds, best threshold: 0.2330)
Fold #12: 0.60317 (2171 rounds, best threshold: 0.2945)
Fold #13: 0.72727 (2504 rounds, best threshold: 0.3860)
Fold #14: 0.69291 (3149 rounds, best threshold: 0.3065)

Avg. score: 0.68333 +/- 0.05615
OOF score: 0.68475

OOF best threshold: 0.254
OOF score: 0.66500

CPU times: user 1h 35min 55s, sys: 3min 25s, total: 1h 39min 

### Expt 5 - feature set

In [24]:
%%time
# Experiment 5_1: 15 folds, the embedding vector column as the only feature.
expt = '5_1'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(
    features=['embedding_vector'],
    model=model,
    folds=15,
)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.65625 ( 423 rounds, best threshold: 0.2210)
Fold # 1: 0.68182 (1498 rounds, best threshold: 0.2310)
Fold # 2: 0.55652 ( 200 rounds, best threshold: 0.3405)
Fold # 3: 0.62712 ( 302 rounds, best threshold: 0.3615)
Fold # 4: 0.67200 (1451 rounds, best threshold: 0.2825)
Fold # 5: 0.74453 ( 781 rounds, best threshold: 0.3010)
Fold # 6: 0.62411 ( 308 rounds, best threshold: 0.1835)
Fold # 7: 0.72857 (1084 rounds, best threshold: 0.1830)
Fold # 8: 0.66667 ( 364 rounds, best threshold: 0.5120)
Fold # 9: 0.67606 ( 457 rounds, best threshold: 0.1490)
Fold #10: 0.59302 ( 323 rounds, best threshold: 0.1050)
Fold #11: 0.68750 ( 784 rounds, best threshold: 0.3045)
Fold #12: 0.57325 ( 339 rounds, best threshold: 0.1255)
Fold #13: 0.67550 ( 785 rounds, best threshold: 0.2270)
Fold #14: 0.65693 ( 956 rounds, best threshold: 0.1685)

Avg. score: 0.65466 +/- 0.05063
OOF score: 0.65392

OOF best threshold: 0.342
OOF score: 0.63983

CPU times: user 38min 46s, sys: 1min 8s, total: 39min 54s
Wal

In [25]:
%%time
# Experiment 5_2: 15 folds, embedding vector column plus the 768 raw
# feature_* columns (no word/punctuation features).
expt = '5_2'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(
    features=['embedding_vector'] + embeddings,
    model=model,
    folds=15,
)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.68800 (2830 rounds, best threshold: 0.3520)
Fold # 1: 0.67769 (1915 rounds, best threshold: 0.3465)
Fold # 2: 0.53333 ( 324 rounds, best threshold: 0.2960)
Fold # 3: 0.66102 ( 990 rounds, best threshold: 0.4465)
Fold # 4: 0.70677 (3169 rounds, best threshold: 0.1900)
Fold # 5: 0.79389 (4125 rounds, best threshold: 0.3470)
Fold # 6: 0.66667 (1975 rounds, best threshold: 0.1420)
Fold # 7: 0.73600 (2525 rounds, best threshold: 0.4580)
Fold # 8: 0.67606 (3221 rounds, best threshold: 0.2070)
Fold # 9: 0.69065 (2324 rounds, best threshold: 0.1495)
Fold #10: 0.64000 (2861 rounds, best threshold: 0.1295)
Fold #11: 0.72180 (1453 rounds, best threshold: 0.2615)
Fold #12: 0.60256 (1982 rounds, best threshold: 0.1280)
Fold #13: 0.71318 (3358 rounds, best threshold: 0.3920)
Fold #14: 0.68254 (1896 rounds, best threshold: 0.3570)

Avg. score: 0.67934 +/- 0.05746
OOF score: 0.67864

OOF best threshold: 0.262
OOF score: 0.66099

CPU times: user 1h 30min 11s, sys: 3min 22s, total: 1h 33min 

In [26]:
%%time
# Experiment 5_3: 15 folds, embedding vector column plus word_count,
# punc_num and word_punc_ratio.
expt = '5_3'
params = dict(eval_metric='Logloss')
model = cb.CatBoostClassifier(**base_params, **params)

# Cross-validation with a threshold tuned per fold.
fold_preds[expt] = custom_cv(
    features=['embedding_vector', 'word_count', 'punc_num', 'word_punc_ratio'],
    model=model,
    folds=15,
)

# Re-threshold using one cut-off tuned on all OOF probabilities.
ot_preds[expt] = oof_thresholding(
    oof_probs=fold_preds[expt][0],
    test_probs=fold_preds[expt][2],
)

Fold # 0: 0.67647 (1306 rounds, best threshold: 0.2170)
Fold # 1: 0.70423 (1198 rounds, best threshold: 0.1970)
Fold # 2: 0.53846 ( 221 rounds, best threshold: 0.2470)
Fold # 3: 0.61538 ( 732 rounds, best threshold: 0.4225)
Fold # 4: 0.68493 (1811 rounds, best threshold: 0.1530)
Fold # 5: 0.72593 ( 810 rounds, best threshold: 0.3795)
Fold # 6: 0.62745 ( 334 rounds, best threshold: 0.1390)
Fold # 7: 0.75758 (1739 rounds, best threshold: 0.2705)
Fold # 8: 0.66667 ( 651 rounds, best threshold: 0.2775)
Fold # 9: 0.68254 ( 537 rounds, best threshold: 0.3035)
Fold #10: 0.58929 ( 277 rounds, best threshold: 0.4450)
Fold #11: 0.69173 ( 466 rounds, best threshold: 0.2430)
Fold #12: 0.58333 (2618 rounds, best threshold: 0.4260)
Fold #13: 0.68148 ( 636 rounds, best threshold: 0.3430)
Fold #14: 0.66165 (2508 rounds, best threshold: 0.1885)

Avg. score: 0.65914 +/- 0.05623
OOF score: 0.66096

OOF best threshold: 0.2775
OOF score: 0.64077

CPU times: user 45min 52s, sys: 1min 38s, total: 47min 31s
W

# Submission files

In [27]:
def create_submission_files(preds, config):
    """Write `preds` into the TARGET column of the submission template.

    The global `sample_sub` frame is left untouched; the filled copy is saved
    as `<config>.csv` in the working directory, without the index.
    """
    submission = sample_sub.assign(**{TARGET: preds})
    submission.to_csv(f'{config}.csv', index=False)

In [28]:
# Ids of every experiment stored in fold_preds / ot_preds.
configs = [
    '1',
    '2_1', '2_2',
    '3_1', '3_2', '3_3',
    '4_1', '4_2', '4_3',
    '5_1', '5_2', '5_3',
]

# Two files per experiment: fold-wise majority vote and single OOF threshold.
for expt in configs:
    create_submission_files(
        preds=fold_preds[expt][-1]['mode'],
        config=f'expt{expt}_fold',
    )
    create_submission_files(
        preds=ot_preds[expt][1],
        config=f'expt{expt}_ot',
    )

# ----------------------------------------