# Imports

In [1]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import numpy as np

from sklearn.metrics import (
    precision_recall_curve, average_precision_score,
    roc_auc_score, f1_score, balanced_accuracy_score,
    matthews_corrcoef, confusion_matrix
)

In [4]:
# Load the pre-computed feature matrices and label vectors for each split.
# Every split is stored as a pair of .npy files (X_* features, y_* labels).
X_train, y_train = (
    np.load("../concated_embs_npy/X_train_aug_1v10.npy"),
    np.load("../concated_embs_npy/y_train_aug_1v10.npy"),
)

X_test, y_test = (
    np.load("../concated_embs_npy/X_test.npy"),
    np.load("../concated_embs_npy/y_test.npy"),
)

X_val, y_val = (
    np.load("../concated_embs_npy/X_val.npy"),
    np.load("../concated_embs_npy/y_val.npy"),
)

# Model training and evaluation

In [5]:
# CatBoost binary classifier: Logloss is the training objective, F1 is the
# metric reported on the eval set every 100 iterations (verbose=100).
model_cb = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=42,
    verbose=100,
)

# NOTE(review): the eval set passed to fit() is the *test* split, while all
# metrics below are reported on the *validation* split -- confirm that this
# assignment of roles is intended.
model_cb.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
)

# Column 1 of predict_proba is taken as the positive-class score; hard labels
# use a fixed 0.5 cut-off.
y_prob = model_cb.predict_proba(X_val)[:, 1]
y_pred = (y_prob > 0.5).astype(int)

# Ranking metrics consume the scores, classification metrics the hard labels.
# Each metric is computed right before it is printed, one line per metric.
for _label, _metric, _estimate, _extra in (
    ("PR-AUC:", average_precision_score, y_prob, {}),
    ("ROC-AUC:", roc_auc_score, y_prob, {}),
    ("F1-macro:", f1_score, y_pred, {"average": 'macro'}),
    ("F1-pos:", f1_score, y_pred, {"pos_label": 1}),
    ("BalancedAcc:", balanced_accuracy_score, y_pred, {}),
    ("MCC:", matthews_corrcoef, y_pred, {}),
    ("Confusion:\n", confusion_matrix, y_pred, {}),
):
    print(_label, _metric(y_val, _estimate, **_extra))


0:	learn: 0.7343388	test: 0.6670850	best: 0.6670850 (0)	total: 3.48s	remaining: 28m 55s
100:	learn: 0.8863803	test: 0.8797046	best: 0.9094284 (3)	total: 1m 30s	remaining: 5m 58s
200:	learn: 0.9020167	test: 0.9807565	best: 0.9816461 (176)	total: 2m 50s	remaining: 4m 14s
300:	learn: 0.9094809	test: 0.9885717	best: 0.9886335 (296)	total: 4m 8s	remaining: 2m 44s
400:	learn: 0.9147739	test: 0.9895833	best: 0.9900022 (374)	total: 5m 27s	remaining: 1m 20s
499:	learn: 0.9186740	test: 0.9899899	best: 0.9900145 (498)	total: 6m 45s	remaining: 0us

bestTest = 0.9900145088
bestIteration = 498

Shrink model to first 499 iterations.
PR-AUC: 0.9999349002369937
ROC-AUC: 0.7433973641140494
F1-macro: 0.39478114787482743
F1-pos: 0.7883310719131614
BalancedAcc: 0.8253079507278835
MCC: 0.020019331261472362
Confusion:
 [[    5     0]
 [ 8112 15106]]


In [8]:
from xgboost import XGBClassifier

# XGBoost binary classifier fitted on X_train / y_train and scored on the
# validation split with the same metric set as the CatBoost cell.
model_xgb = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1
)

# ravel() hands the labels to fit() as a flat 1-D vector.
model_xgb.fit(X_train, y_train.ravel())

# Despite its name, y_pred_xgb holds positive-class *probabilities*
# (column 1 of predict_proba); y_pred holds the hard labels at a 0.5 cut-off.
# Note that y_pred is rebound here, replacing any value from an earlier cell.
y_pred_xgb = model_xgb.predict_proba(X_val)[:, 1].ravel()
y_pred = (y_pred_xgb > 0.5).astype(int)

# Ranking metrics must be computed from this model's own scores. The previous
# version passed `y_prob`, a variable this cell never assigns (it is left over
# from the CatBoost cell), so PR-AUC / ROC-AUC did not describe the XGBoost
# model and the cell failed on a fresh kernel if that cell had not been run.
print("PR-AUC:", average_precision_score(y_val, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_val, y_pred_xgb))
print("F1-macro:", f1_score(y_val, y_pred, average='macro'))
print("F1-pos:", f1_score(y_val, y_pred, pos_label=1))
print("BalancedAcc:", balanced_accuracy_score(y_val, y_pred))
print("MCC:", matthews_corrcoef(y_val, y_pred))
print("Confusion:\n", confusion_matrix(y_val, y_pred))


PR-AUC: 0.9999477921279158
ROC-AUC: 0.7892497200447928
F1-macro: 0.4812705220130001
F1-pos: 0.9625410440260002
BalancedAcc: 0.46399345335515546
MCC: -0.0040874929992031645
Confusion:
 [[    0     5]
 [ 1672 21546]]
