In [None]:
import os
import sys

# Put the parent of the notebook's working directory on the import path so the
# project-local `data` and `utils` packages imported in the next cell resolve.
ROOT_DIR = os.path.abspath(os.pardir)
sys.path.append(ROOT_DIR)

# Set before `wandb` is imported in the next cell; intended to quiet wandb's
# console output.
os.environ["WANDB_SILENT"] = "true"

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from wandb.xgboost import WandbCallback
import sklearn.metrics as metrics
import xgboost as xgb
import wandb

from data.dataloader import NoReCDataLoader
from data.preprocessor import NoReCDataPreprocessor
from utils.utils import init_run

In [None]:
# Load the "binary_xgboost" configuration. The returned object is read below
# via `config.dataloader`, `config.vectorizer` and `config.model`.
# NOTE(review): presumably this also starts the wandb run that later cells
# access through `wandb.run` -- confirm in utils.utils.init_run.
config = init_run(
    config_name="binary_xgboost",
    run_name="Binary-XGBoost",
)

# Loading and processing data

In [None]:
# Load the three binary-label splits using the dataloader settings from config.
loader = NoReCDataLoader(**config.dataloader)
train_df, val_df, test_df = loader.load_binary_dataset()

preprocessor = NoReCDataPreprocessor()

# Sanitize every split in train -> val -> test order, passing the split name
# through to the preprocessor.
train_df, val_df, test_df = [
    preprocessor.sanitize(frame, split_name)
    for frame, split_name in (
        (train_df, "train"),
        (val_df, "val"),
        (test_df, "test"),
    )
]

# Cast the label column of each split to int.
for frame in (train_df, val_df, test_df):
    frame["label"] = frame["label"].astype(int)

In [None]:
# Fit the TF-IDF vocabulary on the training text only; validation and test
# text are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(**config.vectorizer)
feature_vectorizer = vectorizer.fit(train_df["text"])
vocabulary = feature_vectorizer.get_feature_names_out()
print("Number of features: ", len(vocabulary))

# Feature matrices, one per split (train, val, test).
x_train, x_val, x_test = [
    feature_vectorizer.transform(frame["text"])
    for frame in (train_df, val_df, test_df)
]

# Matching label columns.
y_train, y_val, y_test = [frame["label"] for frame in (train_df, val_df, test_df)]

print("Train shape: ", x_train.shape)
print("Val shape: ", x_val.shape)
print("Test shape: ", x_test.shape)

# Modeling and Training

In [None]:
# Callback that streams training metrics to wandb; log_model=False is passed
# so the callback is not asked to log the model itself.
wandb_callback = WandbCallback(log_model=False)

# Hyperparameters come from config.model; the objective, evaluation metrics and
# callback are fixed here. A key duplicated between config.model and the
# explicit keywords still raises TypeError, as before.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric=["logloss", "auc", "error"],
    callbacks=[wandb_callback],
    **config.model,
)

# Evaluate on both the training and the validation split during fitting.
eval_sets = [(x_train, y_train), (x_val, y_val)]
model.fit(x_train, y_train, eval_set=eval_sets)

# Testing

In [None]:
# Hard class predictions: used for accuracy and F1 here, and for the confusion
# matrix logged in the next cell.
y_preds = model.predict(x_test)

# Positive-class probabilities for ROC AUC. AUC is a ranking metric and needs
# continuous scores; feeding it thresholded 0/1 predictions collapses the ROC
# curve to a single operating point and misreports the model's AUC.
# Assumes labels are encoded 0/1 so that column 1 is the positive class.
y_scores = model.predict_proba(x_test)[:, 1]

# Derive the ground truth from test_df rather than rebinding the Series in
# place, so this cell can be re-run without `.to_numpy()` failing on an array.
y_test = test_df["label"].to_numpy()

auc = metrics.roc_auc_score(y_test, y_scores)
accuracy = metrics.accuracy_score(y_test, y_preds)
f1 = metrics.f1_score(y_test, y_preds)

print('AUC: %.4f' % auc)
print('Accuracy: %.4f' % accuracy)
print('F1: %.4f' % f1)

In [None]:
# Record the held-out test metrics in the wandb run summary.
test_metrics = {
    "test_auc": auc,
    "test_accuracy": accuracy,
    "test_f1": f1,
}
for summary_key, summary_value in test_metrics.items():
    wandb.run.summary[summary_key] = summary_value

# Confusion matrix of predicted vs. true test labels.
confusion_plot = wandb.plot.confusion_matrix(
    preds=y_preds,
    y_true=y_test,
    class_names=["negative", "positive"],
)
wandb.log({"confusion_matrix": confusion_plot})

# Close the run.
wandb.finish()