## XGBoost Model With Oversampling

#### Import Libraries

In [1]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import scipy
from scipy.sparse import csr_matrix
import numpy as np
import nltk
nltk.download('punkt')
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import sys

[nltk_data] Downloading package punkt to /Users/soconr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Write outputs to log file

In [2]:
class Tee:
    """Write-only stream that duplicates everything written to several streams.

    Intended as a drop-in replacement for a text stream such as ``sys.stdout``:
    ``write`` and ``flush`` are fanned out to every wrapped stream in the order
    given. Any other attribute (``isatty``, ``encoding``, ``fileno`` ...) is
    looked up on the first wrapped stream, so code that probes the stream it
    prints to keeps working instead of raising ``AttributeError``.

    Parameters
    ----------
    *streams
        Objects offering ``write(data)`` and ``flush()``. The first one is
        treated as the primary stream for attribute delegation.
    """

    def __init__(self, *streams):
        self.streams = streams

    def write(self, data):
        """Write ``data`` to every wrapped stream and return ``len(data)``."""
        for stream in self.streams:
            stream.write(data)
        # Text streams conventionally report how many characters were written.
        return len(data)

    def flush(self):
        """Flush every wrapped stream."""
        for stream in self.streams:
            stream.flush()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails. Read ``streams``
        # through __dict__ so a partially initialised instance cannot recurse.
        streams = self.__dict__.get("streams", ())
        if streams:
            return getattr(streams[0], name)
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}"
        )

# Re-running this cell would otherwise leak the handle opened by the previous run.
_stale_log = globals().get("log_file")
if _stale_log is not None and not getattr(_stale_log, "closed", True):
    _stale_log.close()

# buffering=1 makes the text file line-buffered, so each printed line reaches
# disk immediately instead of waiting for an explicit flush or close.
log_file = open("xgboost_full_oversampling_results.txt", "w", buffering=1) #rename as needed

# sys.__stdout__ is the interpreter's original stdout, so re-running the cell
# never wraps a Tee inside another Tee.
# NOTE(review): under a Jupyter kernel this is presumably the kernel process's
# stdout rather than the cell output area, so prints land in the log file but
# not inline in the notebook -- confirm that is intended.
sys.stdout = Tee(sys.__stdout__, log_file)

[{"name": "GroupKFold", "type": "ABCMeta", "fullType": "abc.ABCMeta"}, {"name": "LabelEncoder", "type": "type", "fullType": "type"}, {"name": "RandomOverSampler", "type": "ABCMeta", "fullType": "abc.ABCMeta"}, {"name": "StandardScaler", "type": "type", "fullType": "type"}, {"name": "Tee", "type": "type", "fullType": "type"}, {"name": "XGBClassifier", "type": "type", "fullType": "type"}, {"name": "accuracy_score", "type": "function", "fullType": "function"}, {"name": "confusion_matrix", "type": "function", "fullType": "function"}, {"name": "csr_matrix", "type": "type", "fullType": "type"}, {"name": "f1_score", "type": "function", "fullType": "function"}, {"name": "log_file", "type": "TextIOWrapper", "fullType": "_io.TextIOWrapper"}, {"name": "nltk", "type": "module", "fullType": "module"}, {"name": "np", "type": "module", "fullType": "module"}, {"name": "precision_score", "type": "function", "fullType": "function"}, {"name": "recall_score", "type": "function", "fullType": "function"}, {

In [8]:
def _print_per_class(class_labels, per_class_values):
    """Print one ``Class <label>: <value>`` line per class, 4 decimal places."""
    for label, value in zip(class_labels, per_class_values):
        print(f"Class {label}: {value:.4f}")


def classify(classifier, params, X, y, groups, n_splits=10,
             sampling_strategy="auto", random_state=42):
    """Cross-validate a classifier with GroupKFold and random oversampling.

    For each fold the training rows are rebalanced with ``RandomOverSampler``
    (the held-out rows are left as they are), a fresh ``classifier(**params)``
    is fitted on the resampled data, and per-class precision, recall and F1
    plus overall accuracy are computed on the held-out rows. The confusion
    matrix of every fold and the fold-averaged metrics are printed.

    Parameters
    ----------
    classifier : callable
        Called as ``classifier(**params)`` once per fold; the result must
        provide ``fit(X, y)`` and ``predict(X)``.
    params : dict
        Keyword arguments forwarded to ``classifier``.
    X : array-like or sparse matrix
        Feature rows; must support selection with an integer index array.
    y : numpy array
        Labels; must support selection with an integer index array.
    groups : array-like
        Group identifier per row, handed to ``GroupKFold.split`` so that rows
        sharing a group never end up on both sides of a split.
    n_splits : int, default 10
        Number of GroupKFold folds.
    sampling_strategy : default "auto"
        Forwarded to ``RandomOverSampler``; the default reproduces the
        previous hard-coded behaviour (library default).
    random_state : int, default 42
        Seed for ``RandomOverSampler``.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    class_labels = np.unique(y)
    gkf = GroupKFold(n_splits=n_splits)

    # Passing the full label set keeps every per-class array the same length,
    # even when a fold is missing a class; undefined scores are reported as 0.
    metric_kwargs = dict(labels=class_labels, average=None, zero_division=0)

    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    for fold, (train_index, test_index) in enumerate(gkf.split(X, y, groups=groups)):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        # Oversample the training rows only, so the evaluation rows keep
        # their original class distribution.
        ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train_fold, y_train_fold)

        model = classifier(**params)
        model.fit(X_train_resampled, y_train_resampled)

        y_pred_fold = model.predict(X_test_fold)

        # Per-class metrics for this fold (one value per entry of class_labels).
        precisions.append(precision_score(y_test_fold, y_pred_fold, **metric_kwargs))
        recalls.append(recall_score(y_test_fold, y_pred_fold, **metric_kwargs))
        f1_scores.append(f1_score(y_test_fold, y_pred_fold, **metric_kwargs))
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))

        cm = confusion_matrix(y_test_fold, y_pred_fold, labels=class_labels)
        print(f"Confusion matrix fold {fold}\n", cm)

    # axis=0 averages across folds and keeps one value per class.
    avg_class_precision = np.mean(precisions, axis=0)
    avg_class_recall = np.mean(recalls, axis=0)
    avg_class_f1_score = np.mean(f1_scores, axis=0)

    # Without an axis the mean runs over every fold and every class at once,
    # i.e. an unweighted average of all per-class scores.
    avg_accuracy = np.mean(accuracies)
    avg_recall = np.mean(recalls)
    avg_f1_score = np.mean(f1_scores)
    avg_precision = np.mean(precisions)

    print("--------------------------------------------------------\n")
    print("Average Accuracy: ", avg_accuracy, "\n")
    print("Average Precision: ", avg_precision)
    print("Averaged Precision per Class:")
    _print_per_class(class_labels, avg_class_precision)

    print("\nAverage Recall: ", avg_recall)
    print("Averaged Recall per Class:")
    _print_per_class(class_labels, avg_class_recall)

    print("\nAverage F1-Score: ", avg_f1_score)
    print("Averaged F1-Score per Class:")
    _print_per_class(class_labels, avg_class_f1_score)

In [3]:
# Load the pre-computed sparse feature matrix and the label array from disk.
feature_vector = scipy.sparse.load_npz("vectorized_data.npz")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on .npy files from a trusted source.
y = np.load("labels.npy", allow_pickle=True)

# Replace the raw labels with integer codes and print the label -> code table.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
le_name_mapping = {
    name: code
    for name, code in zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_))
}
print("Label mapping: ", le_name_mapping)

# One entry per row; passed as the grouping key to the cross-validation call.
filenames = np.load("filenames.npy", allow_pickle=True)

# CSR format so the matrix can be sliced by row index.
X = csr_matrix(feature_vector)
# with_mean=False: no centering, which keeps the matrix sparse.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so held-out folds contribute to the scaling statistics -- confirm acceptable.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Hyper-parameters forwarded verbatim to the XGBoost classifier constructor.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
)


In [10]:
# Run the grouped cross-validation; file names serve as the grouping key.
classify(
    classifier=XGBClassifier,
    params=xgb_params,
    X=X_scaled,
    y=y,
    groups=filenames,
)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
