## Hyperparameter Tuning on Logistic Regression, Random Forest and XGBoostClassifier Models

#### Import libraries
Necessary installs are in requirements.txt

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import scipy
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
import sys

#### Write output to log file (hyperparameter_tuning_results.txt)

In [2]:
class Tee:
    """Minimal file-like object that duplicates everything written to it.

    Every stream passed to the constructor receives each `write` and `flush`
    call, in the order the streams were given.
    """

    def __init__(self, *streams):
        # Kept as the tuple of positional arguments.
        self.streams = streams

    def write(self, data):
        """Forward `data` unchanged to every underlying stream."""
        for sink in self.streams:
            sink.write(data)

    def flush(self):
        """Flush every underlying stream."""
        for sink in self.streams:
            sink.flush()

# Open the results log in write mode (truncates an existing file); rename the path as needed.
# NOTE(review): the handle is never closed in the visible code, and re-running this cell opens
# a new handle and truncates the file again.
log_file = open("hyperparameter_tuning_results.txt", "w") #rename as needed
# From here on every print() goes to both sys.__stdout__ and the log file.
# NOTE(review): under a Jupyter kernel sys.__stdout__ is presumably the kernel process's original
# stream rather than the cell output area - confirm that printed results still show up where expected.
sys.stdout = Tee(sys.__stdout__, log_file)

[{"name": "GroupKFold", "type": "ABCMeta", "fullType": "abc.ABCMeta"}, {"name": "LogisticRegression", "type": "type", "fullType": "type"}, {"name": "ParameterGrid", "type": "type", "fullType": "type"}, {"name": "RandomForestClassifier", "type": "ABCMeta", "fullType": "abc.ABCMeta"}, {"name": "StandardScaler", "type": "type", "fullType": "type"}, {"name": "StratifiedKFold", "type": "ABCMeta", "fullType": "abc.ABCMeta"}, {"name": "Tee", "type": "type", "fullType": "type"}, {"name": "XGBClassifier", "type": "type", "fullType": "type"}, {"name": "accuracy_score", "type": "function", "fullType": "function"}, {"name": "confusion_matrix", "type": "function", "fullType": "function"}, {"name": "csr_matrix", "type": "type", "fullType": "type"}, {"name": "f1_score", "type": "function", "fullType": "function"}, {"name": "log_file", "type": "TextIOWrapper", "fullType": "_io.TextIOWrapper"}, {"name": "np", "type": "module", "fullType": "module"}, {"name": "precision_score", "type": "function", "full

#### Evaluation / Hyperparameter Tuning Function

In [17]:
def evaluate(classifier_class, param_grid, X, y, group, n_splits=5):
    """
    Evaluate a classifier using GroupKFold cross-validation with hyperparameter tuning.

    For every fold, each valid parameter combination is fitted on the training part and
    scored with the weighted F1 on the held-out part. The best-scoring model of the fold
    is used for the per-class precision, recall and F1 and for the accuracy. All results
    (best parameters, confusion matrices, averaged metrics) are printed; nothing is returned.

    NOTE(review): the parameters are selected on the same held-out fold whose metrics are
    reported, so the printed numbers are optimistically biased. Use an inner validation
    split (nested cross-validation) if an unbiased estimate is needed.

    Parameters:
    classifier_class: The classifier class to be evaluated (e.g., LogisticRegression,
        RandomForestClassifier). Parameters are passed to the constructor only, so an
        XGBClassifier grid may contain "eval_metric".
    param_grid: A dictionary specifying the hyperparameter grid for tuning.
    X: Feature matrix (numpy array or sparse matrix); must support row indexing with an index array.
    y: Target vector (array-like, converted to a numpy array).
    group: Array-like structure containing group labels for each sample (e.g., filenames).
        Samples of one group never end up in both the training and the test part of a fold.
    n_splits: Number of GroupKFold folds (default 5).

    Raises:
    ValueError: If the grid contains no valid parameter combination.
    RuntimeError: If no parameter combination could be fitted for a fold.
    """
    y = np.asarray(y)
    class_labels = np.unique(y)
    gkf = GroupKFold(n_splits=n_splits)

    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    # Generate all combinations and drop the ones LogisticRegression cannot fit:
    # an l1 penalty is only supported by the "saga" and "liblinear" solvers.
    valid_param_combinations = []
    for params in ParameterGrid(param_grid):
        if classifier_class is LogisticRegression:
            l1_requested = params.get("penalty") == "l1"
            if l1_requested and params.get("solver") not in ("saga", "liblinear"):
                continue  # Skip invalid
        valid_param_combinations.append(params)

    if not valid_param_combinations:
        raise ValueError("param_grid does not contain any valid parameter combination.")

    # Use the `group` argument (not a notebook global) to build the folds.
    for fold, (train_index, test_index) in enumerate(gkf.split(X, y, groups=group)):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        best_score = -np.inf
        best_model = None
        best_params = None
        y_pred_fold = None

        for params in valid_param_combinations:
            try:
                # All parameters (including XGBoost's eval_metric) go to the constructor;
                # newer xgboost releases no longer accept eval_metric in fit().
                model = classifier_class(**params)
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_test_fold)
                score = f1_score(y_test_fold, y_val_pred, average="weighted", zero_division=0)
            except Exception as e:
                # Best effort: report the failing combination and keep tuning.
                print(f"Skipping params {params} due to error: {e}")
                continue

            if score > best_score:
                best_score = score
                best_model = model
                best_params = params
                # Keep the predictions so the winning model is not asked to predict twice.
                y_pred_fold = y_val_pred

        if best_model is None:
            raise RuntimeError(
                f"No parameter combination could be fitted for fold {fold}; see the skipped combinations above."
            )

        print(f"Best parameters for fold {fold}: {best_params}")

        print(best_model)

        # labels=class_labels keeps every per-class array the same length and order in all
        # folds, even when a class is missing from a fold's test part.
        precision = precision_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        recall = recall_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        f1 = f1_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        accuracy = accuracy_score(y_test_fold, y_pred_fold)

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        accuracies.append(accuracy)

        cm = confusion_matrix(y_test_fold, y_pred_fold, labels=class_labels)
        print(f"Confusion matrix fold {fold}\n", cm)

    print("--------------------------------------------------------\n")
    print("Average Accuracy: ", np.mean(accuracies), "\n")

    metric_table = (
        ("Precision", precisions),
        ("Recall", recalls),
        ("F1-Score", f1_scores),
    )
    for position, (metric_name, fold_values) in enumerate(metric_table):
        lead = "" if position == 0 else "\n"
        # Overall value: mean over all folds and classes; per-class value: mean over folds.
        print(f"{lead}Average {metric_name}: ", np.mean(fold_values))
        print(f"Averaged {metric_name} per Class:")
        per_class = np.mean(fold_values, axis=0)
        for label, value in zip(class_labels, per_class):
            print(f"Class {label}: {value:.4f}")



#### Load Data

In [4]:
# Sparse feature matrix loaded from disk, converted to CSR.
feature_vector_all = scipy.sparse.load_npz("vectorized_data.npz")
X = csr_matrix(feature_vector_all)

# Labels and per-sample file names (the file names are later passed as CV groups).
# NOTE(review): allow_pickle=True can execute code embedded in the file - only load trusted data.
y = np.load("labels.npy", allow_pickle=True)
filenames = np.load("filenames.npy", allow_pickle=True)

# Scale without centering (with_mean=False), as required for sparse input.
# NOTE(review): the scaler is fitted on all samples before cross-validation, so the test
# folds influence the scaling statistics.
scaler = StandardScaler(with_mean=False)
scaler.fit(X)
X_scaled = scaler.transform(X)

#### Run Hyperparameter tuning on different models

#### Random Forest

In [9]:
# Hyperparameter grid for the random forest; every value list is tried in combination.
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "class_weight": [None, "balanced"],
    "n_jobs": [-1],
    # Fixed seed so the forests, and therefore the selected parameters, are reproducible.
    "random_state": [42],
}

# Tree ensembles do not need feature scaling, so the unscaled matrix X is used.
print("\n=== Random Forest ===")
evaluate(RandomForestClassifier, rf_param_grid, X, y, filenames)

#### Logistic Regression


In [10]:
# Hyperparameter grid for logistic regression.
# evaluate() drops the combinations that pair the l1 penalty with the lbfgs solver.
# NOTE(review): earlier runs hit the iteration limit with max_iter=400 (ConvergenceWarning);
# consider a larger value if the extra runtime is acceptable.
lr_param_grid = {
    "max_iter": [400],
    "C": [1, 10, 0.1],
    "solver": ["lbfgs", "saga"],
    "penalty": ["l1", "l2"],
    "class_weight": [None, "balanced"],
    "n_jobs": [-1],
    # The saga solver shuffles the data; a fixed seed makes its results reproducible.
    "random_state": [42],
}
print("\n=== Logistic Regression ===")
evaluate(LogisticRegression, lr_param_grid, X_scaled, y, filenames)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=400).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=400).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=400).
You might also want to sca

#### Different Code for XGBoost

In [12]:
def classify_xgboost(model_class, parameters, X, y, groups, n_splits=10):
    """
    Tune and evaluate a classifier with GroupKFold cross-validation and print the results.

    For every fold, each combination of `parameters` is fitted on the training part and
    scored with the weighted F1 on the held-out part. The best combination of the fold
    provides the predictions for the per-class precision, recall and F1, the accuracy and
    the confusion matrix. Nothing is returned.

    NOTE(review): the parameters are selected on the same held-out fold whose metrics are
    reported, so the printed numbers are optimistically biased.

    Parameters:
    model_class: The classifier class to instantiate with each parameter combination
        (e.g., XGBClassifier).
    parameters: A dictionary specifying the hyperparameter grid for tuning.
    X: Feature matrix (numpy array or sparse matrix); must support row indexing with an index array.
    y: Target vector (array-like, converted to a numpy array).
    groups: Array-like structure containing group labels for each sample (e.g., filenames).
    n_splits: Number of GroupKFold folds (default 10).

    Raises:
    ValueError: If the grid contains no parameter combination.
    """
    y = np.asarray(y)
    class_labels = np.unique(y)
    gkf = GroupKFold(n_splits=n_splits)

    param_combinations = list(ParameterGrid(parameters))
    if not param_combinations:
        raise ValueError("parameters does not contain any parameter combination.")

    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    # Tracked across ALL folds; must not be reset inside the fold loop.
    best_overall_params = None
    best_overall_score = -np.inf

    for fold, (train_index, test_index) in enumerate(gkf.split(X, y, groups=groups)):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        best_score = -np.inf
        best_params = None
        y_pred_fold = None

        for params in param_combinations:
            model = model_class(**params)
            model.fit(X_train_fold, y_train_fold)
            y_val_pred = model.predict(X_test_fold)
            score = f1_score(y_test_fold, y_val_pred, average="weighted", zero_division=0)

            if score > best_score:
                best_score = score
                best_params = params
                # Keep the predictions so the winning model is not asked to predict twice.
                y_pred_fold = y_val_pred

        # Remember the single best fold result seen so far.
        if best_score > best_overall_score:
            best_overall_score = best_score
            best_overall_params = best_params

        precision = precision_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        recall = recall_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        f1 = f1_score(y_test_fold, y_pred_fold, labels=class_labels, average=None, zero_division=0)
        accuracy = accuracy_score(y_test_fold, y_pred_fold)

        # now all arrays have length = len(class_labels)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        accuracies.append(accuracy)

        cm = confusion_matrix(y_test_fold, y_pred_fold, labels=class_labels)
        print(f"Confusion matrix fold {fold}\n", cm)
        print(f"Best Parameters fold {fold}: {best_params}")

    print("--------------------------------------------------------\n")
    print("Average Accuracy: ", np.mean(accuracies), "\n")

    metric_table = (
        ("Precision", precisions),
        ("Recall", recalls),
        ("F1-Score", f1_scores),
    )
    for position, (metric_name, fold_values) in enumerate(metric_table):
        lead = "" if position == 0 else "\n"
        # Overall value: scalar mean over all folds and classes; per-class value: mean over folds.
        print(f"{lead}Average {metric_name}: ", np.mean(fold_values))
        print(f"Averaged {metric_name} per Class:")
        per_class = np.mean(fold_values, axis=0)
        for label, value in zip(class_labels, per_class):
            print(f"Class {label}: {value:.4f}")

    # Highest single-fold weighted F1 over all folds and the parameters that achieved it.
    print("Best Overall Parameters: ", best_overall_params)
    print("Best Overall F1-Score: ", best_overall_score)


    


In [5]:
# Hyperparameter grid for XGBoost; every value list is tried in combination.
xgb_parameters = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    eval_metric=["mlogloss"],
)

# Encode the class names as integers before handing them to the XGBoost classifier.
# NOTE(review): this overwrites `y`; cells run afterwards see the encoded labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
le_name_mapping = {
    class_name: class_code
    for class_name, class_code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("Label mapping: ", le_name_mapping)

classify_xgboost(XGBClassifier, xgb_parameters, X_scaled, y, filenames)


Label mapping:  {'com': 0, 'dec': 1, 'dir': 2, 'exp': 3, 'icu': 4, 'rep': 5, 'soc': 6, 'xpa': 7}
