## Setup
For benchmarking classical ML models, deep learning approaches, and transformer-based pre-trained language models against our LLM applications, we trained each with the same annotated training data as our LLM fine-tuning, albeit without the prompts. To identify the best-performing model setup, we conducted several iterations with different embedding approaches and hyperparameter settings.
For the main analysis in Section 4 of the paper, we used representative training sets consisting of n = 2,000 release notes and labels. For the additional analysis in Section 5.4 of the paper, we varied the training data distribution (representative and balanced) and size (n = 100, 250, 500, 1,000). To ensure compatibility with scikit-learn, labels were zero-indexed. A fixed hold-out validation set (n = 1,000) was reserved for final evaluation.

#### Imports
 See `requirements.txt` for full dependency versions

In [None]:
import os
import glob
import json
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch
import torch.nn as nn
import optuna
import sys
sys.path.insert(0, os.path.abspath(os.path.join("..", "src")))

from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from glove_vectorizer import GloveVectorizer
from skorch import NeuralNetClassifier
from textcnn import TorchTokenizer, TextCNN
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from optuna.samplers import TPESampler

#### Global Paths, Directories, Variables, and Classifier Instances

In [None]:
# Root directory of the Demo Study (parent of the current working directory)
DEMO_PATH = os.path.abspath(os.path.join(".."))

# All input, output, and artefact locations are resolved relative to DEMO_PATH
TRAIN_GLOB = os.path.join(DEMO_PATH, 'training_validation_data', 'demo_app_updates_train*.csv')
VAL_PATH   = os.path.join(DEMO_PATH, 'training_validation_data', 'demo_app_updates_validation_real_1000.csv')
OUTPUT_DIR = os.path.join(DEMO_PATH, 'output_data')
TABLE_DIR  = os.path.join(DEMO_PATH, 'tables')
FIG_DIR    = os.path.join(DEMO_PATH, 'figures')
GLOVE_DIR  = os.path.join(DEMO_PATH, 'embedding_files')  # same location as before, now built like the other paths
MODELS_DIR = os.path.join(DEMO_PATH, 'models')

# Parallelism structure: OUTER_JOBS is the worker count handed to the hyperparameter
# search, INNER_JOBS the worker count handed to each estimator.
# Keep INNER_JOBS * OUTER_JOBS below the number of available cores.
INNER_JOBS = 5
OUTER_JOBS = 10

# CV & reproducibility: one seed and one stratified 3-fold splitter shared by every search;
# model selection is scored by macro-averaged F1.
RANDOM_STATE = 94032
CV     = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
SCORER = make_scorer(f1_score, average="macro")

# Define classifier instances
# One unfitted estimator per model family, keyed by the name used in result tables,
# figure folders, and prediction column names. RANDOM_STATE is passed to every
# estimator constructed with a seed below, and INNER_JOBS to every estimator
# constructed with n_jobs (MultinomialNB gets neither; SVC gets no n_jobs).
CLASSIFIERS = {
    "RandomForest":      RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=INNER_JOBS),
    "LogisticRegression":LogisticRegression(random_state=RANDOM_STATE, n_jobs=INNER_JOBS, max_iter=1000),
    "KNeighbors":        KNeighborsClassifier(n_jobs=INNER_JOBS),
    "NaiveBayes":        MultinomialNB(),
    # NOTE(review): probability=True is set, but predict_proba is not called in the
    # visible code — confirm it is needed, since it typically makes SVC fitting slower.
    "SVM":               SVC(random_state=RANDOM_STATE, probability=True),
    "XGBoost":           XGBClassifier(random_state=RANDOM_STATE, n_jobs=INNER_JOBS, eval_metric="mlogloss"),
}

# Define hyperparameter ranges for each classifier
# Keys carry the "clf__" prefix so they address the pipeline step named "clf".
# Reading the distributions: scipy's randint(low, high) draws integers in [low, high - 1];
# scipy's uniform(loc, scale) draws floats in [loc, loc + scale] (the second argument is
# a width, not an upper bound). The ranges quoted in the comments below follow that rule.
CLASSIFIER_PARAM_TEMPLATES = {
    "RandomForest": {
        "clf__n_estimators"      : randint(50, 501),    # Number of trees (50-500).  Vary forest size to trade off bias (few trees) vs. variance & runtime (many trees).
        "clf__max_depth"         : randint(5, 101),     # Maximum tree depth (5-100).  Controls model complexity; shallow trees generalise, deep trees capture nuance.
        "clf__min_samples_split" : randint(2, 21),      # Min samples to split (2-20).  Larger values prevent very small, noisy splits.
        "clf__min_samples_leaf"  : randint(1, 11),      # Min samples at leaf (1-10).  Smooths predictions and combats over-fitting on rare n-grams.
        "clf__max_features"      : uniform(0.1, 0.9),   # Feature subsampling (10-100 %).  Forces diversity among trees; useful when TF-IDF has many correlated features.
        "clf__criterion"         : ["gini", "entropy"], # Split impurity measure.  Both are common; entropy can capture class imbalance nuances.
        "clf__bootstrap"         : [True, False],       # Use bootstrap samples or full data.  Testing both may improve stability for sparse text.
    },
    "LogisticRegression": {
        "clf__C"      : uniform(0.01, 10),              # Inverse reg. strength (0.01-10.01).  Explores from strong regularisation (simple model) to weak (complex).
        "clf__penalty": ["l2"],                         # L2 penalty (stable for multiclass TF-IDF).  Kept fixed per literature best-practice.
        "clf__solver" : ["lbfgs", "saga"],              # Optimisers: LBFGS (dense) vs. SAGA (sparse, large data).  Choice may affect convergence speed.
    },
    "KNeighbors": {
        "clf__n_neighbors": randint(1, 31),             # k (1-30).  Smaller k captures local nuance; larger k smooths decision boundaries.
        "clf__weights"    : ["uniform", "distance"],    # Voting scheme.  Distance weighting often helps when neighbours at differing distances.
        "clf__leaf_size"  : randint(10, 51),            # Ball-tree leaf size (10-50).  Impacts search speed vs. memory for high-dim. TF-IDF vectors.
    },

    # NaiveBayes hyperparameters:
    "NaiveBayes": {
        "clf__alpha"    : uniform(1e-6, 1.0),           # Additive smoothing (1e-6 to ~1).  Controls how aggressively rare terms are down-weighted.
        "clf__fit_prior": [True, False],                # Learn class priors or assume uniform.  Testing both handles potential label imbalance.
    },

    # SVM hyperparameters:
    "SVM": {
        "clf__C"     : uniform(0.1, 10),                # Soft-margin cost (0.1-10.1).  Balances margin width vs. mis-classification tolerance.
        "clf__kernel": ["linear", "rbf", "poly"],       # Kernels: linear (fast for TF-IDF), RBF & poly for non-linear patterns.
        "clf__gamma" : ["scale", "auto"],               # Kernel coefficient heuristics.  Both common; effect only for RBF/poly.
        "clf__degree": randint(2, 6),                   # Polynomial degree (2-5).  Higher degree leads to more complex decision surface (poly kernel only).
    },

    # XGBoost hyperparameters:
    "XGBoost": {
        "clf__n_estimators"     : randint(50, 501),     # Boosting rounds (50-500).  More rounds improve fit but risk overfitting & longer training.
        "clf__max_depth"        : randint(3, 11),       # Tree depth (3-10).  Shallow trees reduce overfitting on sparse high-dim. text features.
        "clf__learning_rate"    : uniform(0.01, 0.3),   # Shrinkage (0.01-0.31).  Lower LR needs more trees but can yield better generalisation.
        "clf__subsample"        : uniform(0.5, 0.5),    # Row subsampling (50-100 %).  Encourages diversity among trees; mitigates overfitting.
        "clf__colsample_bytree" : uniform(0.5, 0.5),    # Column subsampling (50-100 %).  Helpful with large TF-IDF vocabularies to speed up training.
    },
}

## Classical ML models

### TF-IDF Pipeline
To identify the optimal model configuration for update classification, we systematically tuned feature representations and classifier hyperparameters using an automated and reproducible pipeline via the scikit-learn (Pedregosa et al., 2011) and XGBoost  (Chen & Guestrin, 2016) libraries.
We used scikit-learn’s `RandomizedSearchCV` with 50 randomly sampled parameter sets per classifier for hyperparameter optimization. Feature representations and classifiers were tuned jointly.

In [None]:
# Search space for the TF-IDF vectorizer; the "tfidf__" prefix addresses the pipeline
# step named "tfidf".
TFIDF_PARAMS = dict(
    # Vocabulary size (2 k – 10 k): richness of the feature set vs. sparsity and runtime.
    tfidf__max_features=randint(2000, 10001),
    # Unigrams only, up to bigrams, or up to trigrams.
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # Minimum document frequency (1–4): prunes extremely rare tokens.
    tfidf__min_df=randint(1, 5),
    # Maximum document frequency (0.6–1.0): filters very common terms.
    tfidf__max_df=uniform(0.6, 0.4),
)

# Per-classifier joint search space: vectorizer settings first, then the classifier's own
# ranges, so RandomizedSearchCV tunes both together.
PARAM_DISTS = {
    model_name: dict(TFIDF_PARAMS, **CLASSIFIER_PARAM_TEMPLATES[model_name])
    for model_name in CLASSIFIERS
}

#### Model Training and Evaluation

The following function `run_search_tfidf` handles model training, hyperparameter search, and evaluation on a fixed validation set. Cross-validation results are visualized.

In [None]:
def run_search_tfidf(clf_name, train_path, df_val):
    """
    Tune a TF-IDF + classifier pipeline on one training file and score it on the hold-out set.

    Parameters
    ----------
    clf_name : str
        Key into ``CLASSIFIERS`` and ``PARAM_DISTS``.
    train_path : str
        Training CSV with ``whats_new`` (text) and ``update_classification`` (label) columns.
    df_val : pandas.DataFrame
        Preloaded validation data with the same two columns.

    Returns
    -------
    tuple
        ``(summary, cv_df, y_pred)``: a dict of CV/hold-out metrics and best parameters,
        the search's ``cv_results_`` as a DataFrame tagged with the dataset name, and the
        hold-out predictions (labels shifted down by one, as used for training).

    Side effects: writes a histogram of mean CV scores to ``FIG_DIR/<clf_name>/``.
    """
    # Dataset tag = training file name without its extension
    dataset_name, _ext = os.path.splitext(os.path.basename(train_path))
    fig_sub = os.path.join(FIG_DIR, clf_name)
    os.makedirs(fig_sub, exist_ok=True)

    # Texts and labels (shifted down by one) for training and hold-out data
    train_frame = pd.read_csv(train_path)
    X_train, y_train = train_frame['whats_new'], train_frame['update_classification'] - 1
    X_val, y_val = df_val['whats_new'], df_val['update_classification'] - 1

    # Joint randomized search over vectorizer and classifier settings
    search = RandomizedSearchCV(
        Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", CLASSIFIERS[clf_name])]),
        param_distributions=PARAM_DISTS[clf_name],
        n_iter=50,
        scoring=SCORER,
        cv=CV,
        n_jobs=OUTER_JOBS,
        random_state=RANDOM_STATE,
        verbose=1,
        refit=True,
    )
    search.fit(X_train, y_train)

    # Values exposing .item() (e.g. NumPy scalars) are unwrapped so json.dumps accepts them
    best_params = {}
    for param, value in search.best_params_.items():
        best_params[param] = value.item() if hasattr(value, "item") else value

    # Hold-out evaluation with the refitted best pipeline
    y_pred = search.predict(X_val)
    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)

    summary = dict(
        dataset=dataset_name,
        model=clf_name,
        best_cv_macro_f1=search.best_score_,
        best_params=json.dumps(best_params),
        holdout_macro_f1=report['macro avg']['f1-score'],
        holdout_weighted_f1=report['weighted avg']['f1-score'],
        holdout_accuracy=accuracy_score(y_val, y_pred),
    )
    # Per-class F1: only the report entries keyed by a class label (digit strings)
    summary.update(
        (f'label_{label}_f1', metrics['f1-score'])
        for label, metrics in report.items()
        if label.isdigit()
    )

    # Distribution of mean CV scores across the sampled configurations
    cv_df = pd.DataFrame(search.cv_results_)
    cv_df['dataset'] = dataset_name

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(cv_df['mean_test_score'], bins=10, color='grey', edgecolor='white')
    ax.set_xlabel('Mean (CV) Macro Avg. F1-Score')
    ax.set_ylabel('Count')
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.grid(False)

    histogram_path = os.path.join(fig_sub, f"tfidf_{dataset_name}_histogram.jpg")
    plt.savefig(histogram_path, dpi=300, bbox_inches='tight')
    plt.close()

    return summary, cv_df, y_pred

#### Execution
We loop over all available training splits and classifiers, optimizing each configuration. Results and predictions are saved for further analysis.


In [None]:
# Load validation data
df_val = pd.read_csv(VAL_PATH)

# Create the export directories up front so a long search cannot fail at the final write
os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize containers for results and predictions
all_summaries = []
cv_results_by_model = {name: [] for name in CLASSIFIERS}
model_preds = {}

# Loop through different training files and classifiers.
# glob order depends on the filesystem; sorting keeps the order of rows in the summary
# table and of prediction columns identical across machines and reruns.
for train_csv in sorted(glob.glob(TRAIN_GLOB)):
    base = os.path.splitext(os.path.basename(train_csv))[0]
    parts = base.split('_')  # parts == ["demo","app","updates","train","real","2000"]
    split_type = parts[-2]   # "real" or "equal"
    size       = parts[-1]   # "2000", etc.
    for name in CLASSIFIERS:
        print(f"Running {name} on {train_csv}")
        summary, cv_df, y_pred = run_search_tfidf(name, train_csv, df_val)
        all_summaries.append(summary)
        cv_results_by_model[name].append(cv_df)
        # Store predictions for this model and dataset combination
        key = f"tfidf__{split_type}__{size}__{name}"
        model_preds[key] = y_pred

# Combine all summary metrics into a single DataFrame and export to CSV
combined_df = pd.DataFrame(all_summaries)
combined_df.to_csv(
    os.path.join(TABLE_DIR, 'tfidf_all_models_all_datasets_summary.csv'),
    index=False
)

# Append model predictions to the validation set and export results.
# Predictions are positional: row i of every *_pred column belongs to row i of df_val.
val_with_preds = df_val.copy()
for key, preds in model_preds.items():
    colname = f"{key}_pred"
    val_with_preds[colname] = preds
val_with_preds.to_csv(os.path.join(OUTPUT_DIR, 'validation_with_model_preds_NLP.csv'), index=False)

### GloVe Pipeline
To benchmark word-embedding features, we convert each release note into a dense vector using pre-trained GloVe models.
For every embedding size, we keep the vectors fixed and tune only the downstream classifier hyperparameters (identical search spaces as in the TF-IDF pipeline).

#### GloVe Embedding Preparation
We load several pre-trained GloVe models (e.g., 100 d, 300 d) from disk, initialise one `GloveVectorizer` per embedding file, and cache the resulting document embeddings of the validation set and of every training split for efficient reuse during hyperparameter search.

In [None]:
# Available GloVe models: short name -> path of the vector file inside GLOVE_DIR
GLOVE_FILES = {
    "6B-100d":   os.path.join(GLOVE_DIR, "glove.6B.100d.txt"),
    "6B-300d":   os.path.join(GLOVE_DIR, "glove.6B.300d.txt"),
    "42B-300d":  os.path.join(GLOVE_DIR, "glove.42B.300d.txt"),
    "840B-300d": os.path.join(GLOVE_DIR, "glove.840B.300d.txt"),
}

In [None]:
# Validation texts and labels (shifted down by one)
df_val = pd.read_csv(VAL_PATH)
X_val_text = df_val['whats_new']
y_val = df_val['update_classification'] - 1

# One GloveVectorizer per embedding file.
# Per the original author, fit() "just loads the file".
# NOTE(review): confirm GloveVectorizer.fit learns nothing from the validation texts it is given.
glove_vecs = {}
for name, path in GLOVE_FILES.items():
    print(f"[{name}] loading embeddings…")
    vec = GloveVectorizer(glove_path=path)
    vec.fit(X_val_text)
    glove_vecs[name] = vec

# Validation features, computed once per embedding
X_val_glove = {}
for name, vec in glove_vecs.items():
    X_val_glove[name] = vec.transform(X_val_text)

# Training features and labels, cached per (split_type, size) for every embedding
X_train_glove = {}
for name in glove_vecs:
    X_train_glove[name] = {}
y_train_glove = {}
train_splits  = {}

for train_csv in glob.glob(TRAIN_GLOB):
    # File names end in "..._train_<split_type>_<size>.csv"
    base = os.path.splitext(os.path.basename(train_csv))[0]
    parts = base.split('_')
    split_type = parts[-2]        # "real" or "equal"
    size = int(parts[-1])         # number of training examples

    df_tr = pd.read_csv(train_csv)
    key = (split_type, size)
    y_train_glove[key] = df_tr['update_classification'] - 1

    texts = df_tr['whats_new']
    for name in glove_vecs:
        vec = glove_vecs[name]
        X_train_glove[name][key] = vec.transform(texts)

#### Model Training and Evaluation
The helper function `run_search_glove` mirrors `run_search_tfidf`:
it trains a pipeline on pre-computed GloVe features, performs `RandomizedSearchCV`, evaluates on the hold-out validation set, and returns summary metrics plus predictions.

In [None]:
def run_search_glove(clf_name, glove_name, Xg, yg, X_val, y_val, split_type, size):
    """
    Tune one classifier on pre-computed GloVe features and score it on the hold-out set.

    Parameters
    ----------
    clf_name : str
        Key into ``CLASSIFIERS`` and ``CLASSIFIER_PARAM_TEMPLATES``.
    glove_name : str
        Short name of the embedding; used in the dataset tag, figure folder, and summary.
    Xg, yg
        Training features and labels passed to ``RandomizedSearchCV.fit``.
    X_val, y_val
        Hold-out features and labels.
    split_type, size
        Identify the training split; only used to build the dataset tag.

    Returns
    -------
    tuple
        ``(summary, cv_df, y_pred)``: metrics and best parameters, the search's
        ``cv_results_`` as a DataFrame tagged with the dataset name, and hold-out predictions.

    Side effects: writes a histogram of mean CV scores to ``FIG_DIR/<glove_name>/<clf_name>/``.
    """
    dataset_name = f"{glove_name}_{split_type}_{size}"
    fig_sub = os.path.join(FIG_DIR, glove_name, clf_name)
    os.makedirs(fig_sub, exist_ok=True)

    # The single-step pipeline keeps the "clf__" parameter prefix valid
    search = RandomizedSearchCV(
        estimator=Pipeline(steps=[('clf', CLASSIFIERS[clf_name])]),
        param_distributions=CLASSIFIER_PARAM_TEMPLATES[clf_name],
        n_iter=50,
        scoring=SCORER,
        cv=CV,
        n_jobs=OUTER_JOBS,
        random_state=RANDOM_STATE,
        verbose=1,
        refit=True,
    )
    search.fit(Xg, yg)

    # Values exposing .item() (e.g. NumPy scalars) are unwrapped so json.dumps accepts them
    best_params = {}
    for param, value in search.best_params_.items():
        best_params[param] = value.item() if hasattr(value, "item") else value

    # Hold-out evaluation with the refitted best pipeline
    y_pred = search.predict(X_val)
    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)

    summary = dict(
        dataset=dataset_name,
        model=clf_name,
        vectorizer=f"glove-{glove_name}",
        best_cv_macro_f1=search.best_score_,
        best_params=json.dumps(best_params),
        holdout_macro_f1=report['macro avg']['f1-score'],
        holdout_weighted_f1=report['weighted avg']['f1-score'],
        holdout_accuracy=accuracy_score(y_val, y_pred),
    )
    # Per-class F1: only the report entries keyed by a class label (digit strings)
    summary.update(
        (f'label_{label}_f1', metrics['f1-score'])
        for label, metrics in report.items()
        if label.isdigit()
    )

    # Distribution of mean CV scores across the sampled configurations
    cv_df = pd.DataFrame(search.cv_results_)
    cv_df['dataset'] = dataset_name

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(cv_df['mean_test_score'], bins=10, color='grey', edgecolor='white')
    ax.set_xlabel('Mean CV Macro Avg. F1-Score')
    ax.set_ylabel('Count')
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.grid(False)

    histogram_path = os.path.join(fig_sub, f"glove_{dataset_name}_histogram.jpg")
    plt.savefig(histogram_path, dpi=300, bbox_inches='tight')
    plt.close()

    return summary, cv_df, y_pred

#### Execution

We iterate over every **training split × GloVe model × classifier** combination.
For each configuration we run the search, store cross-validation results, and append hold-out predictions for later analysis.

In [None]:
# Initialize containers for results and predictions
all_summaries       = []
cv_results_by_model = {c: [] for c in CLASSIFIERS}
model_preds         = {}

# Create the export directories up front so a long search cannot fail at the final write
os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Loop through each GloVe model, then each training-set size and classifier
for glove_name, size_dict in X_train_glove.items():
    for (split_type, size), Xg in sorted(size_dict.items()):
        yg = y_train_glove[(split_type, size)]

        for clf_name in CLASSIFIERS:
            # NOTE(review): NaiveBayes is skipped for GloVe features — presumably because
            # MultinomialNB needs non-negative inputs, which dense embeddings do not
            # guarantee; confirm.
            if clf_name == "NaiveBayes":
                continue

            print(f"Running {clf_name} on glove={glove_name}, {split_type} n={size}")
            summary, cv_df, y_pred = run_search_glove(
                clf_name, glove_name,
                Xg, yg,
                X_val_glove[glove_name],
                y_val,
                split_type,
                size
            )
            all_summaries.append(summary)
            cv_results_by_model[clf_name].append(cv_df)
            key_name = f"{glove_name}_{split_type}_{size}__{clf_name}"
            model_preds[key_name] = y_pred

# Combine summary metrics into a single DataFrame and export to CSV
glove_df = pd.DataFrame(all_summaries)
glove_df.to_csv(os.path.join(TABLE_DIR, 'glove_all_models_all_datasets_summary.csv'), index=False)

# Merge GloVe predictions into the shared predictions file. If the TF-IDF cell has not
# produced that file yet, start from the plain validation data instead of crashing
# after the whole search has finished.
out_path = os.path.join(OUTPUT_DIR, 'validation_with_model_preds_NLP.csv')
if os.path.exists(out_path):
    val_with_glove_preds = pd.read_csv(out_path)
else:
    val_with_glove_preds = df_val.copy()
for key, preds in model_preds.items():
    colname = f"{key}_pred"
    val_with_glove_preds[colname] = preds
val_with_glove_preds.to_csv(out_path, index=False)

## Convolutional Neural Network
We evaluate a convolutional neural network (TextCNN) that leverages the large 42 B-token, 300-d GloVe embeddings. For every training split, we tune CNN hyper-parameters with `RandomizedSearchCV` and evaluate on the fixed validation set.

In [None]:
# Path to the 42 B-token GloVe file (300-d vectors)
# Looked up in GLOVE_FILES, so the cell defining GLOVE_FILES must have been run first.
GLOVE_42B_PATH = GLOVE_FILES["42B-300d"]  # Important: need to load GLOVE_FILES before

# Define hyper-parameter search space for the TextCNN pipeline
# Key prefixes route each value: "tok__" -> tokenizer step, "clf__module__" -> the
# TextCNN module wrapped by skorch, plain "clf__" -> the skorch training loop.
# randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
CNN_PARAM_DIST = {
    "tok__num_words"           : randint(10_000, 30_001),  # Vocabulary size (10–30 k).  Balances lexical coverage against memory & training time.
    "tok__seq_len"             : randint(100, 301),        # Max sequence length (100–300).  Truncates / pads texts; trades context for speed.
    "clf__module__n_filters"   : randint(64, 257),         # Conv-filter count (64–256).  Controls capacity to capture local n-gram patterns.
    "clf__module__kernel_size" : randint(3, 9),            # Kernel width (3–8).  Learns n-gram windows from tri-grams to octo-grams.
    "clf__module__dense_units" : randint(32, 257),         # Dense layer width (32–256).  Governs representational power before softmax.
    "clf__module__dropout"     : uniform(0.1, 0.4),        # Dropout rate (0.1–0.5).  Regularises network to mitigate over-fitting.
    "clf__module__trainable"   : [False, True],            # Freeze vs. fine-tune embeddings.  Tests benefit of adapting GloVe vectors.
    "clf__batch_size"          : randint(32, 129),         # Mini-batch size (32–128).  Larger batches speed training but need more GPU RAM.
    "clf__max_epochs"          : randint(5, 16),           # Training epochs (5–15).  Training length is searched directly; NOTE(review): no early-stopping callback is visible in this notebook — confirm.
}

#### Embedding Matrix Helper
Utility to construct a PyTorch tensor from the 42 B GloVe file for the current vocabulary.

In [None]:
def glove42b_matrix(vocab, glove_path, dim=300, seed=None):
    """Return embedding matrix aligned with `vocab`.

    Parameters
    ----------
    vocab
        Vocabulary object supporting ``len(vocab)``, ``vocab[word] -> row index`` and a
        ``vocab.stoi`` container used for membership tests.
    glove_path : str
        UTF-8 GloVe text file: one token per line followed by `dim` space-separated floats.
    dim : int
        Vector width; must match the width stored in the file.
    seed : int or None
        Seed for the random rows of tokens missing from the file. ``None`` (default)
        draws from NumPy's global random state, exactly as before.

    Returns
    -------
    torch.Tensor
        Float32 tensor of shape ``(len(vocab), dim)``. Rows of tokens found in the file hold
        their GloVe vector; all other rows keep their N(0, 0.05) initialisation.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    mat = rng.normal(0, .05, (len(vocab), dim)).astype('float32')
    with open(glove_path, encoding='utf-8') as fh:
        for line in fh:
            # Split from the right so the last `dim` fields are always the vector.
            # A plain split() mis-parses tokens that themselves contain whitespace
            # and fails to unpack on blank lines.
            fields = line.rstrip().rsplit(' ', dim)
            if len(fields) != dim + 1:
                continue  # blank, header, or otherwise malformed line
            w, vals = fields[0], fields[1:]
            if w in vocab.stoi:
                mat[vocab[w]] = np.array(vals, dtype='float32')
    return torch.tensor(mat)

#### Model Training and Evaluation
The helper `run_search_cnn` mirrors the TF-IDF and GloVe routines:
it builds a `Pipeline(tok, TextCNN)`, performs random search, evaluates on the hold-out set, and returns metrics plus predictions.

In [None]:
def run_search_cnn(train_path, df_val, glove_path):
    """
    Train & tune a TextCNN initialised with GloVe embeddings read from `glove_path`.

    Whether the embedding layer stays frozen is itself part of the search
    (`clf__module__trainable` in CNN_PARAM_DIST).

    Parameters
    ----------
    train_path : path of the training CSV (columns `whats_new`, `update_classification`).
    df_val     : preloaded validation DataFrame with the same two columns.
    glove_path : path of the GloVe text file passed to `glove42b_matrix`.

    Returns
    -------
    (summary, cv_df, y_pred): dict of CV / hold-out metrics and best parameters,
    the search's `cv_results_` as a DataFrame tagged with the dataset name, and the
    hold-out predictions.

    Side effects: writes a histogram to FIG_DIR/TextCNN42B/ and saves the best
    network's parameters to MODELS_DIR (only the network step is saved here,
    not the tokenizer step).
    """
    dataset = os.path.splitext(os.path.basename(train_path))[0]
    fig_sub = os.path.join(FIG_DIR, "TextCNN42B")
    os.makedirs(fig_sub, exist_ok=True)

    # Load training data
    df_train = pd.read_csv(train_path)
    y_train = df_train['update_classification'] - 1
    X_train = df_train['whats_new']

    # Extract target and features from the preloaded validation DataFrame
    y_val = df_val['update_classification'] - 1
    X_val = df_val['whats_new']

    # Tokenize training text and build embedding matrix
    # NOTE(review): `tok` is fitted here with its default settings, and both `emb` and
    # `module__vocab_size` are derived from that vocabulary, while the search below
    # varies `tok__num_words` on clones of `tok`. Confirm that TorchTokenizer never
    # emits indices outside the embedding matrix built here.
    tok       = TorchTokenizer().fit(X_train)
    emb       = glove42b_matrix(tok.vocab, glove_path)
    n_classes = len(np.unique(y_train))

    # Wrap TextCNN in skorch and build sklearn pipeline
    net = NeuralNetClassifier(
        TextCNN,
        module__vocab_size     = len(tok.vocab),
        module__emb_matrix     = emb,
        module__n_classes      = n_classes,
        device                 = 'cuda' if torch.cuda.is_available() else 'cpu',
        criterion              = nn.CrossEntropyLoss,
        optimizer              = torch.optim.Adam,
        iterator_train__shuffle= True,
        verbose                = 0
    )

    # Perform randomized search over CNN hyper-parameters
    # NOTE(review): n_jobs=OUTER_JOBS runs several fits in parallel; when device is
    # 'cuda' they all share the same GPU — confirm memory headroom.
    pipe = Pipeline([('tok', tok), ('clf', net)])
    rs   = RandomizedSearchCV(
        pipe,
        CNN_PARAM_DIST,
        n_iter        = 30,
        scoring       = SCORER,
        cv            = CV,
        n_jobs        = OUTER_JOBS,
        random_state  = RANDOM_STATE,
        verbose       = 2,
        refit         = True
    ).fit(X_train, y_train)

    # Evaluate best model on the hold-out validation set
    y_pred = rs.predict(X_val)
    rpt    = classification_report(y_val, y_pred, output_dict=True, zero_division=0)

    # Compile summary dictionary with performance metrics & best params
    # (values exposing .item(), e.g. NumPy scalars, are unwrapped so json.dumps accepts them)
    summary = {
        'dataset'             : dataset,
        'model'               : "TextCNN42B",
        'best_cv_macro_f1'    : rs.best_score_,
        'best_params'         : json.dumps({
                                     k:(v.item() if hasattr(v,'item') else v)
                                     for k,v in rs.best_params_.items()
                                 }),
        'holdout_macro_f1'    : rpt['macro avg']['f1-score'],
        'holdout_weighted_f1' : rpt['weighted avg']['f1-score'],
        'holdout_accuracy'    : accuracy_score(y_val, y_pred)
    }
    # Per-class F1: only report entries whose key is a class label (digit string)
    for lbl, m in rpt.items():
        if str(lbl).isdigit():
            summary[f'label_{lbl}_f1'] = m['f1-score']

    # Plot histogram of mean CV scores from the search results
    cv_df = pd.DataFrame(rs.cv_results_)
    cv_df['dataset'] = dataset
    fig, ax = plt.subplots(figsize=(8,5))
    ax.hist(cv_df['mean_test_score'], bins=10,color='grey', edgecolor='white')
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.set_xlabel('Mean CV Macro Avg. F1-Score')
    ax.set_ylabel('Count')
    ax.grid(False)
    plt.savefig(os.path.join(fig_sub, f"cnn_{dataset}_hist.jpg"), dpi=300, bbox_inches='tight')
    plt.close()

    # Save the best model parameters for reuse
    os.makedirs(MODELS_DIR, exist_ok=True)
    best_net    = rs.best_estimator_.named_steps['clf']
    model_path  = os.path.join(MODELS_DIR, f"TextCNN42B_{dataset}.pt")
    best_net.save_params(f_params=model_path)
    print(f"Saved best model to {model_path}")

    return summary, cv_df, y_pred

#### Execution
We loop over every training split, run `run_search_cnn`, and append predictions for downstream comparison.

In [None]:
# Load validation data
df_val = pd.read_csv(VAL_PATH)

# Create the export directories up front so a long search cannot fail at the final write
os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize containers for results and predictions
cnn_summaries   = []
cnn_cv_results  = []
model_preds     = {}

# Loop through different training files.
# glob order depends on the filesystem; sorting keeps the order of rows in the summary
# table and of prediction columns identical across machines and reruns.
for train_csv in sorted(glob.glob(TRAIN_GLOB)):
    base = os.path.splitext(os.path.basename(train_csv))[0]
    parts = base.split('_')  # parts == ["demo","app","updates","train","real","2000"]
    split_type = parts[-2]   # "real" or "equal"
    size       = parts[-1]   # "2000", etc.
    print(f"[CNN] TextCNN42B on {train_csv}")

    summary, cv_df, y_pred = run_search_cnn(train_csv, df_val, GLOVE_42B_PATH)
    cnn_summaries.append(summary)
    cnn_cv_results.append(cv_df)
    key = f"TextCNN42B__{split_type}__{size}"
    model_preds[key] = y_pred

# Combine all summary metrics into a single DataFrame and export to CSV
cnn_df = pd.DataFrame(cnn_summaries)
cnn_df.to_csv(
    os.path.join(TABLE_DIR, "cnn_textcnn42b_all_datasets_summary.csv"),
    index=False
)

# Merge CNN predictions into the shared predictions file. If no earlier pipeline has
# produced that file yet, start from the plain validation data instead of crashing
# after the whole search has finished.
out_path = os.path.join(OUTPUT_DIR, 'validation_with_model_preds_NLP.csv')
if os.path.exists(out_path):
    val_with_glove_preds = pd.read_csv(out_path)
else:
    val_with_glove_preds = df_val.copy()
for key, preds in model_preds.items():
    colname = f"{key}_pred"
    val_with_glove_preds[colname] = preds
val_with_glove_preds.to_csv(out_path, index=False)

## Pre-Trained Language Models
We fine-tune four transformer checkpoints (`BERT`, `RoBERTa`, `XLNet`, `ELECTRA`) on each train split.
Hyper-parameters are tuned with **Optuna** (`NUM_TRIALS = 5`) and the best model is evaluated on the hold-out validation set.

In [None]:
# Hugging Face checkpoint identifier for each short model key
MODEL_CHECKPOINTS = dict(
    xlnet="xlnet-base-cased",
    roberta="roberta-base",
    electra="google/electra-base-discriminator",
    bert="bert-base-cased",
)

# Keyword arguments shared by every fine-tuning run
BASE_ARGS = dict(
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Restore the best epoch at the end, where "best" = highest macro-F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Same seed as the classical pipelines.
    seed=RANDOM_STATE,
    # Log every 50 steps.
    logging_steps=50,
    # Cap the number of checkpoints kept on disk.
    save_total_limit=1,
    # Cleaner notebook output.
    disable_tqdm=True,
    # No external experiment trackers.
    report_to=[],
)

# Number of Optuna trials per checkpoint
NUM_TRIALS = 5

#### Helper: Load CSVs as HuggingFace Datasets
Zero-bases the labels so they align with `transformers` expectations.

In [None]:
def load_datasets(train_csv, val_csv):
    """Read both CSVs and return them as a ``(train, validation)`` pair of Hugging Face Datasets.

    Each dataset keeps only two columns: ``whats_new`` (text) and ``label``, where
    ``label`` is ``update_classification`` shifted down by one.
    """
    def _as_dataset(csv_path):
        frame = pd.read_csv(csv_path)
        frame['label'] = frame['update_classification'] - 1
        return Dataset.from_pandas(frame[['whats_new', 'label']])

    return _as_dataset(train_csv), _as_dataset(val_csv)

#### Model Training and Hyperparameter Search
`run_search_transformer` performs an Optuna search, saves the best model, and returns metrics plus predictions.

In [None]:
def run_search_transformer(model_key, checkpoint, train_csv, val_csv):
    """
    Fine-tune a transformer checkpoint, tune HPs with Optuna, evaluate on hold-out.

    Args:
        model_key: Short model name used in output paths and in the summary.
        checkpoint: Hugging Face checkpoint identifier to fine-tune.
        train_csv: Path to the training CSV; its file stem names the split.
        val_csv: Path to the validation CSV used for per-epoch evaluation,
            trial selection and the final report.

    Returns:
        A tuple ``(summary, trials_df, y_pred)``: a dict of metrics and the
        best hyper-parameters, the Optuna trials DataFrame (with an
        ``is_best`` flag) and the zero-based predicted labels for ``val_csv``.

    Side effects:
        Writes per-trial checkpoints and the best model/tokenizer below
        ``MODELS_DIR`` and the trials table to ``TABLE_DIR``.
    """
    split_name = os.path.basename(train_csv).rsplit('.', 1)[0]
    train_ds, val_ds = load_datasets(train_csv, val_csv)

    # Size the classification head from the largest label id in either split.
    # Counting distinct training labels under-sizes the head whenever a small
    # training split lacks a class, leaving higher label ids out of range.
    num_labels = int(max(max(train_ds["label"]), max(val_ds["label"]))) + 1

    tok = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

    # Tokenization: every example is truncated/padded to exactly 512 tokens
    def preprocess(batch):
        toks = tok(batch["whats_new"], truncation=True, padding="max_length", max_length=512)
        toks["labels"] = batch["label"]
        return toks

    train_ds = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
    val_ds   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

    def compute_metrics(p):
        """Macro-F1, weighted-F1 and accuracy from the logits of one evaluation pass."""
        y_hat = np.argmax(p.predictions, axis=1)
        return {
            "f1_macro": f1_score(p.label_ids, y_hat, average="macro"),
            "f1_weighted": f1_score(p.label_ids, y_hat, average="weighted"),
            "accuracy": accuracy_score(p.label_ids, y_hat),
        }

    # Optuna setup (sampler referenced through the `optuna` namespace so the
    # function does not depend on a separate `TPESampler` import)
    sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
    study   = optuna.create_study(direction="maximize", sampler=sampler)

    def objective(trial):
        # Reset PyTorch RNGs for reproducibility
        torch.manual_seed(RANDOM_STATE)
        torch.cuda.manual_seed_all(RANDOM_STATE)

        # Set hyperparameters
        lr     = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)  # Learning rate, sampled on a log scale.
        bs     = trial.suggest_categorical("batch_size", [8, 16])            # Per-device batch size.
        epochs = trial.suggest_int("num_train_epochs", 3, 5)                 # Number of training epochs.
        wd     = trial.suggest_float("weight_decay", 0.0, 0.01)              # Weight decay.

        # Each trial checkpoints into its own directory
        trial_dir = os.path.join(MODELS_DIR, f"{model_key}_{split_name}", f"trial_{trial.number}")
        os.makedirs(trial_dir, exist_ok=True)

        args = TrainingArguments(
            output_dir=trial_dir,
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epochs,
            weight_decay=wd,
            **BASE_ARGS,
        )

        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        # Remember where the best epoch of this trial was checkpointed
        trial.set_user_attr("best_ckpt", trainer.state.best_model_checkpoint)

        # Clean up GPU mem
        del trainer, model
        torch.cuda.empty_cache()

        return metrics["eval_f1_macro"]

    # Run Optuna
    study.optimize(objective, n_trials=NUM_TRIALS)

    # Save all trial results + best flag
    best = study.best_trial
    trials_df = study.trials_dataframe()
    trials_df["is_best"] = trials_df["number"] == best.number
    trials_out = os.path.join(TABLE_DIR, f"{model_key}_{split_name}_optuna_trials.csv")
    trials_df.to_csv(trials_out, index=False)
    print(f"Saved {len(trials_df)} trials to {trials_out} (best trial = {best.number})")

    # Reload & save best model centrally
    best_ckpt = best.user_attrs["best_ckpt"]
    master_dir = os.path.join(MODELS_DIR, f"{model_key}_{split_name}_best")
    os.makedirs(master_dir, exist_ok=True)

    best_model = AutoModelForSequenceClassification.from_pretrained(best_ckpt)
    best_model.save_pretrained(master_dir)
    # Save the tokenizer that was actually used for preprocessing instead of
    # loading a second copy from the checkpoint.
    tok.save_pretrained(master_dir)

    # Compute per-label F1 on hold-out.
    # No tokenizer is handed to this Trainer: all inputs were already padded
    # to max_length above, exactly as for the training Trainer. The logging
    # switches mirror BASE_ARGS so this pass does not enable external loggers.
    eval_trainer = Trainer(
        model=best_model,
        args=TrainingArguments(
            output_dir=master_dir,
            per_device_eval_batch_size=best.params["batch_size"],
            disable_tqdm=BASE_ARGS["disable_tqdm"],
            report_to=BASE_ARGS["report_to"],
        ),
    )
    preds_out = eval_trainer.predict(val_ds)
    y_pred    = preds_out.predictions.argmax(-1)
    y_true    = preds_out.label_ids
    rpt = classification_report(y_true, y_pred, labels=list(range(7)), output_dict=True, zero_division=0)

    # Compile summary dictionary with performance metrics and best parameters.
    # NOTE(review): "best_cv_macro_f1" holds the best trial's macro-F1 on
    # `val_csv` (the set that is also used for the hold-out metrics below),
    # not a cross-validated score. The key is kept for downstream tables.
    summary = {
        "dataset":              split_name,
        "model":                model_key,
        "best_cv_macro_f1":     best.value,
        "best_params":          json.dumps(best.params),
        "holdout_accuracy":     accuracy_score(y_true, y_pred),
        "holdout_macro_f1":     rpt["macro avg"]["f1-score"],
        "holdout_weighted_f1":  rpt["weighted avg"]["f1-score"],
    }
    # Per-class entries of the report are keyed by the label id as a string
    for lbl, m in rpt.items():
        if lbl.isdigit():
            summary[f"label_{lbl}_f1"] = m["f1-score"]

    return summary, trials_df, y_pred

#### Execution
Loop over every **train split × checkpoint** combination, run `run_search_transformer`, and append predictions for downstream comparison.

In [None]:
# Containers for per-run summary dicts and hold-out predictions
all_results = []
model_preds = {}

# Loop through different training files and transformer checkpoints.
# glob.glob() returns matches in arbitrary order, so sort them to keep the
# run order, the summary rows and the prediction columns reproducible.
for train_csv in sorted(glob.glob(TRAIN_GLOB)):
    # Extract split_type and size from the filename
    base       = os.path.splitext(os.path.basename(train_csv))[0]
    parts      = base.split('_')       # e.g. ["demo","app","updates","train","real","2000"]
    split_type = parts[-2]             # "real" or "equal"
    size       = parts[-1]             # "2000", etc.

    for model_key, checkpoint in MODEL_CHECKPOINTS.items():
        print(f"[FT] {model_key} on {split_type} n={size}")
        summary, trials_df, y_pred = run_search_transformer(
            model_key,
            checkpoint,
            train_csv,
            VAL_PATH
        )
        all_results.append(summary)
        # One prediction vector per (model, split type, size) combination
        key = f"{model_key}_{split_type}_{size}"
        model_preds[key] = y_pred

# Combine all summary metrics into a single DataFrame and export to CSV
df = pd.DataFrame(all_results)
df.to_csv(os.path.join(TABLE_DIR, "transformers_optuna_summary.csv"), index=False)

# Merge PLM predictions into the existing output file (read, add one
# "<key>_pred" column per run, write back to the same path)
preds_path = os.path.join(OUTPUT_DIR, 'validation_with_model_preds_NLP.csv')
val_with_preds = pd.read_csv(preds_path)
for key, preds in model_preds.items():
    val_with_preds[f"{key}_pred"] = preds
val_with_preds.to_csv(preds_path, index=False)