## Setup
We use a high-performing framework specification as a teacher (GPT-4o-mini) to generate labels for progressively larger sets (n = 10,000 and n = 20,000) of additional release notes. We then train and tune a set of student models, including classical ML baselines and smaller transformer-based models, on these pseudo-labeled datasets and evaluate them on the same expert-labeled holdout validation set (n = 1,000) used throughout the study. To ensure compatibility with scikit-learn, labels were zero-indexed.

#### Imports
 See `requirements.txt` for full dependency versions

In [None]:
import os
import glob
import json
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch
import torch.nn as nn
import optuna
import sys
sys.path.insert(0, os.path.abspath(os.path.join("..", "src")))

from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from glove_vectorizer import GloveVectorizer
from skorch import NeuralNetClassifier
from textcnn import TorchTokenizer, TextCNN
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from optuna.samplers import TPESampler

#### Global Paths, Directories, Variables, and Classifier Instances

In [None]:
# Define Demo Study path
# NOTE(review): the path component is the literal string "..." (three dots), whereas the import cell
# uses ".." for the parent directory -- presumably a placeholder for the project root; confirm before running.
DEMO_PATH   = os.path.abspath(os.path.join("..."))

# Define relevant paths (all derived from DEMO_PATH)
# NOTE: TRAIN_GLOB contains no wildcard, so glob.glob(TRAIN_GLOB) matches at most this single file.
TRAIN_GLOB =os.path.join(DEMO_PATH,'training_validation_data', 'sample_10000_openai_for_ml.csv') # adjust sample size file if needed
VAL_PATH   = os.path.join(DEMO_PATH,'training_validation_data', 'demo_app_updates_validation_real_1000.csv')
OUTPUT_DIR = os.path.join(DEMO_PATH,'output_data')
TABLE_DIR  = os.path.join(DEMO_PATH,'tables')
FIG_DIR    = os.path.join(DEMO_PATH,'figures')
MODELS_DIR = os.path.join(DEMO_PATH,'models')

# Define parallelism structure (note: INNER_JOBS * OUTER_JOBS < max cores)
# INNER_JOBS is passed to XGBClassifier(n_jobs=...); OUTER_JOBS is passed to RandomizedSearchCV(n_jobs=...).
INNER_JOBS = 5
OUTER_JOBS = 10

# CV & reproducibility
RANDOM_STATE = 94032
CV     = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)  # 3 shuffled, stratified folds
SCORER = make_scorer(f1_score, average="macro")                                # model selection metric: macro-averaged F1

# Define classifier instances (keys must match those of CLASSIFIER_PARAM_TEMPLATES below)
CLASSIFIERS = {
    "NaiveBayes":        MultinomialNB(),
    "SVM":               SVC(random_state=RANDOM_STATE, probability=True),
    "XGBoost":           XGBClassifier(random_state=RANDOM_STATE, n_jobs=INNER_JOBS, eval_metric="mlogloss"),
}

# Define hyperparameter ranges for each classifier
# The "clf__" prefix addresses the pipeline step named "clf".
# Reminder: scipy's uniform(loc, scale) samples from [loc, loc + scale]; randint(low, high) excludes `high`.
CLASSIFIER_PARAM_TEMPLATES = {
    # NaiveBayes hyperparameters:
    "NaiveBayes": {
        "clf__alpha"    : uniform(1e-6, 1.0),           # Additive smoothing, sampled from [1e-6, 1 + 1e-6] (i.e. roughly 0-1).
        "clf__fit_prior": [True, False],                # Learn class priors or assume uniform.  Testing both handles potential label imbalance.
    },

    # SVM hyperparameters:
    "SVM": {
        "clf__C"     : uniform(0.1, 10),                # Soft-margin cost, sampled from [0.1, 10.1].  Balances margin width vs. mis-classification tolerance.
        "clf__kernel": ["linear", "rbf", "poly"],       # Kernels: linear (fast for TF-IDF), RBF & poly for non-linear patterns.
        "clf__gamma" : ["scale", "auto"],               # Kernel coefficient heuristics.  Both common; effect only for RBF/poly.
        "clf__degree": randint(2, 6),                   # Polynomial degree (2-5).  Higher degree leads to more complex decision surface (poly kernel only).
    },

    # XGBoost hyperparameters:
    "XGBoost": {
        "clf__n_estimators"     : randint(50, 501),     # Boosting rounds (50-500).  More rounds improve fit but risk overfitting & longer training.
        "clf__max_depth"        : randint(3, 11),       # Tree depth (3-10).  Shallow trees reduce overfitting on sparse high-dim. text features.
        "clf__learning_rate"    : uniform(0.01, 0.3),   # Shrinkage, sampled from [0.01, 0.31].  Lower LR needs more trees but can yield better generalisation.
        "clf__subsample"        : uniform(0.5, 0.5),    # Row subsampling fraction, sampled from [0.5, 1.0].  Encourages diversity among trees; mitigates overfitting.
        "clf__colsample_bytree" : uniform(0.5, 0.5),    # Column subsampling fraction, sampled from [0.5, 1.0].  Helpful with large TF-IDF vocabularies to speed up training.
    },
}

## Classical ML models

### TF-IDF Pipeline
To identify the optimal model configuration for update classification, we systematically tuned feature representations and classifier hyperparameters using an automated and reproducible pipeline via the scikit-learn (Pedregosa et al., 2011) and XGBoost  (Chen & Guestrin, 2016) libraries.
We used scikit-learn’s `RandomizedSearchCV` with 20 randomly sampled parameter sets per classifier (`n_iter=20` in the code below) for hyperparameter optimization. Feature representations and classifiers were tuned jointly.

In [None]:
# Search space for the TF-IDF step; the "tfidf__" prefix addresses the pipeline step named "tfidf".
# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale) covers [loc, loc + scale].
TFIDF_PARAMS = dict(
    tfidf__max_features=randint(2000, 10001),      # vocabulary cap: integers 2,000 .. 10,000
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],   # unigrams only, up to bigrams, or up to trigrams
    tfidf__min_df=randint(1, 5),                   # minimum document count per term: 1 .. 4
    tfidf__max_df=uniform(0.6, 0.4),               # maximum document share per term: 0.6 .. 1.0
)

# One joint search space per classifier: the shared TF-IDF entries first,
# then that classifier's own entries (which would win on a key clash).
PARAM_DISTS = {
    clf_key: dict(TFIDF_PARAMS, **CLASSIFIER_PARAM_TEMPLATES[clf_key])
    for clf_key in CLASSIFIERS
}

#### Model Training and Evaluation

The following function `run_search_tfidf` handles model training, hyperparameter search, and evaluation on a fixed validation set. Cross-validation results are visualized.

In [None]:
def run_search_tfidf(clf_name, train_path, df_val, n_iter=20):
    """
    Trains and tunes a model pipeline with TF-IDF and the specified classifier.

    Parameters
    ----------
    clf_name : str
        Key into CLASSIFIERS and PARAM_DISTS; also the name of the figure sub-folder.
    train_path : str
        CSV with a `whats_new` text column and a 1-based `update_classification` column.
    df_val : pandas.DataFrame
        Hold-out data with the same two columns.
    n_iter : int, default 20
        Number of parameter settings sampled by RandomizedSearchCV.

    Returns
    -------
    summary : dict
        Best CV score, best parameters (JSON string) and hold-out metrics,
        including one `label_<k>_f1` entry per zero-based label in the report.
    cv_df : pandas.DataFrame
        `cv_results_` of the search with an added `dataset` column.
    y_pred : array-like
        Zero-based predictions of the refitted best pipeline on `df_val`.

    Side effects
    ------------
    Writes a histogram of the mean CV scores to FIG_DIR/<clf_name>/.
    """
    dataset_name = os.path.splitext(os.path.basename(train_path))[0]
    fig_sub = os.path.join(FIG_DIR, clf_name)
    os.makedirs(fig_sub, exist_ok=True)

    # Load training data (labels are shifted to start at 0)
    df_train = pd.read_csv(train_path)
    y_train = df_train['update_classification'] - 1
    X_train = df_train['whats_new']

    # Extract target and features from the preloaded validation DataFrame
    y_val = df_val['update_classification'] - 1
    X_val = df_val['whats_new']

    # Build sklearn pipeline: TF-IDF vectorizer followed by the selected classifier
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", CLASSIFIERS[clf_name]),
    ])

    # Perform randomized search over TF-IDF and classifier parameters using cross-validation
    rs = RandomizedSearchCV(
        pipe,
        param_distributions=PARAM_DISTS[clf_name],
        n_iter=n_iter,
        scoring=SCORER,
        cv=CV,
        n_jobs=OUTER_JOBS,
        random_state=RANDOM_STATE,
        verbose=1,
        refit=True
    )
    rs.fit(X_train, y_train)

    # Extract best CV score and optimal parameter combination
    # (.item() turns NumPy scalars into plain Python numbers so json.dumps accepts them)
    best_cv = rs.best_score_
    best_params = {k: (v.item() if hasattr(v, "item") else v)
                   for k, v in rs.best_params_.items()}

    # Evaluate best model on the hold-out validation set
    y_pred = rs.predict(X_val)

    # Generate classification report and summarize key hold-out metrics
    report = classification_report(
        y_val, y_pred,
        output_dict=True,
        zero_division=0
    )
    hold_macro = report['macro avg']['f1-score']
    hold_wgtd = report['weighted avg']['f1-score']
    hold_acc = accuracy_score(y_val, y_pred)

    # Compile summary dictionary with performance metrics and best parameters
    summary = {
        'dataset': dataset_name,
        'model': clf_name,
        'best_cv_macro_f1': best_cv,
        'best_params': json.dumps(best_params),
        'holdout_macro_f1': hold_macro,
        'holdout_weighted_f1': hold_wgtd,
        'holdout_accuracy': hold_acc
    }
    # Per-class entries have digit-only keys; the aggregate rows ('macro avg', ...) are skipped
    for label, m in report.items():
        if label.isdigit():
            summary[f'label_{label}_f1'] = m['f1-score']

    # Plot histogram of mean CV scores from the search results
    cv_df = pd.DataFrame(rs.cv_results_)
    cv_df['dataset'] = dataset_name

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(cv_df['mean_test_score'], bins=10, color='grey', edgecolor='white')
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.set_xlabel('Mean (CV) Macro Avg. F1-Score')
    ax.set_ylabel('Count')
    ax.grid(False)

    # Prefix the file with the sample size found in the training file name
    # (previously a fixed "20000", which mislabelled runs on other sample sizes).
    size_tag = next((tok for tok in dataset_name.split('_') if tok.isdigit()), None)
    hist_name = f"tfidf_{dataset_name}_histogram.jpg"
    if size_tag is not None:
        hist_name = f"{size_tag}_{hist_name}"

    # Save and close through the figure object rather than pyplot's "current figure"
    fig.savefig(os.path.join(fig_sub, hist_name), dpi=300, bbox_inches='tight')
    plt.close(fig)

    return summary, cv_df, y_pred

#### Execution
We loop over all available training splits and classifiers, optimizing each configuration. Results and predictions are saved for further analysis.


In [None]:
# Load validation data
df_val = pd.read_csv(VAL_PATH)

# Make sure the export folders exist before anything is written to them
os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize containers for results and predictions
all_summaries = []
cv_results_by_model = {name: [] for name in CLASSIFIERS}
model_preds = {}
train_sizes = []

# Fail early instead of silently exporting empty tables when no training file is found
train_files = sorted(glob.glob(TRAIN_GLOB))
if not train_files:
    raise FileNotFoundError(f"No training file matches {TRAIN_GLOB!r}")

# Loop through different training files and classifiers
for train_csv in train_files:
    base = os.path.splitext(os.path.basename(train_csv))[0]
    parts = base.split('_')

    # Sample size = last all-digit token of the file name ("NA" if there is none).
    numeric_parts = [p for p in parts if p.isdigit()]
    size = numeric_parts[-1] if numeric_parts else "NA"

    # Two naming schemes are handled:
    #   "demo_app_updates_train_real_2000" -> split_type "real", size "2000"
    #   "sample_10000_openai_for_ml"       -> split_type "sample-openai-for-ml", size "10000"
    if len(parts) > 1 and parts[-1].isdigit():
        split_type = parts[-2]
    else:
        split_type = "-".join(p for p in parts if not p.isdigit()) or "NA"
    train_sizes.append(size)

    for name in CLASSIFIERS:
        print(f"Running {name} on {train_csv}")
        summary, cv_df, y_pred = run_search_tfidf(name, train_csv, df_val)
        all_summaries.append(summary)
        cv_results_by_model[name].append(cv_df)
        # Store predictions for this model and dataset combination
        key = f"tfidf__{split_type}__{size}__{name}"
        model_preds[key] = y_pred

# File prefix derived from the training data actually used (e.g. "10000"),
# so the exports stay in sync with TRAIN_GLOB instead of a hand-edited constant.
run_tag = "_".join(sorted(set(train_sizes)))

# Combine all summary metrics into a single DataFrame and export to CSV
combined_df = pd.DataFrame(all_summaries)
combined_df.to_csv(
    os.path.join(TABLE_DIR, f'{run_tag}_tfidf_all_models_all_datasets_summary.csv'),
    index=False
)

# Append model predictions to the validation set and export results
val_with_preds = df_val.copy()
for key, preds in model_preds.items():
    colname = f"{key}_pred"
    val_with_preds[colname] = preds
val_with_preds.to_csv(
    os.path.join(OUTPUT_DIR, f'{run_tag}_validation_with_model_preds_NLP.csv'),
    index=False
)

## Pre-Trained Language Models
We fine-tune two transformer checkpoints (`BERT` & `XLNet`) on each train split.
Hyper-parameters are tuned with **Optuna** (`NUM_TRIALS = 1`) and the best model is evaluated on the holdout validation set. To keep this extension efficient compared to the specification of the main article, we limit Optuna trials to 1.

In [None]:
# Transformer backbones to fine-tune: short key -> Hugging Face checkpoint name
MODEL_CHECKPOINTS = dict(
    xlnet="xlnet-base-cased",
    bert="bert-base-cased",
)

# Keyword arguments forwarded unchanged to every TrainingArguments built during the search
BASE_ARGS = dict(
    eval_strategy="epoch",              # run evaluation at the end of each epoch
    save_strategy="epoch",              # write a checkpoint at the end of each epoch
    load_best_model_at_end=True,        # reload the best-scoring checkpoint when training finishes
    metric_for_best_model="f1_macro",   # "best" is judged by macro-F1 (all classes weighted equally)
    greater_is_better=True,             # a higher macro-F1 is better
    seed=RANDOM_STATE,
    logging_steps=50,                   # log every 50 optimisation steps
    save_total_limit=1,                 # cap on retained checkpoints; per the transformers docs the best one
                                        # is always kept with load_best_model_at_end, so best + last may both remain
    disable_tqdm=True,                  # no progress bars in the notebook output
    report_to=[],                       # no external experiment trackers
)

# Number of Optuna trials run for each checkpoint
NUM_TRIALS = 1

#### Helper: Load CSVs as HuggingFace Datasets
Zero-bases the labels so they align with `transformers` expectations.

In [None]:
def load_datasets(train_csv, val_csv):
    """Read both CSVs and return a (train, validation) pair of Hugging Face Datasets.

    Each Dataset is built from two columns only: the `whats_new` text and a
    `label` column computed as `update_classification` minus one.
    """
    wanted = ['whats_new', 'label']
    converted = []
    for csv_path in (train_csv, val_csv):
        frame = pd.read_csv(csv_path)
        # Shift the labels by one so they start at 0
        frame['label'] = frame['update_classification'] - 1
        converted.append(Dataset.from_pandas(frame[wanted]))
    return tuple(converted)

#### Model Training and Hyperparameter Search
`run_search_transformer` performs an Optuna search, saves the best model, and returns metrics plus predictions.

In [None]:
def run_search_transformer(model_key, checkpoint, train_csv, val_csv):
    """
    Fine-tune a transformer checkpoint, tune HPs with Optuna, evaluate on hold-out.
    Returns summary dict, Optuna trials DataFrame, and validation predictions.

    Parameters
    ----------
    model_key : str
        Short model name used in output paths and in the summary's "model" field.
    checkpoint : str
        Identifier passed to the `from_pretrained` loaders.
    train_csv, val_csv : str
        CSV paths handed to `load_datasets`.

    Notes
    -----
    Every Optuna trial is scored on the dataset built from `val_csv`, which is
    also the data the final report is computed on. The summary field
    "best_cv_macro_f1" therefore holds that hold-out macro-F1 (no
    cross-validation is run here); the key name mirrors the TF-IDF summaries.

    Side effects: writes trial checkpoints and the best model under MODELS_DIR
    and the Optuna trial table under TABLE_DIR.
    """
    split_name = os.path.basename(train_csv).rsplit('.', 1)[0]
    train_ds, val_ds = load_datasets(train_csv, val_csv)

    # Size the classification head from the largest zero-based label id in either split.
    # Counting distinct training labels would be too small whenever a class id is
    # missing from the training data.
    label_ids = set(train_ds["label"]) | set(val_ds["label"])
    num_labels = max(label_ids) + 1

    tok = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

    # Tokenization (every text is truncated/padded to 512 tokens)
    def preprocess(batch):
        toks = tok(batch["whats_new"], truncation=True, padding="max_length", max_length=512)
        toks["labels"] = batch["label"]
        return toks

    train_ds = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
    val_ds   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

    # Metrics reported at every evaluation; argmax over the logits is computed once
    def compute_metrics(eval_pred):
        y_hat = np.argmax(eval_pred.predictions, axis=-1)
        y_ref = eval_pred.label_ids
        return {
            "f1_macro": f1_score(y_ref, y_hat, average="macro"),
            "f1_weighted": f1_score(y_ref, y_hat, average="weighted"),
            "accuracy": accuracy_score(y_ref, y_hat),
        }

    # Optuna setup
    sampler = TPESampler(seed=RANDOM_STATE)
    study   = optuna.create_study(direction="maximize", sampler=sampler)

    def objective(trial):
        # Reset PyTorch RNGs for reproducibility
        torch.manual_seed(RANDOM_STATE)
        torch.cuda.manual_seed_all(RANDOM_STATE)

        # Set hyperparameters
        lr     = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)  # Learning rate (1e-5–5e-5), log-scale search.
        bs     = trial.suggest_categorical("batch_size", [8, 16])            # Batch size 8 or 16.
        epochs = trial.suggest_int("num_train_epochs", 3, 5)                 # Training epochs (3–5).
        wd     = trial.suggest_float("weight_decay", 0.0, 0.01)              # Weight decay (0–0.01).

        trial_dir = os.path.join(MODELS_DIR, f"{model_key}_{split_name}", f"trial_{trial.number}")
        os.makedirs(trial_dir, exist_ok=True)

        args = TrainingArguments(
            output_dir=trial_dir,
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epochs,
            weight_decay=wd,
            **BASE_ARGS,
        )

        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        # Remember where the best checkpoint of this trial lives so it can be reloaded later
        trial.set_user_attr("best_ckpt", trainer.state.best_model_checkpoint)

        # Clean up GPU mem
        del trainer, model
        torch.cuda.empty_cache()

        return metrics["eval_f1_macro"]

    # Run Optuna
    study.optimize(objective, n_trials=NUM_TRIALS)
    best = study.best_trial

    # Save all trial results + best flag
    trials_df = study.trials_dataframe()
    trials_df["is_best"] = trials_df["number"] == best.number
    trials_out = os.path.join(TABLE_DIR, f"{model_key}_{split_name}_optuna_trials.csv")
    trials_df.to_csv(trials_out, index=False)
    print(f"Saved {len(trials_df)} trials to {trials_out} (best trial = {best.number})")

    # Reload & save best model centrally
    best_ckpt = best.user_attrs["best_ckpt"]
    master_dir = os.path.join(MODELS_DIR, f"{model_key}_{split_name}_best")
    os.makedirs(master_dir, exist_ok=True)

    best_model = AutoModelForSequenceClassification.from_pretrained(best_ckpt)
    best_tok = AutoTokenizer.from_pretrained(checkpoint)
    best_model.save_pretrained(master_dir)
    best_tok.save_pretrained(master_dir)

    # Compute per-label F1 on hold-out
    # The evaluation-only Trainer reuses the logging settings of the training runs
    # (no external trackers, no progress bars).
    # NOTE(review): recent transformers releases prefer `processing_class=` over
    # `tokenizer=` -- confirm against the version pinned in requirements.txt.
    eval_trainer = Trainer(
        model=best_model,
        args=TrainingArguments(
            output_dir=master_dir,
            per_device_eval_batch_size=best.params["batch_size"],
            report_to=BASE_ARGS["report_to"],
            disable_tqdm=BASE_ARGS["disable_tqdm"],
        ),
        tokenizer=best_tok,
    )
    preds_out = eval_trainer.predict(val_ds)
    y_pred    = preds_out.predictions.argmax(-1)
    y_true    = preds_out.label_ids
    # Report every class of the label space, including classes that never occur in y_true / y_pred
    rpt = classification_report(
        y_true, y_pred,
        labels=list(range(num_labels)),
        output_dict=True,
        zero_division=0,
    )

    # Compile summary dictionary with performance metrics and best parameters
    summary = {
        "dataset":              split_name,
        "model":                model_key,
        "best_cv_macro_f1":     best.value,
        "best_params":          json.dumps(best.params),
        "holdout_accuracy":     accuracy_score(y_true, y_pred),
        "holdout_macro_f1":     rpt["macro avg"]["f1-score"],
        "holdout_weighted_f1":  rpt["weighted avg"]["f1-score"],
    }
    # Per-class entries have digit-only keys; aggregate rows are skipped
    for lbl, m in rpt.items():
        if lbl.isdigit():
            summary[f"label_{lbl}_f1"] = m["f1-score"]

    return summary, trials_df, y_pred

#### Execution
Loop over every **train split × checkpoint** combination, run `run_search_transformer`, and append predictions for downstream comparison.

In [None]:
# Initialize containers for results and predictions
all_results = []
model_preds = {}

# Loop through different training files and transformer checkpoints
for train_csv in glob.glob(TRAIN_GLOB):
    # Extract split_type and size from the filename
    base       = os.path.splitext(os.path.basename(train_csv))[0]
    parts      = base.split('_')       # e.g. ["demo","app","updates","train","real","2000"]
    split_type = parts[-2]             # "real" or "equal"
    size       = parts[-1]             # "2000", etc.

    for model_key, checkpoint in MODEL_CHECKPOINTS.items():
        print(f"[FT] {model_key} on {split_type} n={size}")
        summary, trials_df, y_pred = run_search_transformer(
            model_key,
            checkpoint,
            train_csv,
            VAL_PATH
        )
        all_results.append(summary)
        key = f"{model_key}_{split_type}_{size}"
        model_preds[key] = y_pred

# Combine all summary metrics into a single DataFrame and export to CSV
df = pd.DataFrame(all_results)
df.to_csv(os.path.join(TABLE_DIR, "10000_transformers_optuna_summary.csv"), index=False)

# Merge PLM predictions into the existing output file
val_with_preds = pd.read_csv(os.path.join(OUTPUT_DIR, '10000_validation_with_model_preds_NLP.csv'))
for key, preds in model_preds.items():
    val_with_preds[f"{key}_pred"] = preds
val_with_preds.to_csv(
    os.path.join(OUTPUT_DIR, '10000_validation_with_model_preds_NLP.csv'), index=False)