In [1]:
from pathlib import Path

import datasets
import matplotlib.pyplot as plt
import numpy as np
import umap
from matplotlib import rc

from nukelm.analyze.BERTopic import BERTopic
from nukelm.analyze.umap_comparisons import PLOT_KWARGS, UMAP_KWARGS, plot_points


# Project root is taken to be the parent of the kernel's working directory.
PROJECT_DIR = Path.cwd().parent
# Every figure written by this notebook is saved under this directory.
output_dir = PROJECT_DIR / "data" / "08_reporting" / "bertopic"
# parents=True so a fresh checkout without data/08_reporting does not raise
# FileNotFoundError; exist_ok=True keeps re-runs idempotent.
output_dir.mkdir(parents=True, exist_ok=True)

# Name of the dataset column that is fed to UMAP and BERTopic below.
AGG_METHOD = "CLS"

ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject

In [None]:
# Both saved datasets live side by side under data/07_model_output.
model_output_dir = PROJECT_DIR / "data" / "07_model_output"
# Outputs of the model with continued pre-training.
dataset_trained_1 = datasets.load_from_disk(str(model_output_dir / "roberta-large-trained-1"))
# Outputs of the off-the-shelf model.
dataset_ots_1 = datasets.load_from_disk(str(model_output_dir / "roberta-large-ots-1"))

In [None]:
# Fit a UMAP reducer on the AGG_METHOD column of the continued-pre-training dataset.
umap_trained_1 = umap.UMAP(**UMAP_KWARGS)
mapper_trained_1 = umap_trained_1.fit(dataset_trained_1[AGG_METHOD])

In [None]:
# Fit a separate UMAP reducer on the AGG_METHOD column of the off-the-shelf dataset.
umap_ots_1 = umap.UMAP(**UMAP_KWARGS)
mapper_ots_1 = umap_ots_1.fit(dataset_ots_1[AGG_METHOD])

In [None]:
# Project each dataset through the mapper that was fitted on that same dataset.
points_trained_1, points_ots_1 = [
    mapper.transform(dataset[AGG_METHOD])
    for mapper, dataset in (
        (mapper_trained_1, dataset_trained_1),
        (mapper_ots_1, dataset_ots_1),
    )
]

# BERTopic applied to model with continued pre-training


In [None]:
# Keyword arguments passed unchanged to the BERTopic constructor in the next cell.
BERTOPIC_KWARGS = dict(
    n_neighbors=15,
    n_components=100,
    min_dist=0.1,
    umap_metric="euclidean",
    random_state=42,
    min_cluster_size=25,
    min_samples=None,
    cluster_selection_epsilon=0.0,
    hdbscan_metric="euclidean",
    alpha=1.0,
    cluster_selection_method="eom",
    verbose=True,
)

In [None]:
# Fit BERTopic on the continued-pre-training dataset: raw text plus the
# AGG_METHOD column as an array. Only the first element of the returned pair
# (the per-document labels) is kept.
model_trained_1 = BERTopic(**BERTOPIC_KWARGS)
labels_trained_1, _ = model_trained_1.fit_transform(
    dataset_trained_1["text"],
    np.array(dataset_trained_1[AGG_METHOD]),
)
# Both panels of the figure below are coloured with these same labels.
labels_ots_1 = labels_trained_1

In [None]:
# Distinct labels seen in either sequence. A set union is used instead of
# `set(a + b)`: it does not build a concatenated copy, and it still gives the
# right answer if the labels are NumPy arrays (where `+` adds element-wise).
labels_set_trained_1 = set(labels_trained_1) | set(labels_ots_1)
# Display name for every cluster id from 0 up to the largest label seen.
label_map_trained_1 = {i: f"Cluster {i + 1: 2d}" for i in range(max(labels_set_trained_1) + 1)}
# Label -1 gets its own display name (negative labels are treated as outliers
# later in the notebook).
label_map_trained_1[-1] = "None"

In [None]:
# Show model_trained_1.get_topic(...) for every label, keyed by display name.
{
    ("None" if i < 0 else f"Cluster {i+1: 2d}"): model_trained_1.get_topic(i)
    for i in labels_set_trained_1
}

In [None]:
# Two-panel comparison coloured by the labels fitted on the continued-pre-training model.
titles_trained_1 = (r"\textsc{RoBERTa} Large + OSTI Pre-Training", r"\textsc{RoBERTa} Large")
fig_trained_1 = plot_points(
    (points_trained_1, points_ots_1),
    (labels_trained_1, labels_ots_1),
    titles_trained_1,
    label_map_trained_1,
    True,
    **PLOT_KWARGS,
)
trained_clusters_png = output_dir / "trained-clusters.png"
fig_trained_1.savefig(trained_clusters_png, dpi=300)

# BERTopic applied to model without continued pre-training

In [None]:
# Same BERTopic settings as for the continued-pre-training model, restated so
# this section can be read and run on its own.
BERTOPIC_KWARGS = dict(
    n_neighbors=15,
    n_components=100,
    min_dist=0.1,
    umap_metric="euclidean",
    random_state=42,
    min_cluster_size=25,
    min_samples=None,
    cluster_selection_epsilon=0.0,
    hdbscan_metric="euclidean",
    alpha=1.0,
    cluster_selection_method="eom",
    verbose=True,
)

In [None]:
# Fit BERTopic on the off-the-shelf dataset: raw text plus the AGG_METHOD
# column as an array. Only the first element of the returned pair is kept.
model_ots_1 = BERTopic(**BERTOPIC_KWARGS)
labels_ots_1, _ = model_ots_1.fit_transform(
    dataset_ots_1["text"],
    np.array(dataset_ots_1[AGG_METHOD]),
)
# Both panels of the figure below are coloured with the off-the-shelf labels.
# NOTE(review): this rebinds labels_trained_1, so every later cell that reads
# it sees the off-the-shelf labels, not the ones fitted on the trained model.
labels_trained_1 = labels_ots_1

In [None]:
# Distinct labels seen in either sequence. A set union is used instead of
# `set(a + b)`: it does not build a concatenated copy, and it still gives the
# right answer if the labels are NumPy arrays (where `+` adds element-wise).
labels_set_ots_1 = set(labels_trained_1) | set(labels_ots_1)
# Display name for every cluster id from 0 up to the largest label seen.
label_map_ots_1 = {i: f"Cluster {i + 1: 2d}" for i in range(max(labels_set_ots_1) + 1)}
# Label -1 gets its own display name (negative labels are treated as outliers
# later in the notebook).
label_map_ots_1[-1] = "None"

In [None]:
# Show model_ots_1.get_topic(...) for every label, keyed by display name.
{
    ("None" if i < 0 else f"Cluster {i+1: 2d}"): model_ots_1.get_topic(i)
    for i in labels_set_ots_1
}

In [None]:
# Two-panel comparison coloured by the labels fitted on the off-the-shelf model.
titles_ots_1 = (r"\textsc{RoBERTa} Large + OSTI Pre-Training", r"\textsc{RoBERTa} Large")
fig_ots_1 = plot_points(
    (points_trained_1, points_ots_1),
    (labels_trained_1, labels_ots_1),
    titles_ots_1,
    label_map_ots_1,
    True,
    **PLOT_KWARGS,
)
ots_clusters_png = output_dir / "ots-clusters.png"
fig_ots_1.savefig(ots_clusters_png, dpi=300)

In [None]:
# fig_1 = plot_points(
#     (points_trained_1, points_ots_1),
#     (_labels_trained_1, _labels_ots_1),
#     None,
# #     (r"\textsc{RoBERTa} Large + OSTI Pre-Training", r"\textsc{RoBERTa} Large"),
#     label_map_1,
#     False,
#     **PLOT_KWARGS
# )

In [None]:
# fig_1.axes[0].set_xlim(-2.5, 17.5)
# fig_1.axes[1].set_xlim(-2.5, 17.5)
# fig_1.axes[0].set_ylim(-2.5, 17.5)
# fig_1.axes[1].set_ylim(-2.5, 17.5)
# fig_1.axes[0].set_xticks([0, 5, 10, 15])
# fig_1.axes[0].set_yticks([0, 5, 10, 15])
# fig_1.axes[1].set_xticks([0, 5, 10, 15])
# fig_1.axes[1].set_yticks([0, 5, 10, 15])
# fig_1

In [None]:
# fig_1.axes[1].legend(loc='center left', bbox_to_anchor=(1, 0.5))

# fig_1

In [None]:
# fig_1.savefig("clusters.png", dpi=300, bbox_inches="tight")

In [None]:
# fig_1 = plot_points(
#     (points_trained_1, points_ots_1),
#     (_labels_trained_1, _labels_ots_1),
#     None,
# #     (r"\textsc{RoBERTa} Large + OSTI Pre-Training", r"\textsc{RoBERTa} Large"),
#     label_map_1,
#     False,
#     **PLOT_KWARGS
# )

In [None]:
# fig_1 = plot_points(
#     (points_trained_1, points_ots_1),
#     (dataset_trained_1["label"], dataset_ots_1["label"]),
#     (r"\textsc{RoBERTa} Large + OSTI Pre-Training", r"\textsc{RoBERTa} Large"),
#     None,
#     False,
#     **PLOT_KWARGS
# )

In [None]:
# fig_1.axes[0].set_xlim(-2.5, 17.5)
# fig_1.axes[1].set_xlim(-2.5, 17.5)
# fig_1.axes[0].set_ylim(-2.5, 17.5)
# fig_1.axes[1].set_ylim(-2.5, 17.5)
# fig_1.axes[0].set_xticks([0, 5, 10, 15])
# fig_1.axes[0].set_yticks([0, 5, 10, 15])
# fig_1.axes[1].set_xticks([0, 5, 10, 15])
# fig_1.axes[1].set_yticks([0, 5, 10, 15])

# fig_1.axes[1].legend(loc='center left', bbox_to_anchor=(1, 0.5))

# fig_1

In [None]:
# fig_1.savefig("umap.png", dpi=300, bbox_inches="tight")

# Final plot for publication

In [None]:
# Legend text for the two values of the dataset "label" column.
LABEL_MAP = dict(
    [
        ("nuke", "NFC-Related"),
        ("not-nuke", "Other"),
    ]
)
# Marker-only styling for the final figure.
# NOTE(review): this rebinds PLOT_KWARGS, which the first cell imports from
# nukelm.analyze.umap_comparisons.
PLOT_KWARGS = dict(linestyle="None", marker=".", alpha=0.5)

In [None]:
# Replace every negative label with the large sentinel int(1e5); non-negative
# labels pass through unchanged. Presumably this makes the outlier class sort
# last when the labels are ordered for plotting.
_labels_trained_1 = [int(1e5) if label < 0 else label for label in labels_trained_1]
_labels_ots_1 = [int(1e5) if label < 0 else label for label in labels_ots_1]

In [None]:
# Sentinel that the previous cell substitutes for negative labels.
outlier_key = int(1e5)
# Distinct labels across both sequences.
labels_set_1 = set(_labels_trained_1) | set(_labels_ots_1)
# Size the name table from the real cluster ids only. Taking max() over the
# whole set would include the sentinel and create about 100001 entries as soon
# as one outlier exists. default=-1 gives an empty table if no real cluster is
# present.
largest_cluster_1 = max(labels_set_1 - {outlier_key}, default=-1)
label_map_1 = {i: f"Cluster {i + 1: 2d}" for i in range(largest_cluster_1 + 1)}

label_map_1[outlier_key] = "Outlier"

In [None]:
# Inputs for the 2x2 figure, with the off-the-shelf model at index 0 and the
# continued-pre-training model at index 1.
points = (points_ots_1, points_trained_1)
# labels[0] holds the dataset "label" column, labels[1] the remapped BERTopic labels.
labels = (
    (dataset_ots_1["label"], dataset_trained_1["label"]),
    (_labels_ots_1, _labels_trained_1),
)
# label_maps[k] translates the values found in labels[k] into legend text.
label_maps = (LABEL_MAP, label_map_1)
titles = (r"\textsc{RoBERTa} Large", r"\textsc{RoBERTa} Large + OSTI Pre-Training")

In [None]:
# Render text through LaTeX; the panel titles contain \textsc markup.
# `rc` is imported in the notebook's top import cell.
rc("text", usetex=True)

fig, axes = plt.subplots(2, 2, figsize=(7, 7))

# y-axis caption for each row, drawn on the first column only.
row_captions = ("NFC Labels", "BERTopic Cluster Labels")

# The row index selects the labelling and its display-name map; the column
# index selects the point set and the title.
for row in range(2):
    for col in range(2):
        ax = axes[row, col]
        panel_points = points[col]
        panel_labels = np.asarray(labels[row][col])
        # One series per class, in sorted label order. A boolean mask per class
        # replaces a Python-level scan of every label for every class, and it
        # avoids reusing the row index name inside a comprehension.
        for class_name in sorted(set(labels[row][col])):
            in_class = panel_labels == class_name
            ax.plot(
                panel_points[in_class, 0],
                panel_points[in_class, 1],
                label=label_maps[row][class_name],
                **PLOT_KWARGS,
            )
        # Titles only on the top row.
        if row == 0:
            ax.set_title(titles[col])
        ax.legend(loc="upper right")
        if col == 0:
            ax.set_ylabel(row_captions[row])
        # Same limits and ticks on every panel so the four are directly comparable.
        ax.set_xlim(-2.5, 25)
        ax.set_ylim(-2.5, 18)
        ax.set_xticks([0, 5, 10, 15, 20, 25])
        ax.set_yticks([0, 5, 10, 15])

In [None]:
# Write the 2x2 figure next to the other BERTopic plots.
combined_png = output_dir / "combined-plots.png"
fig.savefig(combined_png, dpi=300, bbox_inches="tight")