In [None]:
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.manifold import MDS
from sklearn.metrics import adjusted_rand_score
from scipy.cluster.hierarchy import linkage, fcluster
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tqdm import tqdm
from itertools import islice
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Experiment configuration.
MODEL_NAME = "all-MiniLM-L6-v2"  # checkpoint name handed to SentenceTransformer
DOCS_PER_PERSON = 400  # consecutive documents grouped into one "person"
NUM_PEOPLE_PER_CLUSTER = 40  # people built from each text source
TOTAL_PEOPLE = NUM_PEOPLE_PER_CLUSTER * 3  # three text sources are loaded below


def igw_distance_from_eigs(e_vals1, e_vals2):
    """Distance between two eigenvalue spectra of possibly different lengths.

    The shorter spectrum is compared entry by entry with the leading entries
    of the longer one, and the surplus entries of the longer one are compared
    with zero.  The result is the Euclidean norm of all those differences.
    Entries are paired by position, so both spectra must be given in the same
    order (this notebook passes them sorted in descending order).

    Args:
        e_vals1: 1-D array-like of eigenvalues (NumPy array, list or tuple).
        e_vals2: 1-D array-like of eigenvalues; its length may differ.

    Returns:
        The distance as a non-negative NumPy scalar.
    """
    # Coerce up front so plain Python sequences support the arithmetic below.
    shorter, longer = np.asarray(e_vals1), np.asarray(e_vals2)
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    overlap = len(shorter)
    sum_common = np.sum((shorter - longer[:overlap]) ** 2)
    sum_remaining = np.sum(longer[overlap:] ** 2)
    return np.sqrt(sum_common + sum_remaining)

# Each source must supply enough documents for NUM_PEOPLE_PER_CLUSTER people.
docs_to_take = DOCS_PER_PERSON * NUM_PEOPLE_PER_CLUSTER

# Source 1: the "abstract" field of the arXiv configuration of scientific_papers.
streamed_formal = load_dataset(
    "scientific_papers", "arxiv", split="train", streaming=True
)
formal_texts = [
    record["abstract"] for record in islice(streamed_formal, docs_to_take)
]

# Source 2: the "text" field of ag_news.
streamed_journalistic = load_dataset("ag_news", split="train", streaming=True)
journalistic_texts = [
    record["text"] for record in islice(streamed_journalistic, docs_to_take)
]

# Source 3: the "text" field of imdb.
streamed_informal = load_dataset("imdb", split="train", streaming=True)
informal_texts = [
    record["text"] for record in islice(streamed_informal, docs_to_take)
]

# The concatenation order matters: the code below cuts the corpus into
# consecutive runs of DOCS_PER_PERSON documents and labels person i with
# cluster i // NUM_PEOPLE_PER_CLUSTER.
all_texts = [*formal_texts, *journalistic_texts, *informal_texts]
print(f"Loaded a total of {len(all_texts)} documents.")

# Embed the whole corpus in one call; row r of `embeddings` belongs to all_texts[r].
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(all_texts, show_progress_bar=True)

cluster_names = [
    "Formal (scientific_papers)",
    "Journalistic (ag_news)",
    "Informal (imdb)",
]
all_eigenvalues, people_labels = [], []

for i in tqdm(range(TOTAL_PEOPLE), desc="Computing covariances and eigenvalues"):
    # Person i owns a contiguous run of DOCS_PER_PERSON embedding rows.
    start_idx = DOCS_PER_PERSON * i
    end_idx = DOCS_PER_PERSON + start_idx
    person_embeddings = embeddings[start_idx:end_idx]

    # Covariance across embedding dimensions (columns are the variables).
    column_means = np.mean(person_embeddings, axis=0)
    centered_embeddings = person_embeddings - column_means
    cov_matrix = np.cov(centered_embeddings, rowvar=False)

    # Store the spectrum with the largest eigenvalue first.
    ascending = np.sort(np.linalg.eigvalsh(cov_matrix))
    e_vals = ascending[::-1]
    all_eigenvalues.append(e_vals)

    # Consecutive groups of NUM_PEOPLE_PER_CLUSTER people share one label.
    people_labels.append(cluster_names[i // NUM_PEOPLE_PER_CLUSTER])

print(f"Extracted {len(all_eigenvalues)} eigenvalue vectors.")

# Pairwise distances between people's spectra: fill the strict upper triangle,
# then mirror it (the diagonal stays zero).
distance_matrix = np.zeros((TOTAL_PEOPLE, TOTAL_PEOPLE))
for i in tqdm(range(TOTAL_PEOPLE), desc="Computing IGW distance matrix"):
    spectrum_i = all_eigenvalues[i]
    for j in range(i + 1, TOTAL_PEOPLE):
        distance_matrix[i, j] = igw_distance_from_eigs(
            spectrum_i, all_eigenvalues[j]
        )
distance_matrix = distance_matrix + distance_matrix.T

# Two-dimensional layout computed directly from the precomputed distances.
mds = MDS(
    dissimilarity="precomputed",
    n_components=2,
    n_init=10,
    max_iter=500,
    random_state=42,
)
embedding_2d = mds.fit_transform(distance_matrix)
print("MDS 2D embedding computed.")

In [None]:
import matplotlib_inline

# Render inline figures as SVG and apply the "math.mplstyle" style sheet
# (presumably a file next to the notebook -- confirm it is shipped with it).
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
plt.style.use("math.mplstyle")

cluster_names = [
    "Formal (scientific_papers)",
    "Journalistic (ag_news)",
    "Informal (imdb)",
]
# Person i belongs to cluster i // NUM_PEOPLE_PER_CLUSTER.
people_labels = [
    cluster_names[person // NUM_PEOPLE_PER_CLUSTER] for person in range(TOTAL_PEOPLE)
]

cluster_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]
color_map = dict(zip(cluster_names, cluster_colors))
true_colors = [color_map[person_label] for person_label in people_labels]

# One point per person, coloured by the source the person's documents came from.
fig, ax = plt.subplots()
ax.axis("equal")
ax.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=true_colors,
    s=30,
    alpha=0.8,
    edgecolors="w",
)
legend_patches = [
    mpatches.Patch(color=patch_color, label=patch_name)
    for patch_name, patch_color in color_map.items()
]
ax.legend(handles=legend_patches, fontsize=12)
ax.set_title("MDS embedding of IGW distance matrix", fontsize=14)
fig.savefig("igw_mds_embedding.pdf")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

TOP_N_EIGENVALUES = 50  # number of leading eigenvalues shown per curve

for cluster_name in cluster_names:
    # Leading eigenvalues of every person labelled with this cluster.
    cluster_eigenvalues = [
        spectrum[:TOP_N_EIGENVALUES]
        for spectrum, label in zip(all_eigenvalues, people_labels)
        if label == cluster_name
    ]

    # Entrywise mean over the people in the cluster.
    average_spectrum = np.mean(cluster_eigenvalues, axis=0)

    plt.plot(
        # Size the x-axis from the data so spectra shorter than
        # TOP_N_EIGENVALUES still plot instead of raising a shape mismatch.
        range(1, len(average_spectrum) + 1),
        average_spectrum,
        label=cluster_name,
        # Look the colour up by cluster name, not by position in
        # `cluster_colors`, so the curves keep their style colours no matter
        # what list that shared name currently holds.
        color=color_map[cluster_name],
        marker="o",
        markersize=3,
    )

plt.title("Average eigenspectra of archetypes", fontsize=14)
plt.xlabel("Eigenvalue index (sorted descending)", fontsize=12)
plt.ylabel("Average eigenvalue", fontsize=12)
plt.legend(fontsize=12)

plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
from tqdm import tqdm
import random


def calculate_barycenter_eigenvalues(eigenvalue_list):
    """Entrywise mean of a collection of eigenvalue spectra.

    Spectra shorter than the longest one are right-padded with zeros before
    averaging, so the result has the length of the longest input.

    Args:
        eigenvalue_list: sequence of 1-D array-likes, or a 2-D NumPy array
            with one spectrum per row.  ``None`` or an empty sequence is
            treated as "nothing to average".

    Returns:
        A 1-D NumPy array holding the mean spectrum, or ``None`` when there
        are no spectra.
    """
    # An explicit length check instead of truth-testing: `not array` raises
    # ValueError for NumPy arrays holding more than one element.
    if eigenvalue_list is None or len(eigenvalue_list) == 0:
        return None
    max_dim = max(len(eigs) for eigs in eigenvalue_list)
    padded_eigenvalues = [
        np.pad(eigs, (0, max_dim - len(eigs)), "constant") for eigs in eigenvalue_list
    ]
    return np.mean(padded_eigenvalues, axis=0)


def kmeans_plusplus_init_eigs(data_eigenvalues, k):
    """Choose ``k`` initial centroids from the data, k-means++ style.

    The first centroid is a uniformly random data point (drawn with the
    ``random`` module).  Each further centroid is drawn with ``np.random``,
    with probability proportional to the squared ``igw_distance_from_eigs``
    distance from each point to its nearest already-chosen centroid.

    Args:
        data_eigenvalues: non-empty sequence of eigenvalue spectra.
        k: number of centroids to return.

    Returns:
        A list of ``k`` spectra taken from ``data_eigenvalues``.
    """
    num_points = len(data_eigenvalues)
    centroids = [data_eigenvalues[random.randint(0, num_points - 1)]]
    for _ in range(k - 1):
        dist_sq = np.array(
            [
                min(igw_distance_from_eigs(p, c) ** 2 for c in centroids)
                for p in data_eigenvalues
            ]
        )
        total = dist_sq.sum()
        if total == 0:
            # Every point coincides with a chosen centroid, so normalising
            # would give 0/0 = NaN weights and np.random.choice would raise.
            # All points are equally (un)informative: sample uniformly.
            probs = np.full(num_points, 1.0 / num_points)
        else:
            probs = dist_sq / total
        next_centroid_idx = np.random.choice(num_points, p=probs)
        centroids.append(data_eigenvalues[next_centroid_idx])
    return centroids


def run_lloyds_algorithm_on_eigs(data_eigenvalues, k=3, max_iterations=20):
    """Cluster eigenvalue spectra with Lloyd's iterations.

    Centroids are initialised by ``kmeans_plusplus_init_eigs``.  Each
    iteration assigns every spectrum to its nearest centroid under
    ``igw_distance_from_eigs`` and then replaces each non-empty cluster's
    centroid with ``calculate_barycenter_eigenvalues`` of its members.  The
    loop stops when two consecutive assignments agree or after
    ``max_iterations`` iterations.

    Args:
        data_eigenvalues: sequence of eigenvalue spectra, one per item.
        k: number of clusters.
        max_iterations: upper bound on the number of iterations.

    Returns:
        A NumPy array with one cluster index per input spectrum (all zeros
        when ``max_iterations`` is 0).
    """
    centroids = kmeans_plusplus_init_eigs(data_eigenvalues, k)
    assignments = np.zeros(len(data_eigenvalues), dtype=int)
    # The all-zero placeholder above is also a legitimate labelling, so it must
    # not take part in the convergence test: otherwise a first pass that puts
    # every point in cluster 0 would stop before any centroid is updated.
    has_previous_assignment = False
    for _ in tqdm(range(max_iterations), desc="k-means++ algorithm"):
        new_assignments = np.array(
            [
                np.argmin([igw_distance_from_eigs(p, c) for c in centroids])
                for p in data_eigenvalues
            ]
        )
        if has_previous_assignment and np.array_equal(assignments, new_assignments):
            break
        assignments = new_assignments
        has_previous_assignment = True
        for j in range(k):
            cluster_points = [
                data_eigenvalues[i] for i, label in enumerate(assignments) if label == j
            ]
            # An empty cluster keeps its previous centroid.
            if cluster_points:
                centroids[j] = calculate_barycenter_eigenvalues(cluster_points)
    return assignments


# Seed both generators used by the k-means++ initialisation (`random` picks the
# first centroid, `np.random` the remaining ones) so the clustering and the
# saved figure are repeatable, matching the fixed random_state given to MDS.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

assignments_kmeans_eigs = run_lloyds_algorithm_on_eigs(all_eigenvalues, k=3)

# Integer-encode the ground-truth labels and report the agreement with the
# clustering.  The adjusted Rand index ignores how cluster ids are numbered.
label_to_int = {label: i for i, label in enumerate(np.unique(people_labels))}
true_assignments = [label_to_int[label] for label in people_labels]
ari_score = adjusted_rand_score(true_assignments, assignments_kmeans_eigs)
print(f"Adjusted Rand index vs. true styles: {ari_score:.3f}")

# Three viridis shades, one per output cluster.
plot_colors = [plt.cm.viridis(val) for val in np.linspace(.3, .8, 3)]
assigned_colors = [plot_colors[label] for label in assignments_kmeans_eigs]

plt.axis("equal")
plt.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=assigned_colors,
    s=30,
    edgecolors="w",
    alpha=0.8,
)
plt.title("$k$-means++ output clusters", fontsize=14)
plt.savefig("kmeans_clusters.pdf")
plt.show()

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.manifold import MDS
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tqdm import tqdm
import random
import warnings

# Suppress every warning category.
warnings.filterwarnings(action="ignore")

# Display key -> checkpoint name passed to SentenceTransformer.
MODEL_NAMES = dict(
    [
        ("all-MiniLM-L6-v2 (384D)", "all-MiniLM-L6-v2"),
        ("all-mpnet-base-v2 (768D)", "all-mpnet-base-v2"),
    ]
)

def igw_distance_from_eigs(e_vals1, e_vals2):
    """Distance between two eigenvalue spectra of possibly different lengths.

    The shorter spectrum is compared entry by entry with the leading entries
    of the longer one, and the surplus entries of the longer one are compared
    with zero.  The result is the Euclidean norm of all those differences.
    Entries are paired by position, so both spectra must be given in the same
    order (this notebook passes them sorted in descending order).

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.

    Args:
        e_vals1: 1-D array-like of eigenvalues (NumPy array, list or tuple).
        e_vals2: 1-D array-like of eigenvalues; its length may differ.

    Returns:
        The distance as a non-negative NumPy scalar.
    """
    # Coerce up front so plain Python sequences support the arithmetic below.
    shorter, longer = np.asarray(e_vals1), np.asarray(e_vals2)
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    overlap = len(shorter)
    sum_common = np.sum((shorter - longer[:overlap]) ** 2)
    sum_remaining = np.sum(longer[overlap:] ** 2)
    return np.sqrt(sum_common + sum_remaining)


def calculate_barycenter_eigenvalues(eigenvalue_list):
    """Entrywise mean of a collection of eigenvalue spectra.

    Spectra shorter than the longest one are right-padded with zeros before
    averaging, so the result has the length of the longest input.  This is
    what lets spectra from embedding models of different dimension be averaged
    together.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.

    Args:
        eigenvalue_list: sequence of 1-D array-likes, or a 2-D NumPy array
            with one spectrum per row.  ``None`` or an empty sequence is
            treated as "nothing to average".

    Returns:
        A 1-D NumPy array holding the mean spectrum, or ``None`` when there
        are no spectra.
    """
    # An explicit length check instead of truth-testing: `not array` raises
    # ValueError for NumPy arrays holding more than one element.
    if eigenvalue_list is None or len(eigenvalue_list) == 0:
        return None
    max_dim = max(len(eigs) for eigs in eigenvalue_list)
    padded_eigenvalues = [
        np.pad(eigs, (0, max_dim - len(eigs)), "constant") for eigs in eigenvalue_list
    ]
    return np.mean(padded_eigenvalues, axis=0)


def kmeans_plusplus_init_eigs(data_eigenvalues, k):
    """Choose ``k`` initial centroids from the data, k-means++ style.

    The first centroid is a uniformly random data point (drawn with the
    ``random`` module).  Each further centroid is drawn with ``np.random``,
    with probability proportional to the squared ``igw_distance_from_eigs``
    distance from each point to its nearest already-chosen centroid.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.

    Args:
        data_eigenvalues: non-empty sequence of eigenvalue spectra.
        k: number of centroids to return.

    Returns:
        A list of ``k`` spectra taken from ``data_eigenvalues``.
    """
    num_points = len(data_eigenvalues)
    centroids = [data_eigenvalues[random.randint(0, num_points - 1)]]
    for _ in range(k - 1):
        dist_sq = np.array(
            [
                min(igw_distance_from_eigs(p, c) ** 2 for c in centroids)
                for p in data_eigenvalues
            ]
        )
        total = dist_sq.sum()
        if total == 0:
            # Every point coincides with a chosen centroid, so normalising
            # would give 0/0 = NaN weights and np.random.choice would raise.
            # All points are equally (un)informative: sample uniformly.
            probs = np.full(num_points, 1.0 / num_points)
        else:
            probs = dist_sq / total
        next_centroid_idx = np.random.choice(num_points, p=probs)
        centroids.append(data_eigenvalues[next_centroid_idx])
    return centroids


def run_lloyds_algorithm_on_eigs(data_eigenvalues, k=3, max_iterations=20):
    """Cluster eigenvalue spectra with Lloyd's iterations.

    Centroids are initialised by ``kmeans_plusplus_init_eigs``.  Each
    iteration assigns every spectrum to its nearest centroid under
    ``igw_distance_from_eigs`` and then replaces each non-empty cluster's
    centroid with ``calculate_barycenter_eigenvalues`` of its members.  The
    loop stops when two consecutive assignments agree or after
    ``max_iterations`` iterations.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect from here on.

    Args:
        data_eigenvalues: sequence of eigenvalue spectra, one per item.
        k: number of clusters.
        max_iterations: upper bound on the number of iterations.

    Returns:
        A NumPy array with one cluster index per input spectrum (all zeros
        when ``max_iterations`` is 0).
    """
    centroids = kmeans_plusplus_init_eigs(data_eigenvalues, k)
    assignments = np.zeros(len(data_eigenvalues), dtype=int)
    # The all-zero placeholder above is also a legitimate labelling, so it must
    # not take part in the convergence test: otherwise a first pass that puts
    # every point in cluster 0 would stop before any centroid is updated.
    has_previous_assignment = False
    for _ in tqdm(range(max_iterations), desc="k-means++ algorithm", leave=False):
        new_assignments = np.array(
            [
                np.argmin([igw_distance_from_eigs(p, c) for c in centroids])
                for p in data_eigenvalues
            ]
        )
        if has_previous_assignment and np.array_equal(assignments, new_assignments):
            break
        assignments = new_assignments
        has_previous_assignment = True
        for j in range(k):
            cluster_points = [
                data_eigenvalues[i] for i, label in enumerate(assignments) if label == j
            ]
            # An empty cluster keeps its previous centroid.
            if cluster_points:
                centroids[j] = calculate_barycenter_eigenvalues(cluster_points)
    return assignments

# Cut the corpus into one list of DOCS_PER_PERSON documents per person.
all_texts_by_person = [
    all_texts[i * DOCS_PER_PERSON : (i + 1) * DOCS_PER_PERSON]
    for i in range(TOTAL_PEOPLE)
]

models = {name: SentenceTransformer(path) for name, path in MODEL_NAMES.items()}
model_keys = list(MODEL_NAMES.keys())

# Visit people in a shuffled order so that alternating models along that order
# spreads each model across all three styles.  The shuffle is seeded so the
# person -> model split is the same on every run.
random.seed(42)
person_indices = list(range(TOTAL_PEOPLE))
random.shuffle(person_indices)

original_people_labels = people_labels

# Slot p holds person p's result, so both lists stay aligned with
# `original_people_labels` regardless of the visiting order.
all_eigenvalues_mixed = [None] * TOTAL_PEOPLE
model_assignments = [None] * TOTAL_PEOPLE

for position, person_idx in enumerate(tqdm(person_indices, desc="Embedding users")):
    model_key = model_keys[position % len(model_keys)]

    # Loop-local names on purpose: `model` and `embeddings` denote the single
    # encoder and the all-documents embedding matrix elsewhere in the notebook
    # and should not be overwritten with per-person temporaries.
    encoder = models[model_key]
    person_embeddings = encoder.encode(
        all_texts_by_person[person_idx], show_progress_bar=False
    )

    cov_matrix = np.cov(
        person_embeddings - np.mean(person_embeddings, axis=0), rowvar=False
    )
    # Spectrum with the largest eigenvalue first; its length equals the
    # embedding dimension of the model used for this person.
    all_eigenvalues_mixed[person_idx] = np.sort(np.linalg.eigvalsh(cov_matrix))[::-1]
    model_assignments[person_idx] = model_key

# Pairwise distances between spectra that may have different lengths
# (igw_distance_from_eigs handles the length mismatch).  Only the strict upper
# triangle is computed; each value is written to both symmetric positions.
distance_matrix_mixed = np.zeros((TOTAL_PEOPLE, TOTAL_PEOPLE))
for i in tqdm(range(TOTAL_PEOPLE), desc="Computing IGW distance matrix"):
    for j in range(i + 1, TOTAL_PEOPLE):
        dist = igw_distance_from_eigs(
            all_eigenvalues_mixed[i], all_eigenvalues_mixed[j]
        )
        distance_matrix_mixed[i, j] = distance_matrix_mixed[j, i] = dist

mds = MDS(
    n_components=2,
    dissimilarity="precomputed",
    random_state=42,
    n_init=10,
    max_iter=500,
)
embedding_2d_mixed = mds.fit_transform(distance_matrix_mixed)

# Seed both generators used by the k-means++ initialisation (`random` picks the
# first centroid, `np.random` the remaining ones) so the clustering below is
# repeatable, matching the fixed random_state given to MDS.
random.seed(42)
np.random.seed(42)
assignments_kmeans_eigs = run_lloyds_algorithm_on_eigs(all_eigenvalues_mixed, k=3)

# Colour per embedding model, keyed exactly like the entries of `model_assignments`.
model_color_map = {
    "all-MiniLM-L6-v2 (384D)": "#d62728",
    "all-mpnet-base-v2 (768D)": "#9467bd",
}
model_colors = [model_color_map[name] for name in model_assignments]

# The legend uses a longer spelling of each model name.  Keep that text in its
# own mapping instead of rebinding `model_color_map`, so the colour lookup
# above stays valid for the keys stored in `model_assignments`.
model_legend_labels = {
    "all-MiniLM-L6-v2 (384D)": "all-MiniLM-L6-v2 (384-dim.)",
    "all-mpnet-base-v2 (768D)": "all-mpnet-base-v2 (768-dim.)",
}

plt.figure()
plt.axis("equal")
plt.scatter(
    embedding_2d_mixed[:, 0],
    embedding_2d_mixed[:, 1],
    c=model_colors,
    s=30,
    alpha=0.8,
    edgecolors="w",
)

legend_patches_model = [
    mpatches.Patch(color=color, label=model_legend_labels[name])
    for name, color in model_color_map.items()
]
plt.legend(handles=legend_patches_model, fontsize=11, facecolor="white", framealpha=1)
plt.title("MDS embeddings colored by embedding model", fontsize=14)
plt.savefig("heterogeneous_mds_embeddings.pdf")
plt.show()

# Hex colour for each ground-truth style label.
style_color_map = {
    "Formal (scientific_papers)": "#1f77b4",
    "Journalistic (ag_news)": "#ff7f0e",
    "Informal (imdb)": "#2ca02c",
}
style_colors = [style_color_map[style] for style in original_people_labels]

# Same 2-D layout as the per-model figure, now coloured by the true style.
fig_style, ax_style = plt.subplots()
ax_style.axis("equal")
ax_style.scatter(
    embedding_2d_mixed[:, 0],
    embedding_2d_mixed[:, 1],
    c=style_colors,
    s=30,
    alpha=0.8,
    edgecolors="w",
)
legend_patches_style = [
    mpatches.Patch(color=patch_color, label=patch_name)
    for patch_name, patch_color in style_color_map.items()
]
ax_style.legend(
    handles=legend_patches_style, fontsize=11, facecolor="white", framealpha=1
)
ax_style.set_title("MDS embedding colored by true style (ground truth)", fontsize=14)
fig_style.savefig("heterogeneous_true_styles.pdf")
plt.show()

# Integer-encode the ground-truth styles and score the clustering against them.
# The adjusted Rand index ignores how cluster ids are numbered.
label_to_int = {label: i for i, label in enumerate(np.unique(original_people_labels))}
true_assignments = [label_to_int[label] for label in original_people_labels]
ari_score = adjusted_rand_score(true_assignments, assignments_kmeans_eigs)
# Report the score; it was previously computed and then dropped.
print(f"Adjusted Rand index vs. true styles (mixed models): {ari_score:.3f}")

# Three viridis shades, one per output cluster.  They get their own name
# because `cluster_colors` is already bound by an earlier cell to the per-style
# hex colours; rebinding it here would change what that name means whenever an
# earlier cell is re-run.
kmeans_palette = [plt.cm.viridis(val) for val in np.linspace(0.3, 0.8, 3)]
plot_colors = [kmeans_palette[label] for label in assignments_kmeans_eigs]

plt.figure()
plt.axis("equal")
plt.scatter(
    embedding_2d_mixed[:, 0],
    embedding_2d_mixed[:, 1],
    c=plot_colors,
    s=30,
    alpha=0.8,
    edgecolors="w",
)
plt.title("$k$-means++ output clusters", fontsize=14)
plt.savefig("heterogeneous_kmeans_clusters.pdf")
plt.show()