In [1]:
from itertools import chain

import altair as alt
import numpy as np
import pandas as pd
import plotly.express as px
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from umap import UMAP


In [2]:
# Load the happy-moments corpus and keep only the text and its label.
# Rows missing either field are discarded, repeated texts are reduced to a
# single row, and the index is renumbered so it is contiguous again.
wanted_columns = ["cleaned_hm", "ground_truth_category"]
raw_moments = pd.read_csv("cleaned_hm.csv.xz")
labelled_moments = raw_moments.dropna(
    subset=["ground_truth_category", "cleaned_hm"], how="any"
)
unique_moments = labelled_moments.drop_duplicates(subset=["cleaned_hm"])
df = unique_moments[wanted_columns].reset_index(drop=True)


In [3]:
# Name of the sentence-embedding checkpoint used for every document.
SBERT_MODEL_NAME = "paraphrase-distilroberta-base-v1"
model = SentenceTransformer(SBERT_MODEL_NAME)



In [4]:
# Lower-case every moment, then encode the resulting list of strings.
lowercased_moments = df["cleaned_hm"].str.lower()
docs = lowercased_moments.tolist()
embeddings = model.encode(docs, show_progress_bar=True)


Batches: 100%|██████████| 398/398 [05:53<00:00,  1.13it/s]


In [5]:
# Settings for the planar projection: cosine distance between the sentence
# embeddings, a fixed seed, and progress logging switched on.
umap_settings = dict(
    min_dist=0.00,
    n_neighbors=30,
    metric="cosine",
    random_state=1234,
    verbose=True,
)
umap = UMAP(**umap_settings)
embeddings_umap = umap.fit_transform(embeddings)


UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     min_dist=0.0, n_neighbors=30, random_state=1234, verbose=True)
Construct fuzzy simplicial set
Thu Jan 28 12:04:46 2021 Finding Nearest Neighbors
Thu Jan 28 12:04:46 2021 Building RP forest with 11 trees
Thu Jan 28 12:04:47 2021 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	Stopping threshold met -- exiting after 4 iterations
Thu Jan 28 12:04:59 2021 Finished Nearest Neighbor Search
Thu Jan 28 12:05:01 2021 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Thu Jan 28 12:05:11 2021 Finished embedding


In [6]:
# One row per document: the lower-cased text, its two UMAP coordinates and the
# ground-truth label. The label Series is aligned on the row index, so this
# relies on `df` having a fresh 0..n-1 index.
embeddings_df = pd.DataFrame({"Document": docs}).assign(
    **{
        "Component 1": embeddings_umap[:, 0],
        "Component 2": embeddings_umap[:, 1],
        "Ground Truth": df["ground_truth_category"],
    }
)


In [7]:
# Clusterer for the planar UMAP coordinates, using the "leaf" cluster
# selection method; every other HDBSCAN setting is left at its default.
hdbscan = HDBSCAN(cluster_selection_method="leaf")



In [11]:
# Fit the clusterer on the 2D UMAP coordinates and store one label per row.
hdbscan.fit(embeddings_umap)
clusters = hdbscan.labels_
embeddings_df["Cluster"] = clusters


In [12]:

# Turn off Altair's row-count limit for chart data — presumably so the whole
# embeddings table can be embedded in the chart below.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [13]:
# Scatter plot of the planar UMAP coordinates, one point per document.
point_cloud = alt.Chart(
    embeddings_df,
    height=1000,
    width=1000,
    title="Happy Moments - SBERT → UMAP → HDBSCAN",
).mark_point()

# Both axes are hidden; colour encodes the cluster label as a nominal field
# and the tooltip shows the text with both labels.
encoded_points = point_cloud.encode(
    x=alt.X("Component 1", axis=None),
    y=alt.Y("Component 2", axis=None),
    tooltip=["Document", "Cluster", "Ground Truth"],
    color="Cluster:N",
)

# Drop grid lines and the view border, then enable pan/zoom.
chart = (
    encoded_points.configure_axis(grid=False)
    .configure_view(strokeWidth=0)
    .interactive()
)


In [14]:
# Write the Altair chart to an HTML file under docs/.
chart_html_path = "docs/nlp-sbert-umap-hdbscan-chart.html"
chart.save(chart_html_path)


In [19]:
# Second UMAP run whose output space uses the haversine metric, i.e. the
# embedding is a pair of angles on a sphere rather than planar coordinates.
# NOTE(review): no input `metric` is passed here, so UMAP's default is used —
# confirm that is intended.
sphere_mapper = UMAP(
    min_dist=0.00,
    n_neighbors=30,
    output_metric="haversine",
    random_state=42,
    verbose=True,
)
umap_sphere_embeddings = sphere_mapper.fit_transform(embeddings)

# The first output column is used as the polar angle and the second as the
# azimuth in the usual spherical -> Cartesian conversion on the unit sphere.
polar = umap_sphere_embeddings[:, 0]
azimuth = umap_sphere_embeddings[:, 1]
sphere_x = np.sin(polar) * np.cos(azimuth)
sphere_y = np.sin(polar) * np.sin(azimuth)
sphere_z = np.cos(polar)

embeddings_df["Sphere Embedding X"] = sphere_x
embeddings_df["Sphere Embedding Y"] = sphere_y
embeddings_df["Sphere Embedding Z"] = sphere_z

# Flatten the sphere back to two angles for a 2D plot: an arctan2 angle in
# [-pi, pi] and the negated arccos of z in [-pi, 0].
embeddings_df["2D Sphere Embedding X"] = np.arctan2(sphere_x, sphere_y)
embeddings_df["2D Sphere Embedding Y"] = -np.arccos(sphere_z)


In [27]:
# Cluster the spherical embedding with great-circle (haversine) distance.
#
# The haversine metric reads each row as [latitude, longitude] in radians, with
# latitude in [-pi/2, pi/2]. The plotting columns do not have that layout:
# "2D Sphere Embedding X" is an arctan2 angle in [-pi, pi] (usable as the
# longitude) and "2D Sphere Embedding Y" is -arccos(z) in [-pi, 0] (a negated
# polar angle). Adding pi/2 to the latter yields a proper latitude, and the
# columns are stacked in [latitude, longitude] order before clustering.
sphere_latitude = (
    embeddings_df["2D Sphere Embedding Y"].to_numpy(dtype=np.float64) + np.pi / 2
)
sphere_longitude = embeddings_df["2D Sphere Embedding X"].to_numpy(dtype=np.float64)
sphere_lat_lon = np.column_stack([sphere_latitude, sphere_longitude])

hdbscan_sphere = HDBSCAN(cluster_selection_method="eom", metric="haversine")
clusters_sphere = hdbscan_sphere.fit_predict(sphere_lat_lon)
embeddings_df["Cluster Sphere"] = clusters_sphere
# String copy of the labels so Plotly treats them as discrete categories.
embeddings_df["Cluster Sphere (str)"] = embeddings_df["Cluster Sphere"].astype(str)

In [30]:
# 2D scatter of the flattened sphere angles, coloured by the string cluster
# label so each cluster gets its own discrete colour.
sphere_2d_options = dict(
    x="2D Sphere Embedding X",
    y="2D Sphere Embedding Y",
    color="Cluster Sphere (str)",
    hover_data=["Document", "Cluster Sphere", "Ground Truth"],
    opacity=0.8,
    color_discrete_sequence=px.colors.qualitative.Alphabet,
)
fig = px.scatter(embeddings_df, **sphere_2d_options)

sphere_2d_html_path = "docs/nlp-sbert-umap-hdbscan-sphere-2d.html"
fig.write_html(sphere_2d_html_path)

In [34]:
# 3D scatter of the points on the unit sphere, coloured by cluster label.
sphere_3d_options = dict(
    x="Sphere Embedding X",
    y="Sphere Embedding Y",
    z="Sphere Embedding Z",
    color="Cluster Sphere (str)",
    hover_data=["Document", "Cluster Sphere", "Ground Truth"],
    color_discrete_sequence=px.colors.qualitative.Alphabet,
    opacity=0.8,
)
fig = px.scatter_3d(embeddings_df, **sphere_3d_options)

# Scene camera with z as the up direction.
# NOTE(review): "eye" and "center" are both the origin — confirm this camera
# placement is intended.
camera = {
    "up": {"x": 0, "y": 0, "z": 1},
    "center": {"x": 0, "y": 0, "z": 0},
    "eye": {"x": 0, "y": 0, "z": 0},
}
fig.update_layout(scene_camera=camera)

sphere_3d_html_path = "docs/nlp-sbert-umap-hdbscan-sphere.html"
fig.write_html(sphere_3d_html_path)
