In [17]:
import json
import pandas as pd

# Load the exported collection. The JSON is expected to hold four parallel
# lists under the keys "embeddings", "metadatas", "documents" and "ids".
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform instead of with the locale-dependent default codec.
with open("collection_export.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

metadatas = json_data["metadatas"]

# Metadata fields that may be absent on some records; a missing field becomes
# an empty string in the corresponding column.
optional_fields = ("title", "date", "category", "niche")

# One row per exported record. "source" is looked up strictly (a KeyError is
# raised if any record lacks it); the optional fields are spliced in after it
# so the column order is embedding, source, title, date, category, niche,
# document, id.
df = pd.DataFrame({
    "embedding": json_data["embeddings"],
    "source": [meta["source"] for meta in metadatas],
    **{
        field: [meta.get(field, "") for meta in metadatas]
        for field in optional_fields
    },
    "document": json_data["documents"],
    "id": json_data["ids"],
})


In [18]:
import umap

# Project the embedding vectors down to two dimensions with UMAP.
# random_state is pinned to 42 so the reducer is seeded.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding_vectors = df["embedding"].tolist()
embeddings_2d = reducer.fit_transform(embedding_vectors)

# Transposing turns the two output columns into two rows, which unpack
# directly into the "x" and "y" plotting columns.
df["x"], df["y"] = embeddings_2d.T



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [19]:
import plotly.express as px

# 2-D scatter of the projected embeddings, one colour per source.
hover_columns_2d = ["title", "date", "source", "category", "niche"]

fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="source",
    hover_data=hover_columns_2d,
    title="Embedding Clusters Visualization (by Source)",
)

# Fixed 854x480 canvas with a labelled legend.
fig.update_layout(legend_title="Source", width=854, height=480)

fig.show()


In [None]:
# Project the same embedding vectors down to three dimensions with a
# separate UMAP reducer, again seeded with random_state=42.
reducer_3d = umap.UMAP(n_components=3, random_state=42)
embedding_vectors_3d = df["embedding"].tolist()
embeddings_3d = reducer_3d.fit_transform(embedding_vectors_3d)

# Store each output component in its own plotting column.
for component_index, column_name in enumerate(("x3d", "y3d", "z3d")):
    df[column_name] = embeddings_3d[:, component_index]



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [21]:
# 3-D scatter of the projected embeddings, one colour per source.
hover_columns_3d = ["title", "date", "id", "source"]

fig_3d = px.scatter_3d(
    data_frame=df,
    x="x3d",
    y="y3d",
    z="z3d",
    color="source",
    hover_data=hover_columns_3d,
    title="3D Embedding Clusters Visualization (by Source)",
)

# Fixed 1000x800 canvas with a labelled legend.
fig_3d.update_layout(legend_title="Source", width=1000, height=800)

fig_3d.show()
