In [33]:
import json
import pandas as pd

# Load the exported collection. JSON text is UTF-8, so the encoding is pinned
# here instead of inheriting the locale-dependent platform default.
with open("collection_export.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

metadatas = json_data["metadatas"]

# Metadata fields that may be absent on a record; a missing value becomes "".
optional_fields = [
    "title", "date", "category", "niche", "key_themes", "recurring_topics",
]

# One row per exported item. "source" is indexed directly, so a record without
# it raises KeyError rather than being silently defaulted.
df = pd.DataFrame({
    "embedding": json_data["embeddings"],
    "source": [meta["source"] for meta in metadatas],
    **{
        field: [meta.get(field, "") for meta in metadatas]
        for field in optional_fields
    },
    "document": json_data["documents"],
    "id": json_data["ids"],
})


In [34]:
import umap

# Project the embedding vectors down to two components with UMAP, seeded
# with random_state=42.
reducer = umap.UMAP(random_state=42, n_components=2)
embeddings_2d = reducer.fit_transform(df["embedding"].tolist())

# Store the two projected coordinates as plotting columns on df.
df["x"], df["y"] = embeddings_2d[:, 0], embeddings_2d[:, 1]



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [35]:
import plotly.express as px

# Scatter of the 2-D projection, coloured by the "source" column.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data=["title", "date", "source", "category", "niche"],
    title="Embedding Clusters Visualization (by Source)",
    width=854,
    height=480,
)
fig.update_layout(legend_title="Source")

fig.show()


In [None]:
import numpy as np
import hdbscan

# Gather the per-row embedding lists into a single array for clustering.
X = np.array(df["embedding"].tolist())

# HDBSCAN configured with a minimum cluster size of 5.
clusterer = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=5)

# Keep the labels both as a standalone array and as a column on df.
df["cluster"] = cluster_labels = clusterer.fit_predict(X)

# The cluster labels are integers, and Plotly Express maps a numeric colour
# column to a continuous colour scale (colour bar, no legend). Plot a string
# copy of the labels instead so every cluster gets its own discrete colour and
# legend entry; df["cluster"] itself stays numeric for later filtering.
cluster_plot_df = df.assign(cluster_label=df["cluster"].astype(str))

fig = px.scatter(
    cluster_plot_df, x="x", y="y",
    color="cluster_label",
    # List legend entries in numeric label order rather than first-seen order.
    category_orders={
        "cluster_label": [str(label) for label in sorted(df["cluster"].unique())]
    },
    # Show the hover field under its original name.
    labels={"cluster_label": "cluster"},
    hover_data=["title", "date", "source", "category", "niche"],
    title="Embedding Clusters Visualization (HDBSCAN)"
)

fig.update_layout(
    legend_title="Cluster",
    width=854,
    height=480
)
fig.show()



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [None]:
# Metadata columns used as the de-duplication key; rows that match on all of
# them are collapsed to their first occurrence.
doc_columns = ["title", "category", "niche", "source", "key_themes", "recurring_topics"]

df_unique = df.drop_duplicates(subset=doc_columns)

# Cluster label followed by the key columns, one row per distinct combination.
df_clustered = df_unique[["cluster"] + doc_columns]

# Bare last expression: the notebook renders the frame as a table, whereas
# print() wraps the columns across lines and truncates long titles.
df_clustered


     cluster                                              title  \
0          0                      Issue no. 103 | February 2025   
8         -1             Food and Nutritional Security in India   
11        -1                           Financial Crime Bulletin   
12         2  KPMG global tech report – industrial manufactu...   
20         2       KPMG global tech report: Technology insights   
27         2           KPMG global tech report: energy insights   
34         2                       KPMG global tech report 2024   
41         3  Quality measures and standards for transitioni...   
51         5      The mutual funds route to Viksit Bharat @2047   
64         5  Financial health: Transcending from access to ...   
69         1  Towards a climate-resilient future: Strategies...   
82         4                    The retail reinvention paradigm   
90        -1  PwC India's Financial Services (FS) Risk Sympo...   
91         6  How India spends: A deep dive into consumer sp..

In [39]:
# Example: list the titles of every row assigned to cluster 0.
in_cluster_0 = df_clustered["cluster"] == 0
cluster_0_titles = df_clustered.loc[in_cluster_0, "title"].tolist()
print(cluster_0_titles)

['Issue no. 103 | February 2025']
