In [33]:
import json
import pandas as pd

# Read the exported collection. JSON text is read as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open("collection_export.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

metadatas = json_data["metadatas"]

# Metadata fields that may be missing from an entry; they fall back to "".
# "source" is handled separately because it is required (KeyError if absent).
optional_fields = ["title", "date", "category", "niche", "key_themes", "recurring_topics"]

# One row per exported entry.
# NOTE(review): assumes "embeddings", "metadatas", "documents" and "ids" are
# parallel lists of equal length - pandas raises otherwise.
df = pd.DataFrame({
    "embedding": json_data["embeddings"],
    "source": [meta["source"] for meta in metadatas],
    **{field: [meta.get(field, "") for meta in metadatas] for field in optional_fields},
    "document": json_data["documents"],
    "id": json_data["ids"],
})


In [34]:
import umap

# Project every embedding down to two components for plotting; random_state
# is pinned so repeated runs use the same seed.
reducer = umap.UMAP(n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(list(df["embedding"]))

# Transposing yields one row per component: first -> x, second -> y.
df["x"], df["y"] = embeddings_2d.T



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [35]:
import plotly.express as px

# Scatter of the 2-D projection, one colour per source, with the layout
# settings chained directly onto the freshly built figure.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="source",
    hover_data=["title", "date", "source", "category", "niche"],
    title="Embedding Clusters Visualization (by Source)",
).update_layout(legend_title="Source", width=854, height=480)

fig.show()


In [None]:
import numpy as np
import hdbscan

# Stack the per-row embeddings into one array for clustering.
# NOTE(review): assumes every embedding has the same length - confirm.
X = np.array(df["embedding"].tolist())

# Cluster on the original embeddings (X), not on the 2-D projection in x/y.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
cluster_labels = clusterer.fit_predict(X)

# Keep the integer labels on df for downstream cells (HDBSCAN uses -1 for noise).
df["cluster"] = cluster_labels

# Plotly Express treats a numeric colour column as continuous (colour bar and
# gradient), which misrepresents categorical cluster ids and leaves
# legend_title without effect. Plot a string-typed copy of the labels so each
# cluster gets its own discrete colour and legend entry; df itself is untouched.
cluster_order = [str(label) for label in sorted(df["cluster"].unique())]

fig = px.scatter(
    df.assign(cluster=df["cluster"].astype(str)),
    x="x", y="y",
    color="cluster",
    category_orders={"cluster": cluster_order},  # legend sorted numerically
    hover_data=["title", "date", "source", "category", "niche"],
    title="Embedding Clusters Visualization (HDBSCAN)"
)

fig.update_layout(
    legend_title="Cluster",
    width=854,
    height=480
)
fig.show()



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [47]:
# Columns that together identify one distinct entry; rows that agree on all
# of them are collapsed to their first occurrence.
dedup_columns = ["title", "category", "niche", "source", "key_themes", "recurring_topics"]
df_unique = df.drop_duplicates(subset=dedup_columns)

# Independent copy holding the cluster label followed by the identifying columns.
df_clustered = df_unique[["cluster", *dedup_columns]].copy()

def split_to_list(text):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Returns an empty list when ``text`` is not a string or contains nothing
    but whitespace and/or commas.
    """
    if not isinstance(text, str):
        return []
    pieces = (piece.strip() for piece in text.split(','))
    return [piece for piece in pieces if piece]

# Turn the comma-separated metadata strings into lists of trimmed items.
df_clustered["key_themes"] = df_clustered["key_themes"].apply(split_to_list)
df_clustered["recurring_topics"] = df_clustered["recurring_topics"].apply(split_to_list)

# Export the result (presumably for a Streamlit app, judging by the file name).
# NOTE(review): the frame's index is written as an unnamed first column, and
# list cells are written as their Python repr, so a reader has to parse them
# back into lists - confirm the consumer expects both.
df_clustered.to_csv("clustered_df_streamlit.csv")

In [48]:
# Show the first rows of df_clustered as a quick sanity check.
df_clustered.head()

Unnamed: 0,cluster,title,category,niche,source,key_themes,recurring_topics
0,0,Issue no. 103 | February 2025,Economy and Growth,Inflation Trends,KPMG Insights,"[Business Combinations, Sustainability Disclos...","[IFRS, Measurement Period, Acquisition Method,..."
8,-1,Food and Nutritional Security in India,India (Country),Agricultural Policy,KPMG Insights,"[Food Security, Public Policy, Nutrition, Sust...","[Public Distribution System (PDS), Malnutritio..."
11,-1,Financial Crime Bulletin,Risk Regulation,Financial Compliance,KPMG Insights,"[Financial Crime, Regulatory Initiatives, Tech...","[Anti-Money Laundering (AML), Corporate Transp..."
12,2,KPMG global tech report – industrial manufactu...,Technology,Industrial IoT,KPMG Insights,"[Digital Transformation, Operational Efficienc...","[Industrial Manufacturing, KPMG Global Tech Re..."
20,2,KPMG global tech report: Technology insights,Technology,Emerging Technologies,KPMG Insights,"[Digital Transformation, Strategic Investment,...","[KPMG Global Tech Report, Technology Sector, E..."
