In [404]:
# Standard libraries
import os
import re
import time
import warnings
import logging
import gc
import ast 

# Data manipulation
import numpy as np
import pandas as pd

# Machine learning & clustering
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import rand_score, fowlkes_mallows_score, adjusted_rand_score
from scipy.cluster.hierarchy import fcluster

# Dimensionality reduction
from umap import UMAP
from sklearn.random_projection import johnson_lindenstrauss_min_dim, SparseRandomProjection
import phate
from bertopic.dimensionality import BaseDimensionalityReduction

# Topic modeling
from bertopic import BERTopic

# Graph-based clustering
from hdbscan import HDBSCAN

# Diffusion condensation
from diffusion_condensation import DiffusionCondensation as dc

# NLP & Transformers
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

# Parallel processing
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# Evaluation metrics
from fowlkes_mallows import FowlkesMallows

# Visualization
import plotly.express as px
import plotly.io as pio

# Utility
import importlib
import warnings
np.random.seed(42) 
warnings.filterwarnings("ignore")

importlib.reload(phate)


<module 'phate' from '/Users/spencer.dork/.local/lib/python3.12/site-packages/phate/__init__.py'>

In [213]:
import openai
from openai import OpenAI
with open("key.txt", "r") as file:
    key = file.read().strip() 
openai.api_key=key
client = OpenAI(api_key=key)

#### Load Data

In [480]:
theme ="Offshore energy impacts on fisheries"
t=1.0
max_sub = 5
depth = 3
synonyms= 0
branching = 'random'
# Data Generation/Generated Data

In [484]:
filename=f'Data Generation/Generated Data/{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_{branching}.csv'
print(f"Waiting for {filename} to be created...")

while not os.path.exists(filename):
    time.sleep(1)  # Check every second

print(f"{filename} detected! Reading file...")

Waiting for Data Generation/Generated Data/Offshore energy impacts on fisheries_hierarchy_t1.0_maxsub5_depth3_synonyms0_random.csv to be created...
Data Generation/Generated Data/Offshore energy impacts on fisheries_hierarchy_t1.0_maxsub5_depth3_synonyms0_random.csv detected! Reading file...


In [485]:
topic_data = pd.read_csv(filename)
num_seed_topics = len(topic_data['category 0'].unique())

#### Embed Topics

In [486]:
embedding_model = "text-embedding-ada-002"

def get_gpt_embeddings(texts):
    """Fetches embeddings from OpenAI API in batches."""
    batch_size = 100  # OpenAI API can process multiple inputs at once
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Fetching embeddings", unit="batch"):
        batch = texts[i : i + batch_size]
        
        # Fetch embeddings for the batch
        response = client.embeddings.create(input=batch, model=embedding_model)
        
        # Inspect the response structure
        # print(response)  # Add this line to print out the response and examine it
        
        # If `response.data` is a list of 'Embedding' objects, we need to extract the embedding correctly
        batch_embeddings = [entry.embedding for entry in response.data]  # Access 'embedding' directly from the object
        embeddings.extend(batch_embeddings)

    return embeddings

In [487]:
os.makedirs(f'GPT_embeddings', exist_ok=True)
embed_file=f'GPT_embeddings/{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_gpt_embed.npy'
if not os.path.exists(embed_file):
    embedding_list = get_gpt_embeddings(topic_data['topic'])
    np.save(embed_file,embedding_list)
else:
    embedding_list = np.load(embed_file)

#### Shuffle data

In [488]:
shuffle_idx = np.random.RandomState(seed=42).permutation(len(topic_data))

# Shuffle both documents and embeddings using the same index
topic_data = topic_data.iloc[shuffle_idx].reset_index(drop=True)
data = np.array(embedding_list)[shuffle_idx]
reverse_idx = np.argsort(shuffle_idx)

#### Convert Topic Data to Topic Label Dicitonary

*Easier Clustering Analysis*

In [489]:
topic_dict = {}
for col in topic_data.columns:
    if re.match(r'^category \d+$', col): 
        unique_count = len(topic_data[col].unique())
        topic_dict[unique_count] = np.array(topic_data[col])

#### Run Bertopic

In [490]:
reduction_methods = ['None','UMAP', 'PCA', 'PHATE','BASE-PHATE'] # see if reproduce with phate
cluster_methods = ['Diffusion Condensation','HDBSCAN', 'Agglomerative']

# Grid search parameters
umap_params = {'n_components': [100,300]} 
pca_params = {'n_components': [100, 300]} 
hdbscan_params = {'cluster_selection_epsilon': [0,.001,.00001], 'cluster_selection_method': ['eom', 'leaf']}
agg_params = {'linkage': ['ward', 'average']}
phate_params = {'n_components': [100,300], 'decay': [20,30], 't':['auto',7,11]}
base_phate_params = {'t':['auto',7,11]}
diffusion_params  = {'k':[3,4],"t":[2,8],'alpha':[2,4],'bandwidth_norm':['max','l2']}

# List to store parameter combinations
param_list = []

for reduction_method in reduction_methods:
    # Skip PHATE and None when using Diffusion Condensation
    for cluster_method in cluster_methods:
        # if cluster_method == 'Diffusion Condensation' and reduction_method not in ['UMAP', 'PCA','PHATE']:
        #     continue  # Skip invalid combinations

        param_grid = (
            umap_params if reduction_method == 'UMAP' else
            pca_params if reduction_method == 'PCA' else
            phate_params if reduction_method == 'PHATE' else
            base_phate_params if reduction_method == "BASE-PHATE" else
            {}
        )
        cluster_param_grid = (
            hdbscan_params if cluster_method == 'HDBSCAN' else
            agg_params if cluster_method == 'Agglomerative' else
            diffusion_params
        )
        for reduction_params in ParameterGrid(param_grid) if param_grid else [{}]:
            for cluster_params in ParameterGrid(cluster_param_grid):
                param_list.append((reduction_method, cluster_method, reduction_params, cluster_params))


In [491]:

# Function to run BERTopic with given parameters
def compare_dicts(dict_str1, dict_str2):
    try:
        dict1 = ast.literal_eval(dict_str1)  # Convert string to dictionary
        dict2 = ast.literal_eval(dict_str2)  # Convert string to dictionary
        return dict1 == dict2  # Compare dictionaries
    except:
        return False
def run_bertopic(reduction_method, cluster_method, reduction_params, cluster_params):
    results = []
    bertopic_file = f'bertopic_results/results_all_methods_{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_{branching}.csv'
    if os.path.exists(bertopic_file):
        check_df = pd.read_csv(bertopic_file)
        row_exists = ((check_df['reduction_method'].astype(str) == str(reduction_method)) &
                        (check_df['cluster_method'].astype(str) == str(cluster_method)) &
                        check_df['reduction_params'].apply(lambda x: compare_dicts(x, str(reduction_params))) &  # Compare reduction params
                        check_df['cluster_params'].apply(lambda x: compare_dicts(x, str(cluster_params)))  # Compare cluster params
                    ).any()
    if row_exists:
        return

    # Set up the reducer
    if reduction_method == 'UMAP':
        reducer = UMAP(random_state=42,n_neighbors=5, **reduction_params)
    elif reduction_method == 'PCA':
        reducer = PCA(random_state=42, **reduction_params)
    elif reduction_method == 'PHATE':
        reducer = phate.PHATE(n_jobs=-2,random_state=42,n_pca=None, **reduction_params)
    elif reduction_method == "BASE-PHATE":
        p_base = phate.PHATE(n_jobs=-2,random_state=42,n_pca=None, **reduction_params)
        p_base.fit(data)
        reducer = BaseDimensionalityReduction()
    else:
        reducer = BaseDimensionalityReduction()

    for i in topic_dict.keys():
        if cluster_method == 'HDBSCAN':
            cluster_model = HDBSCAN(**cluster_params)
        elif cluster_method =='Diffusion Condensation':
            cluster_model = dc(min_clusters=i,max_iterations=5000,**cluster_params)
        else:
            cluster_model = AgglomerativeClustering(n_clusters=i, **cluster_params)

        topic_model = BERTopic(hdbscan_model=cluster_model, umap_model=reducer)

        if reduction_method != "BASE-PHATE":
            topics, _ = topic_model.fit_transform(documents=topic_data['topic'], embeddings=data)
        else:
            embed_phate = p_base.diff_potential
            topics, _ = topic_model.fit_transform(documents=topic_data['topic'], embeddings=embed_phate)



        # Extract HDBSCAN hierarchy and find the closest cluster level
        if cluster_method == 'HDBSCAN':
            Z = cluster_model.single_linkage_tree_.to_numpy()
            labels = fcluster(Z, i, criterion='maxclust')

            max_label= labels.max()
            labels[labels == -1] = max_label + 1
        else:
            labels = np.array(topics)

        # Evaluate Fowlkes-Mallows score
        NA_mask = pd.isna(topic_dict[i])
        target_lst = topic_dict[i][~NA_mask]
        bertopic_lst = labels[~NA_mask]

        fm = FowlkesMallows.Bk({i: target_lst}, {i: bertopic_lst})
        rand = rand_score(target_lst,bertopic_lst)
        ari = adjusted_rand_score(target_lst,bertopic_lst)

        if reduction_method == 'UMAP':
            embed=topic_model.umap_model.embedding_
        elif reduction_method == 'PCA':
            embed = topic_model.umap_model.components_
        elif reduction_method == 'PHATE':
            embed = topic_model.umap_model.embedding
        elif reduction_method == "BASE-PHATE":
            embed = embed_phate
        else:
            embed=None
            

        result = {
            'reduction_method': reduction_method,
            'cluster_method': cluster_method,
            'level': i,
            'reduction_params': reduction_params,
            'cluster_params': cluster_params,
            'FM': fm[i]['FM'],
            'E_FM': fm[i]['E_FM'],
            'V_FM': fm[i]['V_FM'],
            'Rand': rand,
            'ARI': ari,
            'topics': topics,
            'idx': shuffle_idx,
            'embed': embed
        }
        results.append(result)

    return results


In [492]:

with tqdm_joblib(tqdm(desc="Processing", total=len(param_list))) as progress_bar:
    with Parallel(n_jobs=-2, backend="loky") as parallel:
        all_results = parallel(delayed(run_bertopic)(*params) for params in param_list)


  0%|          | 0/480 [00:00<?, ?it/s]

Running PHATE on 755 observations and 1536 variables.
Calculating graph and diffusion operator...
  Calculating KNN search...
Running PHATE on 755 observations and 1536 variables.
Calculating graph and diffusion operator...
  Calculating KNN search...
Running PHATE on 755 observations and 1536 variables.
Calculating graph and diffusion operator...
  Calculating KNN search...
  Calculated KNN search in 0.28 seconds.
  Calculating affinities...
  Calculated KNN search in 0.27 seconds.
  Calculating affinities...
  Calculated KNN search in 0.29 seconds.
  Calculating affinities...
  Calculated affinities in 0.09 seconds.
Calculated graph and diffusion operator in 0.37 seconds.
Calculating optimal t...
Running PHATE on 755 observations and 1536 variables.
Calculating graph and diffusion operator...
  Calculating KNN search...
  Calculated affinities in 0.09 seconds.
Calculated graph and diffusion operator in 0.38 seconds.
Calculating optimal t...
  Automatically selected t = 37
Calculated 

In [None]:
if all_results is not None and isinstance(all_results, list):
    bertopic_results = []
    for sublist in all_results:
        if isinstance(sublist, list):  # Ensure the sublist is a list
            for item in sublist:
                if isinstance(item, dict):  # Ensure item is a dictionary
                    bertopic_results.append(item)
                else:
                    print(f"Skipping non-dict item: {item}")
        else:
            print(f"Skipping non-list sublist: {sublist}")
else:
    bertopic_results = []
    print("Warning: all_results is None or not a list.")

# Convert to DataFrame and save
bertopic_result_df = pd.DataFrame(bertopic_results)


Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping non-list sublist: None
Skipping

360.0

In [None]:
os.makedirs(f'bertopic_results', exist_ok=True)

bertopic_file = f'bertopic_results/results_all_methods_{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_{branching}.csv'
# Flatten results
write_header = not os.path.exists(bertopic_file)

# Append to the existing file
bertopic_result_df.to_csv(bertopic_file, mode='a', index=False, header=write_header)

In [416]:
# folder_path = "/Users/spencer.dork/Documents/Hierarchical Clustering/Evaluations/bertopic_results"

# # Iterate over all CSV files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".csv"):
#         file_path = os.path.join(folder_path, filename)
        
#         # Read the CSV file
#         df = pd.read_csv(file_path)
        
#         # Check if 'Cluster Method' exists in the DataFrame
#         if "cluster_method" in df.columns:
#             # Iterate over the rows and columns
#             for index, row in df.iterrows():
#                 if row["cluster_method"] == "Diffusion Condensation" and isinstance(row['cluster_params'], str) and row['cluster_params'].startswith("{") and row['cluster_params'].endswith("}"):
#                     # Add the 'bandwidth_norm': 'max' to the string representation of the dictionary
#                     new_value = row['cluster_params'].rstrip("}") + ", 'bandwidth_norm': 'max'}"
#                     df.at[index, 'cluster_params'] = new_value
            
#             # Save the modified DataFrame back to CSV
#             df.to_csv(file_path, index=False)
#             print(f"Updated: {filename}")

Updated: results_all_methods_Offshore energy impacts on fisheries_hierarchy_t1.0_maxsub5_depth3_synonyms10_random.csv
Updated: results_all_methods_Energy, Ecosystems, and Humans_hierarchy_t1.0_maxsub5_depth3_synonyms0_random.csv
Updated: results_all_methods_Energy, Ecosystems, and Humans_hierarchy_t1.0_maxsub3_depth5_synonyms0_random.csv
Updated: results_all_methods_Energy, Ecosystems, and Humans_hierarchy_t1.0_maxsub5_depth3_synonyms10_random.csv
Updated: results_all_methods_West Java, Indonesia_hierarchy_t1.0_maxsub5_depth3_synonyms0_random.csv
Updated: results_all_methods_West Java, Indonesia_hierarchy_t1.0_maxsub3_depth5_synonyms0_random.csv
Updated: processed_results_synonyms10.csv
Updated: results_all_methods_West Java, Indonesia_hierarchy_t1.0_maxsub5_depth3_synonyms10_random.csv
Updated: results_all_methods_Offshore energy impacts on fisheries_hierarchy_t1.0_maxsub5_depth3_synonyms0_random.csv
Updated: results_all_methods_Offshore energy impacts on fisheries_hierarchy_t1.0_maxs

{'max:': 'max'}
