In [None]:
# --- Imports and API Key Setup ---

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import json
import openai
from openai import OpenAI
import time

import os

# Read the key from the OPENAI_API_KEY environment variable so a real secret never
# has to be typed into (and saved with) the notebook. The original placeholder is
# kept as the fallback so the in-place editing workflow still works.
openai_api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_API_KEY_HERE"

client = OpenAI(api_key=openai_api_key)

if openai_api_key == "YOUR_API_KEY_HERE":
    # Every later API call would fail with an authentication error; say so up front.
    print("Warning: OPENAI_API_KEY is not set; the client is using the placeholder key.")

print("Libraries imported and OpenAI key set.")

In [None]:
# --- Load Data ---

# Input: a semicolon-separated CSV holding at least the two columns listed below.
file_path = "CSV_FILE_PATH_HERE"
gesture_columns = ['gesture_name', 'gesture_definition']

try:
    # Keep only the two relevant columns, drop rows where either one is missing,
    # and renumber the remaining rows from 0.
    df = (
        pd.read_csv(file_path, sep=";", quotechar='"', encoding="utf-8-sig")
        [gesture_columns]
        .dropna(subset=gesture_columns)
        .reset_index(drop=True)
    )
    print(f"Successfully loaded {len(df)} gestures from '{file_path}'.")
except FileNotFoundError:
    df = None
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    # Any other failure (parse error, missing column, ...) also leaves df as None.
    df = None
    print(f"An error occurred: {e}")

In [None]:
# --- Generate Sentence Embeddings ---

# Bind X unconditionally: the deduplication cell tests `X is not None`, which
# raised NameError whenever this cell took the "data was not loaded" branch.
X = None

if df is not None:
    print("Initializing sentence transformer model ('all-mpnet-base-v2')...")
    model = SentenceTransformer('all-mpnet-base-v2')

    print("Generating embeddings for all definitions. This may take a moment...")
    try:
        # One definition per gesture, in DataFrame row order, so that row i of X
        # corresponds to df.iloc[i]. Coerced to str in case a column was parsed
        # as a non-string dtype.
        definitions = df['gesture_definition'].astype(str).tolist()

        # Generate embeddings
        X = model.encode(definitions, show_progress_bar=True)

        print(f"Embeddings generated successfully. Shape: {X.shape}")

    except Exception as e:
        print(f"Error generating embeddings: {e}")
        X = None
else:
    print("Skipping embedding generation as data was not loaded.")

In [None]:
# --- Deduplication ---

def greedy_duplicate_groups(similarity_matrix, threshold):
    """Greedily partition items into groups of near-duplicates.

    Items are visited in index order. Every item that has not already been
    absorbed by an earlier group becomes the representative of a new group and
    absorbs all still-unassigned items whose similarity to it is at least
    ``threshold``.

    Args:
        similarity_matrix: square (n, n) array-like of pairwise similarities.
        threshold: minimum similarity for an item to be merged into a group.

    Returns:
        A list of ``(representative_index, synonym_indices)`` tuples of plain
        ints, ordered by representative index. Each index in ``range(n)``
        appears exactly once, either as a representative or as a synonym.
    """
    similarity_matrix = np.asarray(similarity_matrix)
    n_items = similarity_matrix.shape[0]
    assigned = np.zeros(n_items, dtype=bool)
    groups = []

    for i in range(n_items):
        if assigned[i]:
            continue

        # The current item is always its own representative. Marking it first
        # keeps it out of its own synonym list and avoids relying on the
        # diagonal of the matrix being above the threshold.
        assigned[i] = True

        # Only items that are still free may be absorbed. Similarity is not
        # transitive, so without this mask an item merged into an earlier group
        # could be picked again (as a synonym, or even as the representative).
        absorbed = np.flatnonzero((similarity_matrix[i] >= threshold) & ~assigned)
        assigned[absorbed] = True

        groups.append((i, [int(j) for j in absorbed]))

    return groups


if 'X' in locals() and X is not None:
    print("Performing deduplication...")

    # Calculate cosine similarity matrix
    similarity_matrix = cosine_similarity(X)

    # Set a high threshold for synonyms
    SIMILARITY_THRESHOLD = 0.90

    representatives = []
    for rep_index, synonym_indices in greedy_duplicate_groups(similarity_matrix, SIMILARITY_THRESHOLD):
        rep_row = df.iloc[rep_index]
        representatives.append({
            'representative_name': rep_row['gesture_name'],
            'representative_definition': rep_row['gesture_definition'],
            # Names of all gestures merged into this representative
            'synonyms': [df.iloc[j]['gesture_name'] for j in synonym_indices],
            'original_index': rep_index  # Row of X holding this gesture's embedding
        })

    # Create the new deduplicated dataframe
    dedup_df = pd.DataFrame(representatives)

    # Create the new embedding matrix 'X_dedup' by selecting only the embeddings of the representative gestures
    representative_indices = dedup_df['original_index'].tolist()
    X_dedup = X[representative_indices, :]

    print("Deduplication complete.")
    print(f"Original gesture count: {len(df)}")
    print(f"Deduplicated gesture count: {len(dedup_df)}")

else:
    print("Skipping deduplication as embeddings were not generated.")

In [None]:
# --- Real LLM Helper Functions ---

# --- Function 1: Get Cluster Labels ---
def get_cluster_labels(cluster_gestures: list) -> dict:
    """
    Ask the OpenAI chat API (v1.x client syntax) for a label and definition.

    Args:
        cluster_gestures: names of the gestures in one cluster. Items are
            converted with ``str()`` before being placed in the prompt.

    Returns:
        The JSON object parsed from the model's reply; the prompt asks for the
        keys ``"label"`` and ``"definition"``. If the API call or the parsing
        fails, a dict with ``"label": "Error - API Call Failed"`` and the error
        text under ``"definition"`` is returned instead of raising.
    """
    # str() keeps the join from raising TypeError on non-string names
    # (e.g. a name column that pandas parsed as numeric).
    gesture_list_str = "\n- ".join(str(name) for name in cluster_gestures)

    # The examples are two separate, valid JSON objects: the request asks for
    # strict JSON, so the examples must themselves parse.
    prompt = f"""
    The following is a cluster of surgical gestures:
    - {gesture_list_str}

    Your task is to act as an expert in robotic/minimally-invasive surgery. Analyze this list and
    propose a concise taxonomic label and a short, clear definition
    for this group of actions. Assume a hirarchical taxonomy in which some labels contain sub-gestures.

    Return your answer in a strict JSON format.
    Examples (each is a complete, valid response; return exactly one such object):
    {{
      "label": "Stapling",
      "definition": "Using a mechanical device to join or separate tissues."
    }}
    {{
      "label": "Clip",
      "definition": "A clip applier is activated to deploy one or more clips across a vessel or structure to occlude it."
    }}

    JSON response:
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            response_format={ "type": "json_object" }, # Use new JSON mode
            messages=[
                {"role": "system", "content": "You are a helpful assistant that returns JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
        )

        # --- RESPONSE PARSING ---
        result = json.loads(response.choices[0].message.content)
        if not isinstance(result, dict):
            # Callers use result.get(...); anything but an object is unusable.
            raise ValueError(f"Expected a JSON object, got {type(result).__name__}")
        return result

    except Exception as e:
        # Best-effort: report the failure and hand back a placeholder so the
        # calling loop can continue with the remaining clusters.
        print(f"Error in get_cluster_labels: {e}")
        return {"label": "Error - API Call Failed", "definition": str(e)}

# --- Function 2: Check Cluster Homogeneity (with Scoring) ---
def check_cluster_homogeneity(cluster_gestures: list) -> dict:
    """
    Ask the OpenAI chat API (v1.x client syntax) to rate a cluster's consistency.

    Args:
        cluster_gestures: names of the gestures in one cluster. Items are
            converted with ``str()`` before being placed in the prompt.

    Returns:
        The JSON object parsed from the model's reply; the prompt asks for the
        keys ``"consistency_score"`` (1-10) and ``"reason"``. If the API call
        or the parsing fails, ``{"consistency_score": 1, "reason": ...}`` is
        returned instead of raising.
    """
    # str() keeps the join from raising TypeError on non-string names
    # (e.g. a name column that pandas parsed as numeric).
    gesture_list_str = "\n- ".join(str(name) for name in cluster_gestures)

    prompt = f"""
    You are expert in robotic/minimally-invasive surgery, and is creating the
    taxonomy for robotic surgical gestures. On a scale of 1 to 10,
    where 1 is a completely mixed bag of random concepts and 10 is a
    perfectly consistent set of similar/hirarchical terms, please rate the semantic
    consistency of this gesture cluster:

    - {gesture_list_str}

    A "heterogeneous" cluster (score < 5) might mix actions like
    "Clip" with "Camera Move". A "homogeneous" cluster (score >= 5)
    would contain only variations of a single concept, like
    "Dissection", "Dissection - Sharp", "Dissection - Electrosurgery".

    Return a strict JSON object with your score and a brief reason.
    Example:
    {{
      "consistency_score": 8,
      "reason": "All gestures relate to cutting tissue."
    }}

    JSON response:
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            response_format={ "type": "json_object" },
            messages=[
                {"role": "system", "content": "You are a helpful assistant that returns JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.5,
        )

        # --- RESPONSE PARSING ---
        result = json.loads(response.choices[0].message.content)
        if not isinstance(result, dict):
            # The documented return type is a dict; reject any other JSON value.
            raise ValueError(f"Expected a JSON object, got {type(result).__name__}")
        return result

    except Exception as e:
        # Best-effort: a failed call is reported as the lowest possible score.
        print(f"Error in check_cluster_homogeneity: {e}")
        return {"consistency_score": 1, "reason": f"API Call Failed: {e}"}

# Confirmation message only: the two helpers above are defined, not called, by this cell.
print("Real LLM helper functions (v1.0.0 syntax) defined.")

In [None]:
# --- Agglomerative Clustering Strategy ---

from sklearn.cluster import AgglomerativeClustering
import time

# Bound unconditionally so the display cell can test `final_taxonomy` (and report
# `num_clusters_found`) even when this cell has nothing to cluster; previously
# both names only existed inside the guarded branch.
final_taxonomy = []
num_clusters_found = 0

if 'X_dedup' in locals() and 'dedup_df' in locals():
    print("--- Starting Agglomerative Clustering Strategy ---")

    # --- TUNING PARAMETERS ---
    # The maximum cosine *distance* (1 - cosine similarity) allowed when merging.
    # 0.5 = ~50% similarity
    # 0.4 = ~60% similarity
    # 0.3 = ~70% similarity (Where we started)
    DISTANCE_THRESHOLD = 0.4
    # --- --- --- --- --- --- --- --- ---

    df_to_cluster = dedup_df.copy()
    X_to_cluster = X_dedup

    # 1. Initialize and run Agglomerative Clustering
    # n_clusters=None lets distance_threshold decide how many clusters are formed.
    print(f"Running Agglomerative Clustering with distance_threshold={DISTANCE_THRESHOLD}...")

    agg_cluster = AgglomerativeClustering(
        n_clusters=None,
        metric='cosine',
        linkage='average',
        distance_threshold=DISTANCE_THRESHOLD
    )

    # Fit and get the cluster labels. Fewer than two gestures need no fitting;
    # they are placed in a single cluster 0 instead of calling fit_predict.
    if len(df_to_cluster) >= 2:
        df_to_cluster['cluster'] = agg_cluster.fit_predict(X_to_cluster)
    else:
        df_to_cluster['cluster'] = 0

    num_clusters_found = df_to_cluster['cluster'].nunique()
    print(f"Clustering complete. Found {num_clusters_found} clusters.")

    # 2. Loop through the found clusters and get LLM labels

    print("\nProceeding to final labeling...")

    for i in range(num_clusters_found):
        gestures_in_cluster_df = df_to_cluster[df_to_cluster['cluster'] == i]
        gestures_list = gestures_in_cluster_df['representative_name'].tolist()

        if not gestures_list:
            continue

        # 3. Get final label and definition from LLM
        print(f"  Labeling final cluster {i} (size={len(gestures_list)})...")
        time.sleep(1)  # Sleep to avoid hitting rate limits

        labels = get_cluster_labels(gestures_list)

        final_taxonomy.append({
            'cluster_id': i,
            'llm_label': labels.get('label', 'Error'),
            'llm_definition': labels.get('definition', 'Error'),
            'gestures': gestures_list,
            # Flatten the per-representative synonym lists into one list
            'synonyms_included': [s for syn_list in gestures_in_cluster_df['synonyms'] for s in syn_list]
        })

    print("All clusters labeled.")

else:
    print("Skipping main loop because data or embeddings are missing.")

In [None]:
# --- Display Final Taxonomy ---

if not final_taxonomy:
    print("\nNo final taxonomy was generated.")
else:
    print("\n--- Process Complete ---")
    print(f"Generated {len(final_taxonomy)} final clusters (K={num_clusters_found}).\n")

    # Console summary: one framed section per cluster, previewing at most five
    # representative gestures and five merged synonyms.
    divider = "=================================================="
    for entry in final_taxonomy:
        gesture_names = entry['gestures']
        merged_synonyms = entry['synonyms_included']

        print(divider)
        print(f"Cluster ID: {entry['cluster_id']}")
        print(f"LLM Label: {entry['llm_label']}")
        print(f"LLM Definition: {entry['llm_definition']}")
        print(f"Representative Gestures ({len(gesture_names)}):")
        for name in gesture_names[:5]:
            print(f"  - {name}")
        hidden_count = len(gesture_names) - 5
        if hidden_count > 0:
            print(f"  ... and {hidden_count} more.")

        if merged_synonyms:
            print(f"Synonyms Included ({len(merged_synonyms)}):")
            print(f"  {', '.join(merged_synonyms[:5])}...")
        print(divider)

    # Persist the full taxonomy; a write failure is reported, not raised.
    try:
        with open("final_taxonomy.json", "w") as out_file:
            json.dump(final_taxonomy, out_file, indent=2)
        print("\nSuccessfully saved final taxonomy to 'final_taxonomy.json'")
    except Exception as e:
        print(f"\nCould not save taxonomy to file: {e}")

In [None]:
# --- Load and Inspect the Final Taxonomy ---
import json
import pandas as pd

try:
    # Read the taxonomy back from disk instead of relying on in-memory state.
    with open("final_taxonomy.json", "r") as taxonomy_file:
        final_taxonomy_data = json.loads(taxonomy_file.read())

    # One row per cluster, for easy viewing
    taxonomy_df = pd.DataFrame(final_taxonomy_data)

    # Show the key columns of the first ten clusters
    overview_columns = ['cluster_id', 'llm_label', 'llm_definition', 'gestures']
    print(f"Taxonomy loaded. Found {len(taxonomy_df)} clusters.")
    print("\nOverview of Clusters:")
    print(taxonomy_df[overview_columns].head(10))

except FileNotFoundError:
    print("Error: 'final_taxonomy.json' not found. Ensure Cell 6 ran successfully.")

In [None]:
# --- Generate Clustermap for Visual Validation ---
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances

from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform


def average_linkage_from_distances(dist_matrix):
    """Build an average-linkage hierarchy from a precomputed distance matrix.

    SciPy's ``linkage`` interprets a 2-D array as observation vectors, not as
    distances, so the square matrix is converted to condensed form first.

    Args:
        dist_matrix: square (n, n) array-like of pairwise distances.

    Returns:
        The SciPy linkage matrix for the n items.
    """
    square = np.asarray(dist_matrix, dtype=float)
    # Remove floating-point noise: exact symmetry and an exactly-zero diagonal.
    # The addition creates a new array, so the caller's matrix is not modified.
    square = (square + square.T) / 2.0
    np.fill_diagonal(square, 0.0)
    return linkage(squareform(square, checks=False), method='average')


if 'dedup_df' in locals() and 'X_dedup' in locals():
    # Use the deduplicated data and embeddings
    labels = dedup_df['representative_name']
    X_embeddings = X_dedup

    # 1. Pairwise cosine distances between the gesture embeddings
    dist_matrix = cosine_distances(X_embeddings)

    # 2. Create a DataFrame for seaborn with gesture labels
    df_dist = pd.DataFrame(dist_matrix, index=labels, columns=labels)

    # 3. Hierarchy computed from the distances themselves. Passing df_dist to
    #    clustermap with metric='cosine' would make seaborn compute cosine
    #    distances between the *rows of the distance matrix*, which is a
    #    different clustering from the one run on the embeddings.
    gesture_linkage = average_linkage_from_distances(dist_matrix)

    # 4. Plot clustermap (the heatmap shows the cosine distances, reordered)
    sns.set(font_scale=0.4)

    g = sns.clustermap(
        df_dist,
        row_linkage=gesture_linkage,
        col_linkage=gesture_linkage,
        cmap='viridis',
        figsize=(20, 20),
        row_cluster=True,
        col_cluster=True,
        xticklabels=True,
        yticklabels=True
    )
    plt.suptitle("Hierarchical Clustering (Linkage: Average, Metric: Cosine)", y=1.02, fontsize=16)
    plt.show()

else:
    print("Skipping clustermap as deduplicated data ('dedup_df' / 'X_dedup') is missing.")

In [None]:
# --- Prepare and Save a Final CSV Report ---

# Flatten the taxonomy to one row per (cluster, representative gesture). The
# cluster-level fields, including the joined synonym list, repeat on every row
# that belongs to the same cluster.
report_rows = [
    {
        'Cluster_ID': row['cluster_id'],
        'Taxonomic_Label': row['llm_label'],
        'Taxonomic_Definition': row['llm_definition'],
        'Representative_Gesture_Name': gesture,
        'Original_Synonyms_Merged': ', '.join(row['synonyms_included']),
    }
    for _, row in taxonomy_df.iterrows()
    for gesture in row['gestures']
]

final_report_df = pd.DataFrame(report_rows)

# Write the structured report to disk, then preview it
final_report_df.to_csv('final_gesture_taxonomy_report.csv', index=False)
print("Final taxonomy report saved to 'final_gesture_taxonomy_report.csv'")
print("\nFirst 5 rows of the report:")
print(final_report_df.head())

In [None]:
# --- Convert the CSV report to XLSX ---
csv_file_path = "CSV_FILE_PATH_HERE"  # Source CSV (the report written by the previous cell)
xlsx_file_path = "XLSX_FILE_PATH_HERE"  # Destination workbook

try:
    # Load the CSV and write it straight back out as an Excel sheet, without the index column
    pd.read_csv(csv_file_path).to_excel(xlsx_file_path, index=False)
    print(f"Successfully converted '{csv_file_path}' to '{xlsx_file_path}'")
except FileNotFoundError:
    print(f"Error: The file '{csv_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- Generate Dendrogram ---
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

if 'df_dist' in locals():
    from scipy.spatial.distance import squareform

    print("Generating dendrogram...")

    # df_dist already holds pairwise cosine distances. SciPy's linkage treats a
    # 2-D input as observation vectors, so passing the square matrix with
    # metric='cosine' would cluster on distances between rows of the distance
    # matrix. Convert it to condensed form and link on the distances themselves.
    condensed_distances = squareform(df_dist.to_numpy(), checks=False)
    linked = linkage(condensed_distances, method='average')

    plt.figure(figsize=(20, 10))
    dendrogram(linked,
               orientation='top',
               labels=df_dist.index.tolist(),
               distance_sort='descending',
               show_leaf_counts=True)
    plt.title('Dendrogram of Gesture Clustering (Average Linkage, Cosine Distance)')
    plt.ylabel('Cosine Distance')
    plt.xlabel('Gestures')
    plt.tight_layout()
    plt.show()

else:
    print("Skipping dendrogram generation as distance matrix 'df_dist' is missing.")

In [None]:
# Plot Color-Coded Dendrogram #
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

from scipy.spatial.distance import squareform

try:
    with open("final_taxonomy.json", "r") as f:
        final_taxonomy = json.load(f)
    print("Loaded final_taxonomy from JSON.")
except Exception as e:
    print(f"Failed to load final_taxonomy: {e}")
    final_taxonomy = None

# Make sure all required objects are present. `final_taxonomy` is always bound
# at this point (to None after a failed load), so it is tested for None rather
# than for membership in locals().
if 'df_dist' in locals() and final_taxonomy is not None:
    print("Generating dendrogram with LLM labels and colors...")

    # Step 1: Build mapping dictionaries (gesture name -> cluster id / LLM label)
    gesture_to_cluster_id = {}
    gesture_to_llm_label = {}

    for cluster in final_taxonomy:
        cluster_id = cluster['cluster_id']
        llm_label = cluster['llm_label']
        for gesture in cluster['gestures']:
            gesture_to_cluster_id[gesture] = cluster_id
            gesture_to_llm_label[gesture] = llm_label

    # Step 2: Create labels and cluster IDs for each gesture in df_dist
    labels = []
    cluster_ids = []

    missing_gestures = []

    for gesture in df_dist.index.tolist():
        # -1 / "Unknown" mark gestures that are absent from the taxonomy file
        cluster_id = gesture_to_cluster_id.get(gesture, -1)
        llm_label = gesture_to_llm_label.get(gesture, "Unknown")
        labels.append(f"{gesture} ({llm_label})")
        cluster_ids.append(cluster_id)
        if cluster_id == -1:
            missing_gestures.append(gesture)

    if missing_gestures:
        print(f"Warning: {len(missing_gestures)} gestures not found in taxonomy.")
        print("Example missing gesture(s):", missing_gestures[:5])

    # Step 3: Map cluster_id to colors.
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which recent Matplotlib removed.
    unique_clusters = sorted(set(cluster_ids))
    cmap = plt.get_cmap('tab20', len(unique_clusters))
    cluster_color_map = {cid: cmap(i) for i, cid in enumerate(unique_clusters)}
    label_colors = [cluster_color_map[cid] for cid in cluster_ids]

    # Step 4: Generate dendrogram.
    # df_dist already holds pairwise cosine distances; linkage must receive them
    # in condensed form, otherwise it treats each row of the square matrix as an
    # observation vector and computes a second layer of distances.
    condensed_distances = squareform(df_dist.to_numpy(), checks=False)
    linked = linkage(condensed_distances, method='average')

    plt.figure(figsize=(22, 10))
    dendro = dendrogram(linked,
                        orientation='top',
                        labels=labels,
                        leaf_font_size=10,
                        leaf_rotation=90,
                        distance_sort='descending',
                        show_leaf_counts=False)

    # Step 5: Color the leaf labels manually.
    # Tick labels follow the plotted leaf order, so the colors are reordered
    # with dendro['leaves'] before being paired with them.
    ax = plt.gca()
    xlbls = ax.get_xmajorticklabels()

    for lbl, color in zip(xlbls, [label_colors[i] for i in dendro['leaves']]):
        lbl.set_color(color)

    plt.title('Dendrogram of Gestures with LLM Cluster Labels')
    plt.ylabel('Cosine Distance')
    plt.xlabel('Gesture Names (Colored by Cluster)')
    plt.tight_layout()
    plt.show()

else:
    print("Missing required data for dendrogram generation (df_dist or final_taxonomy).")
