In [2]:
# Example set of JSON input files that the analysis cells below load and cluster.
file_paths = [
    './llama_2000_full.json',
    './llama3_1000_full.json',
    './generated_data_3.5_1000.json',
    './generated_data_4o.json',
]

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

def load_files(file_paths):
    """Read several JSON files and concatenate their top-level arrays.

    Args:
        file_paths: Iterable of paths; each file is expected to contain a
            JSON array of records.

    Returns:
        One list holding the elements of every file, in the order the paths
        were given.
    """
    data = []
    for file_path in file_paths:
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform's locale encoding, which differs between machines.
        with open(file_path, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return data

def extract_embeddings_and_configs(data):
    """Split loaded records into an embedding array and a list of configurations.

    Args:
        data: Iterable of dicts, each with an 'embedding' and a
            'configuration' key.

    Returns:
        Tuple ``(embeddings, configs)``: a NumPy array built from the
        'embedding' values and a list of the 'configuration' values, both in
        input order.
    """
    # Single pass over the records so both outputs stay aligned.
    pairs = [(record['embedding'], record['configuration']) for record in data]
    embedding_rows = [embedding for embedding, _ in pairs]
    configurations = [configuration for _, configuration in pairs]
    return np.array(embedding_rows), configurations

def perform_clustering(embeddings, n_clusters):
    """Fit K-Means on the embeddings and return the model with its labels.

    Args:
        embeddings: Feature matrix handed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(model, assignments)``: the fitted ``KMeans`` estimator and
        the cluster index assigned to each row.
    """
    # random_state is pinned to 0 so the seeding of the run is fixed.
    model = KMeans(random_state=0, n_clusters=n_clusters)
    assignments = model.fit_predict(embeddings)
    return model, assignments

def calculate_central_configs(configs, labels, n_clusters):
    """Average the numeric configuration values within each cluster.

    Args:
        configs: List of configuration dicts, one per clustered item.
        labels: Cluster label per item, aligned with ``configs``.
        n_clusters: Not used by the computation; kept in the signature for
            callers that pass it.

    Returns:
        List of dicts (one per distinct label, in the order produced by the
        pandas group-by) mapping each numeric column to its mean.
    """
    frame = pd.DataFrame(configs)
    # Only numeric columns can be averaged; everything else is left out.
    numeric_cols = frame.select_dtypes(include=[np.number]).columns
    per_cluster_means = frame.groupby(labels)[numeric_cols].mean()
    return per_cluster_means.to_dict(orient='records')

def visualize_clusters(embeddings, labels, n_clusters):
    """Plot the embeddings in 2-D PCA space with one scatter series per cluster.

    Args:
        embeddings: Feature matrix to project with a 2-component PCA.
        labels: Cluster label per row; compared element-wise against each
            cluster index, so it is expected to be a NumPy array.
        n_clusters: Cluster indices ``0 .. n_clusters - 1`` are plotted.
    """
    projected = PCA(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    for cluster_id in range(n_clusters):
        # Boolean mask selects the projected rows that belong to this cluster.
        member_xy = projected[labels == cluster_id]
        plt.scatter(member_xy[:, 0], member_xy[:, 1], label=f'Cluster {cluster_id}')

    plt.title('Cluster Visualization')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()

def analyze_clusters(file_paths, n_clusters):
    """Run the full pipeline: load, cluster, report and plot.

    Args:
        file_paths: Paths of the JSON files to load.
        n_clusters: Number of K-Means clusters.

    Returns:
        The per-cluster mean configurations produced by
        ``calculate_central_configs``.
    """
    records = load_files(file_paths)
    embeddings, configs = extract_embeddings_and_configs(records)

    # Only the labels are needed here; the fitted model itself is discarded.
    _, labels = perform_clustering(embeddings, n_clusters)

    central_configs = calculate_central_configs(configs, labels, n_clusters)
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f'Silhouette Score: {silhouette_avg}')
    print('Central Configurations for each cluster:')
    for cluster_idx, cluster_config in enumerate(central_configs):
        print(f'Cluster {cluster_idx}: {cluster_config}')

    visualize_clusters(embeddings, labels, n_clusters)

    return central_configs


# Number of K-Means clusters requested for this run.
n_clusters = 20

# Run the full pipeline (load, cluster, report, plot). `file_paths` is not
# defined in this cell and must already exist in the kernel.
central_configs = analyze_clusters(file_paths, n_clusters)

# Persist the per-cluster mean configurations as indented JSON; mode 'w'
# overwrites any existing central_configs.json.
with open('central_configs.json', 'w') as f:
    json.dump(central_configs, f, indent=4)


In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

def load_files(file_paths):
    """Read several JSON files and concatenate their top-level arrays.

    Args:
        file_paths: Iterable of paths; each file is expected to contain a
            JSON array of records.

    Returns:
        One list holding the elements of every file, in the order the paths
        were given.
    """
    data = []
    for file_path in file_paths:
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform's locale encoding, which differs between machines.
        with open(file_path, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return data

def extract_embeddings_and_configs(data):
    """Split loaded records into an embedding array and a list of configurations.

    Args:
        data: Iterable of dicts, each with an 'embedding' and a
            'configuration' key.

    Returns:
        Tuple ``(embeddings, configs)``: a NumPy array built from the
        'embedding' values and a list of the 'configuration' values, both in
        input order.
    """
    # Single pass over the records so both outputs stay aligned.
    pairs = [(record['embedding'], record['configuration']) for record in data]
    embedding_rows = [embedding for embedding, _ in pairs]
    configurations = [configuration for _, configuration in pairs]
    return np.array(embedding_rows), configurations

def perform_clustering(embeddings, n_clusters):
    """Fit K-Means on the embeddings and return the model with its labels.

    Args:
        embeddings: Feature matrix handed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(model, assignments)``: the fitted ``KMeans`` estimator and
        the cluster index assigned to each row.
    """
    # random_state is pinned to 0 so the seeding of the run is fixed.
    model = KMeans(random_state=0, n_clusters=n_clusters)
    assignments = model.fit_predict(embeddings)
    return model, assignments

def calculate_central_configs(configs, labels, n_clusters):
    """Average the numeric configuration values within each cluster.

    The 'batch_size', 'max_tokens' and 'max_seq_len' parameters are excluded
    from the averages.

    Args:
        configs: List of configuration dicts, one per clustered item.
        labels: Cluster label per item, aligned with ``configs``.
        n_clusters: Not used by the computation; kept in the signature for
            callers that pass it.

    Returns:
        List of dicts (one per distinct label, in the order produced by the
        pandas group-by) mapping each remaining numeric column to its mean.
    """
    config_df = pd.DataFrame(configs)
    # Not every input file carries all three parameters; errors='ignore'
    # skips the absent ones instead of raising KeyError.
    config_df = config_df.drop(
        columns=['batch_size', 'max_tokens', 'max_seq_len'], errors='ignore'
    )
    numeric_columns = config_df.select_dtypes(include=[np.number]).columns
    central_configs = config_df.groupby(labels)[numeric_columns].mean().to_dict(orient='records')
    return central_configs

def visualize_clusters(embeddings, labels, n_clusters):
    """Plot the embeddings in 2-D PCA space with one scatter series per cluster.

    Args:
        embeddings: Feature matrix to project with a 2-component PCA.
        labels: Cluster label per row; compared element-wise against each
            cluster index, so it is expected to be a NumPy array.
        n_clusters: Cluster indices ``0 .. n_clusters - 1`` are plotted.
    """
    projected = PCA(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    for cluster_id in range(n_clusters):
        # Boolean mask selects the projected rows that belong to this cluster.
        member_xy = projected[labels == cluster_id]
        plt.scatter(member_xy[:, 0], member_xy[:, 1], label=f'Cluster {cluster_id}')

    plt.title('Cluster Visualization')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()

def display_cluster_details(data, labels):
    """Print a short summary of every cluster.

    For each cluster index from 0 up to the largest label, prints the first
    two 'response_content' values and the total number of members.

    Args:
        data: Sequence of record dicts with a 'response_content' key.
        labels: Cluster label per record, indexable in step with ``data``.
    """
    cluster_count = max(labels) + 1
    for cluster_id in range(cluster_count):
        responses = []
        for position, record in enumerate(data):
            if labels[position] == cluster_id:
                responses.append(record['response_content'])
        print(f'Cluster {cluster_id}:')
        # Only a two-item preview is shown to keep the output short.
        print('\n Responses:', responses[:2])
        print('\n total Responses:', len(responses))
        print()

def analyze_clusters(file_paths, n_clusters):
    """Run the full pipeline: load, cluster, report, plot and list cluster samples.

    Args:
        file_paths: Paths of the JSON files to load.
        n_clusters: Number of K-Means clusters.

    Returns:
        The per-cluster mean configurations produced by
        ``calculate_central_configs``.
    """
    records = load_files(file_paths)
    embeddings, configs = extract_embeddings_and_configs(records)

    # Only the labels are needed here; the fitted model itself is discarded.
    _, labels = perform_clustering(embeddings, n_clusters)

    central_configs = calculate_central_configs(configs, labels, n_clusters)
    silhouette_avg = silhouette_score(embeddings, labels)

    print(f'Silhouette Score: {silhouette_avg}')
    print('Central Configurations for each cluster:')
    for cluster_idx, cluster_config in enumerate(central_configs):
        print(f'Cluster {cluster_idx}: {cluster_config}')

    visualize_clusters(embeddings, labels, n_clusters)
    display_cluster_details(records, labels)

    return central_configs

# Driver code for this cell, disabled by wrapping it in a bare string literal:
# nothing between the triple quotes is executed.
# NOTE(review): the string is the final expression of the cell, so Jupyter
# will presumably echo it as the cell output; consider deleting it instead.
"""
# Number of clusters
n_clusters = 20

# Run the analysis
central_configs = analyze_clusters(file_paths, n_clusters)

# Save central configurations to a file
with open('central_configs.json', 'w') as f:
    json.dump(central_configs, f, indent=4)
"""

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

def load_files(file_paths):
    """Read several JSON files and concatenate their top-level arrays.

    Args:
        file_paths: Iterable of paths; each file is expected to contain a
            JSON array of records.

    Returns:
        One list holding the elements of every file, in the order the paths
        were given.
    """
    data = []
    for file_path in file_paths:
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform's locale encoding, which differs between machines.
        with open(file_path, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return data

def extract_embeddings_and_configs(data):
    """Split loaded records into an embedding array and a list of configurations.

    Args:
        data: Iterable of dicts, each with an 'embedding' and a
            'configuration' key.

    Returns:
        Tuple ``(embeddings, configs)``: a NumPy array built from the
        'embedding' values and a list of the 'configuration' values, both in
        input order.
    """
    # Single pass over the records so both outputs stay aligned.
    pairs = [(record['embedding'], record['configuration']) for record in data]
    embedding_rows = [embedding for embedding, _ in pairs]
    configurations = [configuration for _, configuration in pairs]
    return np.array(embedding_rows), configurations

def perform_clustering(embeddings, n_clusters):
    """Fit K-Means on the embeddings and return the model with its labels.

    Args:
        embeddings: Feature matrix handed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(model, assignments)``: the fitted ``KMeans`` estimator and
        the cluster index assigned to each row.
    """
    # random_state is pinned to 0 so the seeding of the run is fixed.
    model = KMeans(random_state=0, n_clusters=n_clusters)
    assignments = model.fit_predict(embeddings)
    return model, assignments

def calculate_central_configs(configs, labels, n_clusters):
    """Average the numeric configuration values within each cluster.

    'batch_size', 'max_tokens' and 'max_seq_len' are removed first when
    present, so they never appear in the averages.

    Args:
        configs: List of configuration dicts, one per clustered item.
        labels: Cluster label per item, aligned with ``configs``.
        n_clusters: Not used by the computation; kept in the signature for
            callers that pass it.

    Returns:
        List of dicts (one per distinct label, in the order produced by the
        pandas group-by) mapping each remaining numeric column to its mean.
    """
    ignored = ['batch_size', 'max_tokens', 'max_seq_len']
    frame = pd.DataFrame(configs).drop(columns=ignored, errors='ignore')
    # Only numeric columns can be averaged; everything else is left out.
    numeric_cols = frame.select_dtypes(include=[np.number]).columns
    per_cluster_means = frame.groupby(labels)[numeric_cols].mean()
    return per_cluster_means.to_dict(orient='records')

def visualize_clusters(embeddings, labels, n_clusters):
    """Plot the embeddings in 2-D PCA space with one scatter series per cluster.

    Args:
        embeddings: Feature matrix to project with a 2-component PCA.
        labels: Cluster label per row; compared element-wise against each
            cluster index, so it is expected to be a NumPy array.
        n_clusters: Cluster indices ``0 .. n_clusters - 1`` are plotted.
    """
    projected = PCA(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    for cluster_id in range(n_clusters):
        # Boolean mask selects the projected rows that belong to this cluster.
        member_xy = projected[labels == cluster_id]
        plt.scatter(member_xy[:, 0], member_xy[:, 1], label=f'Cluster {cluster_id}')

    plt.title('Cluster Visualization')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()

def display_cluster_details(data, labels):
    """Print a short summary of every cluster.

    For each cluster index from 0 up to the largest label, prints the first
    five 'response_content' values and the total number of members.

    Args:
        data: Sequence of record dicts with a 'response_content' key.
        labels: Cluster label per record, indexable in step with ``data``.
    """
    cluster_count = max(labels) + 1
    for cluster_id in range(cluster_count):
        responses = []
        for position, record in enumerate(data):
            if labels[position] == cluster_id:
                responses.append(record['response_content'])
        print(f'Cluster {cluster_id}:')
        # Only a five-item preview is shown to keep the output short.
        print('Responses:', responses[:5])
        print('Total Responses:', len(responses))
        print()

def run_initial_analysis(file_paths, n_clusters):
    """Cluster the loaded data, save everything to cluster_results.json and report.

    Args:
        file_paths: Paths of the JSON files to load.
        n_clusters: Number of K-Means clusters.

    Returns:
        Dict with keys 'data', 'embeddings', 'configs', 'labels',
        'central_configs' and 'silhouette_score'. Embeddings and labels are
        stored as plain lists, exactly as written to the JSON file.
    """
    data = load_files(file_paths)
    embeddings, configs = extract_embeddings_and_configs(data)
    # Only the labels are needed here; the fitted model itself is discarded.
    _, labels = perform_clustering(embeddings, n_clusters)
    central_configs = calculate_central_configs(configs, labels, n_clusters)
    silhouette_avg = silhouette_score(embeddings, labels)

    results = {
        'data': data,
        # NumPy arrays are not JSON-serializable, so store them as lists.
        'embeddings': embeddings.tolist(),
        'configs': configs,
        'labels': labels.tolist(),
        'central_configs': central_configs,
        'silhouette_score': silhouette_avg,
    }

    # The results file is written before any reporting or plotting happens.
    with open('cluster_results.json', 'w') as out_file:
        json.dump(results, out_file, indent=4)

    print(f'Silhouette Score: {silhouette_avg}')
    print('Central Configurations for each cluster:')
    for cluster_idx, cluster_config in enumerate(central_configs):
        print(f'Cluster {cluster_idx}: {cluster_config}')

    visualize_clusters(embeddings, labels, n_clusters)
    display_cluster_details(data, labels)

    return results

# Cluster into 15 groups and run the initial analysis. Besides returning the
# results dict, run_initial_analysis writes it to cluster_results.json.
# `file_paths` is not defined in this cell and must already exist in the kernel.
n_clusters = 15
results = run_initial_analysis(file_paths, n_clusters)


In [None]:
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def load_results(filename):
    """Load a saved results dict and restore its array-valued entries.

    Args:
        filename: Path of a JSON file holding a dict with at least the keys
            'embeddings' and 'labels'.

    Returns:
        The decoded dict, with 'embeddings' and 'labels' converted from lists
        to NumPy arrays; all other entries are left as decoded.
    """
    with open(filename, 'r') as handle:
        loaded = json.load(handle)
    # JSON only stores lists; turn the two array fields back into ndarrays.
    for key in ('embeddings', 'labels'):
        loaded[key] = np.array(loaded[key])
    return loaded

def save_results(results, filename):
    """Write a results dict to ``filename`` as indented JSON.

    The 'embeddings' and 'labels' entries are converted to plain lists for
    serialization. The conversion happens on a shallow copy, so the caller's
    dict keeps its NumPy arrays, and entries that are already lists are
    accepted as well.

    Args:
        results: Dict with at least 'embeddings' and 'labels' (arrays or
            lists); every other value must be JSON-serializable as is.
        filename: Destination path; an existing file is overwritten.
    """
    # Shallow copy: replace the two array fields without touching the
    # caller's dict.
    serializable = dict(results)
    # np.asarray makes .tolist() available whether the value is an ndarray
    # or already a list.
    serializable['embeddings'] = np.asarray(results['embeddings']).tolist()
    serializable['labels'] = np.asarray(results['labels']).tolist()

    with open(filename, 'w') as f:
        json.dump(serializable, f, indent=4)

def remove_clusters(data, labels, clusters_to_remove, embeddings=None):
    """Drop every item whose cluster label is in ``clusters_to_remove``.

    Args:
        data: Sequence of records, aligned with ``labels``.
        labels: Cluster label per record.
        clusters_to_remove: Iterable of (hashable) cluster labels to discard.
        embeddings: Optional per-record embeddings aligned with ``data``.
            When omitted, the module-level ``results['embeddings']`` is used,
            which is what this function always read before the parameter
            existed.

    Returns:
        Tuple ``(filtered_data, filtered_embeddings, filtered_labels)``: a
        list of kept records, a NumPy array of their embeddings and a list of
        their original labels.

    Raises:
        ValueError: If ``data``, ``labels`` and the embeddings differ in length.
    """
    if embeddings is None:
        # Legacy fallback for callers that pass only three arguments.
        embeddings = results['embeddings']

    if not (len(data) == len(labels) == len(embeddings)):
        raise ValueError(
            'data, labels and embeddings must have the same length, got '
            f'{len(data)}, {len(labels)} and {len(embeddings)}'
        )

    # Set membership keeps the per-item test constant-time.
    excluded = set(clusters_to_remove)
    keep = [label not in excluded for label in labels]

    filtered_data = [item for item, kept in zip(data, keep) if kept]
    filtered_embeddings = [row for row, kept in zip(embeddings, keep) if kept]
    filtered_labels = [label for label, kept in zip(labels, keep) if kept]
    return filtered_data, np.array(filtered_embeddings), filtered_labels

def recluster_data(embeddings, n_clusters):
    """Fit a fresh K-Means model on the (filtered) embeddings.

    Args:
        embeddings: Feature matrix handed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(model, assignments)``: the fitted ``KMeans`` estimator and
        the new cluster index assigned to each row.
    """
    # random_state is pinned to 0 so the seeding of the run is fixed.
    model = KMeans(random_state=0, n_clusters=n_clusters)
    assignments = model.fit_predict(embeddings)
    return model, assignments

def visualize_and_analyze(new_labels, embeddings):
    """Plot re-clustered embeddings in 2-D PCA space, one scatter series per cluster.

    Args:
        new_labels: Cluster label per row; compared element-wise against each
            cluster index, so it is expected to be a NumPy array.
        embeddings: Feature matrix to project with a 2-component PCA.
    """
    projected = PCA(n_components=2).fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    # Cluster indices run from 0 to the largest label present.
    cluster_count = max(new_labels) + 1
    for cluster_id in range(cluster_count):
        member_xy = projected[new_labels == cluster_id]
        plt.scatter(member_xy[:, 0], member_xy[:, 1], label=f'Cluster {cluster_id}')

    plt.title('Re-clustered Visualization')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()

def display_cluster_details(data, labels):
    """Print a short summary of every cluster.

    For each cluster index from 0 up to the largest label, prints the first
    two 'response_content' values and the total number of members.

    Args:
        data: Sequence of record dicts with a 'response_content' key.
        labels: Cluster label per record, indexable in step with ``data``.
    """
    cluster_count = max(labels) + 1
    for cluster_id in range(cluster_count):
        responses = []
        for position, record in enumerate(data):
            if labels[position] == cluster_id:
                responses.append(record['response_content'])
        print(f'Cluster {cluster_id}:')
        # Only a two-item preview is shown to keep the output short.
        print('Responses:', responses[:2])
        print('Total Responses:', len(responses))
        print()

# Load previous results
results = load_results('cluster_results.json')

# Specify clusters to remove
clusters_to_remove = [0, 2, 6]  # Example clusters to remove

# Number of clusters for the second K-Means pass; adjust if needed. It is used
# both for re-clustering and for the central-config computation so the two
# cannot disagree.
n_reclusters = 8

# Keep a handle on the ORIGINAL labels: they decide which items are dropped and
# must not be confused with the new labels assigned further down.
original_labels = results['labels']

# Remove specified clusters
filtered_data, filtered_embeddings, filtered_labels = remove_clusters(results['data'], original_labels, clusters_to_remove)

# Filter the configurations with the same original-label criterion so they
# stay aligned, item for item, with filtered_data / filtered_embeddings.
filtered_configs = [
    config
    for config, label in zip(results['configs'], original_labels)
    if label not in clusters_to_remove
]

# Re-cluster and visualize
new_kmeans, new_labels = recluster_data(filtered_embeddings, n_clusters=n_reclusters)
visualize_and_analyze(new_labels, filtered_embeddings)
display_cluster_details(filtered_data, new_labels)

# Save new clustering results
results['data'] = filtered_data
results['embeddings'] = filtered_embeddings
results['configs'] = filtered_configs
results['labels'] = new_labels
results['central_configs'] = calculate_central_configs(filtered_configs, new_labels, n_reclusters)
# NOTE(review): results['silhouette_score'] still holds the value from the
# run that produced the loaded file; it is not recomputed for the new labels.

# NOTE: this overwrites the file loaded at the top of the cell, so re-running
# the cell removes clusters 0, 2 and 6 of the *new* clustering.
save_results(results, 'cluster_results.json')
