# System Specifications

In [None]:
# Taken from - https://stackoverflow.com/questions/3103178/how-to-get-the-system-info-with-python

import platform,socket,re,uuid,json,psutil,logging

def getSystemInfo():
    """Collect basic facts about the host machine and return them as a JSON string.

    Every field is gathered independently. When one lookup fails (for example
    the hostname cannot be resolved to an IP address) the error is logged and
    that field is reported as ``null``; the remaining fields are still
    returned, so the result can always be passed to ``json.loads``.

    Returns:
        str: JSON object with the keys ``platform``, ``platform-release``,
        ``platform-version``, ``architecture``, ``hostname``, ``ip-address``,
        ``mac-address``, ``processor`` and ``ram``.
    """
    # One zero-argument callable per field so that each can fail on its own.
    collectors = {
        'platform': platform.system,
        'platform-release': platform.release,
        'platform-version': platform.version,
        'architecture': platform.machine,
        'hostname': socket.gethostname,
        'ip-address': lambda: socket.gethostbyname(socket.gethostname()),
        # uuid.getnode() is a 48-bit integer; render it as six hex byte pairs.
        'mac-address': lambda: ':'.join(re.findall('..', '%012x' % uuid.getnode())),
        'processor': platform.processor,
        # Total physical memory, rounded to whole gibibytes.
        'ram': lambda: str(round(psutil.virtual_memory().total / (1024.0 ** 3))) + " GB",
    }

    info = {}
    for field, collect in collectors.items():
        try:
            info[field] = collect()
        except Exception:
            # Best-effort report: keep the traceback in the log, keep going.
            logging.exception("Could not determine %r", field)
            info[field] = None
    return json.dumps(info)

# Decode the JSON string back into a dict; as the last expression of the cell
# its value is presumably what the notebook renders as output.
json.loads(getSystemInfo())

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_cluster_cosine_similarity(X, y):
    """
    Computes the cosine similarity between centroids of clusters in X defined by labels y.

    Parameters:
    X (array-like): Feature matrix of shape (n_samples, n_features)
    y (array-like): Cluster labels of shape (n_samples,); a plain list is accepted

    Returns:
    numpy.ndarray: Cosine similarity matrix between centroids, with rows and
        columns ordered by the sorted unique labels

    Raises:
    ValueError: If X and y do not contain the same number of samples
    """
    # Normalise both inputs once. With a plain list for `y`, `y == label` only
    # yields a boolean mask through numpy's reflected scalar comparison, which
    # re-converts the whole list for every label.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    unique_labels = np.unique(y)
    # One centroid (mean feature vector) per label, in sorted label order.
    centroids = np.array([X[y == label].mean(axis=0) for label in unique_labels])

    return cosine_similarity(centroids)

## <a id='toc2_1_'></a>[Generating Embeddings](#toc0_)

```py
# Import required libraries
import numpy as np
import pandas as pd
import time
import tiktoken  # For token counting
import ast
import warnings
import matplotlib.pyplot as plt

# Suppress warnings
warnings.filterwarnings("ignore")
import os
from openai import OpenAI

# Set OpenAI API key ("SET KEY" is a placeholder to replace before running).
# NOTE(review): avoid committing a real key to the notebook; presumably
# OpenAI() picks the key up from the OPENAI_API_KEY environment variable — confirm.
os.environ["OPENAI_API_KEY"] = "SET KEY"
client = OpenAI()


# Function to get embeddings from OpenAI API
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for `text` from the module-level OpenAI `client`.

    Newlines are flattened to spaces before the request. The `dimensions`
    argument is a template placeholder that must be replaced with the desired
    embedding size before this function is usable.

    Returns the `embedding` attribute of the first item in the response.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(
        input=[single_line], model=model, dimensions="INPUT YOUR DIMENSION"
    )
    first_item = response.data[0]
    return first_item.embedding

# Load dataset
# Placeholder: replace the string with your data. The code below indexes
# df["text"] and calls .apply on it, so a table with a "text" column is
# expected (e.g. a pandas DataFrame); the placeholder string itself will fail.
df = "THIS SHOULD BE YOUR DATASET"

# Set maximum tokens and encoding parameters
# max_tokens is the per-text cap enforced by truncate_text further down.
max_tokens = 7000
embedding_encoding = "cl100k_base"
encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens in each text (before any truncation)
df["n_tokens"] = df["text"].apply(lambda x: len(encoding.encode(x)))


# Function to truncate text to max token length
def truncate_text(text, encoding, max_tokens=7000):
    """Cap `text` at `max_tokens` tokens of `encoding`.

    Text that already fits is returned unchanged (not re-decoded); longer text
    is cut to its first `max_tokens` tokens and decoded back to a string.
    `encoding` must provide `encode(str)` and `decode(tokens)`.
    """
    token_ids = encoding.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    return encoding.decode(token_ids[:max_tokens])


# Truncate texts that are too long (more than max_tokens tokens)
df["text"] = df["text"].apply(lambda x: truncate_text(x, encoding, max_tokens))

# Recount tokens after truncation so n_tokens describes the text actually embedded
df["n_tokens"] = df["text"].apply(lambda x: len(encoding.encode(x)))

# Get embeddings for each text using OpenAI API: get_embedding is called once
# per row. The result is stored in "ada_embedding", the column name the
# evaluation cells later in this notebook read back.
df["ada_embedding"] = df["text"].apply(
    lambda x: get_embedding(x, model="text-embedding-3-small")
)

# Save embeddings to CSV file (the file name is a placeholder to replace)
df.to_csv("SAVE YOUR EMBEDDINGS WITH FILE NAME", index=False)
```

# Load Different Datasets

In [None]:
import pandas as pd
# Registry of benchmark datasets: name -> true cluster count and CSV path.
dataset_info = {
    key: {'n_clusters': k, 'path': csv_path}
    for key, k, csv_path in [
        ('digits', 10, 'digits_embeddings.csv'),
        ('birch', 3, 'birch_dataset.csv'),
        ('3Newsgroups', 3, 'embeddings_news_256.csv'),
        ('20Newsgroups', 20, 'embeddings_news_all_256.csv'),
        ('stackexchange', 15, 'embeddings_stk_128.csv'),
        ('reddit', 19, 'reddit_384.csv'),
        ('detectai', 2, 'embeddings_ai_detect_32.csv'),
        ('arxiv', 11, 'embeddings_arxiv_test_128.csv'),
    ]
}

# Read every CSV that is available; a dataset that fails to load is reported
# and left out of `datasets` rather than aborting the cell.
datasets = {}
for name, info in dataset_info.items():
    try:
        data = pd.read_csv(info['path'])
        datasets[name] = dict(data=data, n_clusters=info['n_clusters'])
        print(f"Successfully loaded {name} dataset with {info['n_clusters']} clusters")
    except Exception as e:
        print(f"Error loading {name} dataset: {e}")

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import ast
import time
from scipy.optimize import linear_sum_assignment
from QuadratiK.spherical_clustering import PKBC
from sklearn.metrics import (
    adjusted_rand_score,
    v_measure_score,
    precision_recall_fscore_support,
    confusion_matrix
)

def return_embeddings(x):
    """Parse the textual form of a Python literal (e.g. "[0.1, 0.2]") into the object it denotes."""
    parsed = ast.literal_eval(x)
    return parsed

def _relabel_clusters_to_truth(true_labels, cluster_ids):
    """Rename each predicted cluster id to the true label it overlaps with most.

    A confusion matrix over the sorted union of both label sets is solved as an
    assignment problem, then matrix indices are translated back to label
    values. Assumes both inputs are 1-D and share a comparable dtype.
    """
    true_labels = np.asarray(true_labels)
    cluster_ids = np.asarray(cluster_ids)

    # Pin the row/column order explicitly so indices can be mapped to values.
    class_values = np.unique(np.concatenate([true_labels, cluster_ids]))
    conf_mat = confusion_matrix(true_labels, cluster_ids, labels=class_values)

    # linear_sum_assignment minimises cost; negate the counts to maximise matches.
    row_ind, col_ind = linear_sum_assignment(-conf_mat)

    # lookup[j] is the true label assigned to the cluster whose id is class_values[j].
    lookup = np.empty_like(class_values)
    lookup[col_ind] = class_values[row_ind]
    return lookup[np.searchsorted(class_values, cluster_ids)]


def evaluate_pkbc_clustering(dataset_info, num_iterations=1):
    """Run PKBC clustering on every dataset, `num_iterations` times, and score it.

    Parameters:
        dataset_info (dict): Maps dataset name to a dict with 'path' (CSV file)
            and 'n_clusters' (number of clusters to fit and evaluate).
        num_iterations (int): Number of passes over all datasets. PKBC is
            always built with random_state=42, so passes presumably differ
            only in the measured computation time.

    Returns:
        pandas.DataFrame: One row per successfully processed
        (iteration, dataset) pair with ARI, V-measure, macro precision/recall,
        data shape, fit time in seconds and K. Datasets that raise are
        reported on stdout and skipped.
    """
    performance_metrics = []

    for i in range(num_iterations):
        print(f"Iteration {i+1}/{num_iterations}")

        for name, info in dataset_info.items():
            try:
                df = pd.read_csv(info['path'])

                if name == 'birch':
                    # Birch ships without labels: they are assigned by position,
                    # which requires exactly 300000 rows ordered by cluster.
                    df["labels"] = [1] * 100000 + [2] * 100000 + [3] * 100000
                    # Every column other than "labels" is treated as a feature.
                    embeddings = np.array(df.drop(columns=["labels"]))
                elif name == 'digits':
                    embeddings = np.array(df.drop(columns=["labels"]))
                else:
                    # Embeddings are stored as stringified lists in "ada_embedding".
                    embeddings = np.array(
                        df["ada_embedding"].apply(return_embeddings).tolist()
                    )

                true_k = info['n_clusters']
                true_labels = df["labels"].tolist()

                print(f"Processing {name} dataset with {true_k} clusters...")

                # perf_counter is monotonic, so the duration cannot be skewed
                # by wall-clock adjustments during the fit.
                start_time = time.perf_counter()
                pkbc = PKBC(num_clust=true_k, random_state=42).fit(embeddings)
                elapsed = time.perf_counter() - start_time

                cluster_ids = pkbc.labels_[true_k]

                # Compute ARI and V-measure (both invariant to cluster naming)
                ari_pkbc = adjusted_rand_score(cluster_ids, true_labels)
                vscore_pkbc = v_measure_score(cluster_ids, true_labels)

                # Precision/recall need cluster ids expressed as true labels.
                mapped_labels = _relabel_clusters_to_truth(true_labels, cluster_ids)
                precision_pkbc, recall_pkbc, _, _ = precision_recall_fscore_support(
                    true_labels, mapped_labels, average="macro"
                )

                # Store results
                performance_metrics.append({
                    "Iteration": i+1,
                    "Dataset": name,
                    "Algorithm": "PKBC",
                    "ARI": ari_pkbc,
                    "V Measure": vscore_pkbc,
                    "Macro Precision": precision_pkbc,
                    "Macro Recall": recall_pkbc,
                    "Number of Rows": embeddings.shape[0],
                    "Number of Columns": embeddings.shape[1],
                    "Computation Time (seconds)": elapsed,
                    "K": true_k
                })

                print(f"Completed {name} dataset in iteration {i+1}.")
            except Exception as e:
                # Best-effort benchmark: report the failure and move on.
                print(f"Error processing {name} dataset in iteration {i+1}: {str(e)}")

    # Convert results into DataFrame
    return pd.DataFrame(performance_metrics)

# Run the function and display the results
# results_df = evaluate_pkbc_clustering(dataset_info, num_iterations=10)
# print(results_df.to_string(index=False))


# Compute Cosine Similarity of the Cluster Centroids

In [None]:
# Centroid-to-centroid cosine-similarity matrix for each loaded dataset, keyed by name.
cosine_similarity_results = {}

for name, info in datasets.items():
    try:
        df = info['data']

        if name in ('birch', 'digits'):
            if name == 'birch':
                # Birch labels are assigned by position: three runs of 100000 rows.
                df["labels"] = [1] * 100000 + [2] * 100000 + [3] * 100000
            # Every column other than "labels" is treated as a feature.
            embeddings = np.array(df.drop(columns=["labels"]))
        else:
            # Embeddings are stored as stringified lists in "ada_embedding".
            df["embedding"] = df["ada_embedding"].apply(return_embeddings)
            embeddings = np.array(df["embedding"].tolist())

        cosine_similarity_results[name] = compute_cluster_cosine_similarity(
            embeddings, df["labels"].tolist()
        )
        print(f"Computed cosine similarity for {name} dataset.")
    except Exception as e:
        # A failing dataset is reported and skipped; the others still run.
        print(f"Error computing cosine similarity for {name} dataset: {e}")

# Display the results
cosine_similarity_results