# In [2]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import pairwise_distances

def build_co_association(cluster_labels):
    """Return the normalized co-association matrix of several clusterings.

    Parameters
    ----------
    cluster_labels : array-like of shape (n_runs, n_samples)
        One row of cluster labels per clustering run.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Entry ``[i, j]`` is the fraction of runs that placed samples ``i``
        and ``j`` in the same cluster (always 1.0 on the diagonal).

    Raises
    ------
    ValueError
        If ``cluster_labels`` is not 2-D or contains no runs.
    """
    labels = np.asarray(cluster_labels)
    if labels.ndim != 2 or labels.shape[0] == 0:
        raise ValueError(
            "cluster_labels must be a non-empty 2-D array of shape "
            "(n_runs, n_samples)"
        )

    n_runs, n_samples = labels.shape
    co_association = np.zeros((n_samples, n_samples))
    # One broadcast comparison per run replaces the per-pair Python loop;
    # the accumulated counts are identical.
    for run_labels in labels:
        co_association += run_labels[:, None] == run_labels[None, :]
    return co_association / n_runs


def co_association_to_condensed_distance(co_association):
    """Convert a co-association (similarity) matrix to a condensed distance.

    The distance between two samples is ``1 - co_association``, so samples
    that are always clustered together are at distance 0 and samples that
    are never clustered together are at distance 1.

    Parameters
    ----------
    co_association : array-like of shape (n_samples, n_samples)
        Symmetric similarity matrix with values in [0, 1].

    Returns
    -------
    numpy.ndarray
        Condensed (upper-triangular, 1-D) distance vector in the layout
        expected by ``scipy.cluster.hierarchy.linkage``.

    Raises
    ------
    ValueError
        Raised by ``squareform`` if the resulting distance matrix is not
        symmetric.
    """
    distance = 1.0 - np.asarray(co_association, dtype=float)
    # Self-distance must be exactly zero for a valid distance matrix.
    np.fill_diagonal(distance, 0.0)
    return squareform(distance)


# The guard keeps the helpers importable without triggering file I/O.  When
# run as a script or notebook cell the statements below still execute at
# module level, so every variable remains a module global as before.
if __name__ == "__main__":
    # Load and preprocess data
    file_path = 'master_file_umap_1_(18).csv'
    data = pd.read_csv(file_path)
    # Columns 1..18 are used as the feature matrix; column 0 is skipped.
    X = data.iloc[:, 1:19].values

    # Perform multiple clustering runs (two KMeans seeds and one Ward
    # agglomerative run) that will vote in the ensemble.
    n_clusters = 6
    kmeans_1 = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    kmeans_2 = KMeans(n_clusters=n_clusters, random_state=100).fit(X)
    agg_1 = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(X)

    # Combine cluster labels: one row per clustering run.
    cluster_labels = np.vstack([kmeans_1.labels_, kmeans_2.labels_, agg_1.labels_])

    # Co-association matrix: fraction of runs in which two samples share a
    # cluster (a similarity in [0, 1]).
    n_samples = X.shape[0]
    co_association_matrix = build_co_association(cluster_labels)

    # Apply hierarchical clustering to the co-association matrix.
    # linkage() interprets a 2-D array as observation vectors, so the
    # similarity matrix must first be turned into a condensed distance;
    # otherwise the tree is built on Euclidean distances between rows.
    Z = linkage(
        co_association_to_condensed_distance(co_association_matrix),
        method='average',
    )
    final_labels = fcluster(Z, t=n_clusters, criterion='maxclust')

    # Save the ensemble results
    # NOTE: `predict` is the same object as `data`, so `data` also gains the
    # 'cluster' column.
    predict = data
    predict['cluster'] = final_labels
    # NOTE(review): to_csv writes the DataFrame index as an extra leading
    # column by default; pass index=False if that is not wanted downstream.
    predict.to_csv('master_cluster_ensemble_umap_1_(18).csv')
