In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset (replace with the correct file path)
file_path = 'master_file_umap_1_(18).csv'
data = pd.read_csv(file_path)

# Column 0 holds the file names / identifiers; columns 1-18 hold the
# UMAP-reduced features. NOTE: iloc slicing silently truncates when the file
# has fewer than 19 columns, so inspect X.shape if the results look off.
X = data.iloc[:, 1:19].values  # UMAP-reduced data (up to 18 columns)
file_names = data.iloc[:, 0]  # File names or identifiers

# Standardize every feature (zero mean, unit variance) so that no single
# dimension dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Known points and their corresponding labels (clusters).
# Replace these with your actual known rows and their cluster labels.
# The indices are row positions in the dataset, not file names.
known_points_indices = [3,34,53,108,104,92,38,9,88,21,48]  # Row positions of the known points
known_labels = [0,0,0,0,1,1,1,1,3,3,3]  # Label of each known point, in the same order

# Fail early with a clear message instead of an opaque boolean-index error below.
if len(known_points_indices) != len(known_labels):
    raise ValueError(
        "known_points_indices and known_labels must have the same length "
        f"(got {len(known_points_indices)} and {len(known_labels)})"
    )

known_idx = np.asarray(known_points_indices)
known_lab = np.asarray(known_labels)

# Seed one centroid per known label: the mean of that label's known points.
# np.unique returns the labels sorted, so centroid row i belongs to unique_labels[i].
unique_labels = np.unique(known_labels)
initial_centroids = np.array([X_normalized[known_idx[known_lab == label]].mean(axis=0)
                              for label in unique_labels])

# Initialize KMeans with the known centroids; n_init=1 because the
# initialization is explicit and restarts would be pointless.
n_clusters = len(unique_labels)  # Number of clusters from known labels
kmeans = KMeans(n_clusters=n_clusters, init=initial_centroids, n_init=1, random_state=42)

# Fit KMeans on the entire dataset. fit_predict returns positional cluster
# indices 0..n_clusters-1 (row numbers into the centroid array), NOT the
# label IDs used in known_labels. With labels such as {0, 1, 3} the two differ,
# so translate the positions back to the original label IDs.
cluster_positions = kmeans.fit_predict(X_normalized)
label_predict = unique_labels[cluster_positions]

# Evaluate the clustering with the silhouette score (invariant to the
# one-to-one relabelling performed above).
silhouette_avg = silhouette_score(X_normalized, label_predict)
print(f'Silhouette Score after Semi-Supervised KMeans: {silhouette_avg}')

# Add the predicted clusters (expressed in the known-label IDs) to a copy of
# the original data so the loaded frame stays untouched.
predict = data.copy()
predict['cluster'] = label_predict

# Save the final output
output_file = 'master_cluster_semi_supervised_kmeans.csv'
predict.to_csv(output_file, index=False)
print(f'Clustering output saved to {output_file}')


Silhouette Score after Semi-Supervised KMeans: 0.35961009734620347
Clustering output saved to master_cluster_semi_supervised_kmeans.csv
