In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import make_scorer, silhouette_score
from sklearn.model_selection import GridSearchCV
import numpy as np

# Custom scorer using silhouette score (since GridSearchCV doesn't support silhouette score natively)
def silhouette_scorer(estimator, X, y=None):
    """
    Scores a clustering estimator by the silhouette coefficient of its labels on X.

    The signature follows the ``scorer(estimator, X, y)`` convention, so the
    function can be handed to GridSearchCV directly as ``scoring=``.
    (``make_scorer`` is meant for ``metric(y_true, y_pred)`` functions, not for
    callables of this shape.)

    Args:
        estimator: Clustering estimator exposing ``fit_predict``. It is re-fitted
            on X, because several clusterers have no separate ``predict``.
        X: Samples to cluster and score.
        y: Ignored. Accepted only for compatibility with the scorer signature.

    Returns:
        The silhouette score of the labels produced on X, or -1 when the score
        is undefined for that labelling.
    """
    labels = estimator.fit_predict(X)
    n_labels = len(set(labels))
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
    # outside that range it raises, so fall back to the worst possible score.
    if 1 < n_labels < len(labels):
        return silhouette_score(X, labels)
    return -1  # Penalize degenerate clusterings (one cluster, or one cluster per sample)

def gridsearch_clustering_tuning(data, model_name, n_jobs=-1):
    """
    Performs hyperparameter tuning using GridSearchCV for clustering models.

    Each parameter combination is scored with ``silhouette_scorer`` on 3
    cross-validation splits. The scorer calls ``fit_predict`` on the data it is
    given, so the score of a split reflects a clustering fitted on that split's
    held-out samples.

    Args:
        data: Data on which clustering needs to be performed.
        model_name: Name of the clustering model ('dbscan', 'kmeans', 'spectral', 'agglomerative').
        n_jobs: Number of jobs to run in parallel (default is -1 to use all available processors).

    Returns:
        best_model: The best tuned model, refitted by GridSearchCV on all of ``data``.
        best_params: The best parameters found by GridSearchCV.
        best_score: The best silhouette score obtained (mean over the splits).

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Define the parameter grids for each model
    if model_name == 'dbscan':
        param_grid = {
            'eps': [0.3, 0.5, 0.7, 1.0],   # Tuning eps for DBSCAN
            'min_samples': [3, 5, 10],     # Tuning min_samples for DBSCAN
            'metric': ['euclidean', 'manhattan']  # Distance metrics to try
        }
        model = DBSCAN()

    elif model_name == 'kmeans':
        param_grid = {
            'n_clusters': [3, 4, 5, 6, 8, 10],   # Number of clusters
            'init': ['k-means++', 'random'],     # Initialization methods
            'n_init': [10, 20, 30],              # Number of initializations to run
            'max_iter': [300, 500, 1000]         # Maximum iterations
        }
        model = KMeans(random_state=42)

    elif model_name == 'spectral':
        spectral_clusters = [3, 4, 5, 6]          # Number of clusters
        eigen_solvers = ['arpack', 'lobpcg']      # Eigen solver methods
        # n_neighbors only matters for the nearest-neighbours affinity, so it is
        # tuned in its own sub-grid instead of refitting identical rbf models.
        param_grid = [
            {
                'n_clusters': spectral_clusters,
                'affinity': ['nearest_neighbors'],
                'n_neighbors': [5, 10, 15],
                'eigen_solver': eigen_solvers,
            },
            {
                'n_clusters': spectral_clusters,
                'affinity': ['rbf'],
                'eigen_solver': eigen_solvers,
            },
        ]
        model = SpectralClustering(random_state=42)

    elif model_name == 'agglomerative':
        model = AgglomerativeClustering()
        # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the
        # old name; use whichever distance parameter this installation exposes.
        distance_key = 'metric' if 'metric' in model.get_params() else 'affinity'
        agglomerative_clusters = [3, 4, 5, 6, 8]  # Number of clusters
        # Ward linkage is only valid with euclidean distances, so it gets its
        # own sub-grid; pairing it with manhattan would make those fits fail.
        param_grid = [
            {
                'n_clusters': agglomerative_clusters,
                'linkage': ['ward'],
                distance_key: ['euclidean'],
            },
            {
                'n_clusters': agglomerative_clusters,
                'linkage': ['complete', 'average', 'single'],
                distance_key: ['euclidean', 'manhattan'],
            },
        ]

    else:
        raise ValueError("Unknown clustering model. Choose from 'dbscan', 'kmeans', 'spectral', or 'agglomerative'.")

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        # silhouette_scorer already has the scorer(estimator, X) signature, so it
        # is passed as-is. Wrapping it in make_scorer would treat it as a
        # metric(y_true, y_pred) and require estimator.predict, which fails here.
        scoring=silhouette_scorer,
        n_jobs=n_jobs,
        cv=3  # Cross-validation with 3 splits
    )

    # Fit the grid search
    grid_search.fit(data)

    # Get the best model, parameters, and score
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best silhouette score: {best_score}')

    return best_model, best_params, best_score

# Example usage:
# data = ...  # Your dataset
# best_model, best_params, best_score = gridsearch_clustering_tuning(data, model_name='kmeans')


In [None]:
# Example usage: tune a KMeans model with the grid search defined in the previous cell.
# The three returned values are kept as notebook-level names so later cells can reuse them.
# NOTE(review): `data` is not defined in any visible cell -- it must already exist in the
# notebook session before this cell runs.
best_model, best_params, best_score = gridsearch_clustering_tuning(data, model_name='kmeans')

# Build the final model using the best hyperparameters
def build_final_clustering_model(data, model_name, best_params):
    """
    Instantiates the requested clustering model with tuned hyperparameters and fits it.

    Args:
        data: The data on which clustering needs to be performed.
        model_name: The name of the clustering model ('dbscan', 'kmeans', 'spectral', 'agglomerative').
        best_params: Mapping of hyperparameters (e.g. from GridSearchCV) unpacked
            into the model constructor as keyword arguments.

    Returns:
        final_model: The model after ``fit_predict`` has been run on ``data``.
        labels: Cluster labels returned by ``fit_predict``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Ordered (name, factory) pairs. The factories are lazy so that only the
    # requested model is ever constructed. KMeans and SpectralClustering get a
    # fixed random_state on top of the tuned parameters.
    factories = (
        ('dbscan', lambda: DBSCAN(**best_params)),
        ('kmeans', lambda: KMeans(**best_params, random_state=42)),
        ('spectral', lambda: SpectralClustering(**best_params, random_state=42)),
        ('agglomerative', lambda: AgglomerativeClustering(**best_params)),
    )
    make_model = next(
        (factory for name, factory in factories if model_name == name),
        None,
    )
    if make_model is None:
        raise ValueError("Unknown clustering model. Choose from 'dbscan', 'kmeans', 'spectral', or 'agglomerative'.")

    final_model = make_model()
    # A single fit_predict call both fits the model and yields the labels.
    labels = final_model.fit_predict(data)
    return final_model, labels

# Build the final version of KMeans using the best params found by the grid search above.
# `final_model` and `labels` stay at notebook level; the plotting cell below reads `labels`.
final_model, labels = build_final_clustering_model(data, model_name='kmeans', best_params=best_params)

# Print out the labels assigned to each data point (one entry per row of `data`)
print("Final cluster labels:", labels)


In [None]:
import matplotlib.pyplot as plt

def plot_clusters(data, labels, title='Cluster Plot'):
    """
    Draws a scatter plot of the first two features, coloured by cluster label.

    Args:
        data: 2-D array-like of shape (n_samples, n_features) with at least two
            feature columns (NumPy array, nested list, DataFrame, ...). Only
            columns 0 and 1 are plotted.
        labels: Cluster label per sample, used as the point colour.
        title: Title shown above the plot.

    Raises:
        ValueError: If ``data`` is not 2-D or has fewer than two columns.
    """
    # Coerce to an ndarray so positional column slicing also works for
    # DataFrames and plain nested lists, not just NumPy arrays.
    points = np.asarray(data)
    # Validate before opening a figure so bad input does not leave an empty plot behind.
    if points.ndim != 2 or points.shape[1] < 2:
        raise ValueError(
            "plot_clusters expects 2-D data with at least two feature columns, "
            f"got an array of shape {points.shape}."
        )

    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(points[:, 0], points[:, 1], c=labels, s=50, cmap='viridis')
    plt.colorbar(scatter)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

# Plot the final clustering. plot_clusters draws only the first two feature columns,
# so `data` should be 2-D features (or a 2-D projection of higher-dimensional data).
# `labels` comes from the build_final_clustering_model call in the previous cell.
plot_clusters(data, labels, title='Final Clustering Result')
