In [18]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (silhouette_score, calinski_harabasz_score, davies_bouldin_score,
                             adjusted_rand_score, normalized_mutual_info_score
)
import matplotlib.pyplot as plt

def _select_k_by_silhouette(X, max_k, random_state):
    """
    Choose the number of clusters that maximises the silhouette score.

    One KMeans model is fitted for every k from 2 up to ``max_k``. The upper
    bound is capped at ``n_samples - 1`` because KMeans cannot fit more
    clusters than samples and the silhouette score is only defined for
    2 <= n_labels <= n_samples - 1.

    Parameters:
        - X (array-like): Feature matrix of shape (n_samples, n_features).
        - max_k (int): Largest number of clusters to try (inclusive).
        - random_state (int): Seed passed to every KMeans fit.

    Returns:
        tuple: (best_k, best_labels) where best_labels are the cluster
        assignments produced by the winning fit.

    Raises:
        ValueError: If no k in the search range can be scored.
    """
    n_samples = X.shape[0]
    upper_k = min(max_k, n_samples - 1)
    if upper_k < 2:
        raise ValueError(
            f"Cannot search for k: need max_k >= 2 and at least 3 training samples "
            f"(got max_k={max_k}, n_samples={n_samples})."
        )

    best_k = None
    best_labels = None
    best_score = float("-inf")
    for k in range(2, upper_k + 1):
        labels = KMeans(n_clusters=k, random_state=random_state).fit_predict(X)
        # With many duplicate rows KMeans can return fewer distinct clusters
        # than requested; a single cluster cannot be scored, so skip it.
        if len(set(labels)) < 2:
            continue
        score = silhouette_score(X, labels)
        # Strict '>' keeps the smallest k when several k tie on the score.
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    if best_labels is None:
        raise ValueError("KMeans produced a single cluster for every k tried; nothing to score.")
    return best_k, best_labels


def train_kmeans_model(file_path, column_names=None, categorical_columns=None, target_column=None, test_size=0.3, random_state=42, max_k=19, evaluate_ground_truth=False):
    """
    Train a K-means clustering model and print its evaluation metrics.

    The file is read without a header row, the listed categorical columns are
    label-encoded, the target column (if any) is removed from the features,
    and the features are standardised. The scaler is fitted on the full
    dataset before splitting. Only the training part of the split is
    clustered; the held-out part is discarded.

    Parameters:
        - file_path (str): Path to the dataset file.
        - column_names (list): List of column names. Default is None, in which
          case pandas assigns integer column labels.
        - categorical_columns (list): Categorical columns to encode. Default is None.
        - target_column (str or int): Label of the target column to exclude.
          Default is None.
        - test_size (float): Fraction of data held out. Default is 0.3.
        - random_state (int): Random seed for reproducibility. Default is 42.
        - max_k (int): Largest number of clusters to try. Default is 19.
        - evaluate_ground_truth (bool): If True and a target column was given,
          also print ARI and NMI of the clusters against the target values of
          the same training rows. Default is False.

    Returns:
        None

    Raises:
        ValueError: If the training split is too small to evaluate any k.
    """
    # Load dataset
    data = pd.read_csv(file_path, header=None, names=column_names)

    # Encode categorical columns if provided
    for col in categorical_columns or []:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Separate features and target if provided. Compare against None rather
    # than relying on truthiness: the integer column label 0 is a valid target.
    if target_column is not None:
        target = data[target_column]
        data = data.drop(columns=[target_column])
    else:
        target = None

    data_scaled = StandardScaler().fit_transform(data)

    # Split the target together with the features so that y_train stays
    # row-aligned with X_train after shuffling.
    if target is not None:
        X_train, _, y_train, _ = train_test_split(
            data_scaled, target, test_size=test_size, random_state=random_state
        )
    else:
        X_train, _ = train_test_split(data_scaled, test_size=test_size, random_state=random_state)
        y_train = None

    # Determine optimal clusters using the silhouette score. The labels of the
    # winning fit are reused instead of refitting the same seeded model.
    best_k, labels = _select_k_by_silhouette(X_train, max_k, random_state)

    print(f"Optimal number of clusters (k): {best_k}")

    # Evaluate clustering performance
    silhouette = silhouette_score(X_train, labels)
    calinski_harabasz = calinski_harabasz_score(X_train, labels)
    davies_bouldin = davies_bouldin_score(X_train, labels)

    print(f"Silhouette Score: {silhouette:.2f}")
    print(f"Calinski-Harabasz Index: {calinski_harabasz:.2f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.2f}")

    # Optionally evaluate against ground truth when labels are available
    if evaluate_ground_truth and y_train is not None:
        ari = adjusted_rand_score(y_train, labels)
        nmi = normalized_mutual_info_score(y_train, labels)

        print(f"Adjusted Rand Index (ARI): {ari:.2f}")
        print(f"Normalized Mutual Information (NMI): {nmi:.2f}")


In [19]:
# Abalone run: "Sex" is label-encoded and "Rings" is excluded from the features.
target_column = "Rings"
categorical_columns = ["Sex"]
column_names = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "WholeWeight",
    "ShuckedWeight",
    "VisceraWeight",
    "ShellWeight",
    "Rings",
]
file_path = "../dataset abalone/abalone.data"

train_kmeans_model(
    file_path,
    column_names=column_names,
    categorical_columns=categorical_columns,
    target_column=target_column,
)

Optimal number of clusters (k): 2
Silhouette Score: 0.44
Calinski-Harabasz Index: 3362.01
Davies-Bouldin Index: 0.83


In [20]:
# Balance-scale run: no columns are label-encoded; "Class" is excluded from the features.
categorical_columns = None
target_column = "Class"
column_names = [
    "Class",
    "Left-Weight",
    "Left-Distance",
    "Right-Weight",
    "Right-Distance",
]
file_path = "../dataset balance+scale/balance-scale.data"

train_kmeans_model(
    file_path,
    column_names=column_names,
    categorical_columns=categorical_columns,
    target_column=target_column,
)


Optimal number of clusters (k): 8
Silhouette Score: 0.21
Calinski-Harabasz Index: 95.96
Davies-Bouldin Index: 1.19


In [21]:
# Breast-cancer run: every feature except "Deg_malig" is label-encoded;
# "Class" is excluded from the features.
target_column = "Class"
categorical_columns = [
    "Age",
    "Menopause",
    "Tumor_size",
    "Inv_nodes",
    "Node_caps",
    "Breast",
    "Breast_quad",
    "Irradiat",
]
column_names = [
    "Class",
    "Age",
    "Menopause",
    "Tumor_size",
    "Inv_nodes",
    "Node_caps",
    "Deg_malig",
    "Breast",
    "Breast_quad",
    "Irradiat",
]
file_path = "../dataset breast+cancer/breast-cancer.data"

train_kmeans_model(
    file_path,
    column_names=column_names,
    categorical_columns=categorical_columns,
    target_column=target_column,
)


Optimal number of clusters (k): 2
Silhouette Score: 0.21
Calinski-Harabasz Index: 44.49
Davies-Bouldin Index: 1.95


In [22]:
# Iris run: no columns are label-encoded; "Species" is excluded from the features.
categorical_columns = None
target_column = "Species"
column_names = [
    "SepalLength",
    "SepalWidth",
    "PetalLength",
    "PetalWidth",
    "Species",
]
file_path = "../dataset iris/iris.data"

train_kmeans_model(
    file_path,
    column_names=column_names,
    categorical_columns=categorical_columns,
    target_column=target_column,
)

Optimal number of clusters (k): 2
Silhouette Score: 0.56
Calinski-Harabasz Index: 148.74
Davies-Bouldin Index: 0.61


In [23]:
# Wine run: no target column is passed and nothing is label-encoded, so every
# named column is used as a feature.
# NOTE(review): only 13 names are listed. If the file is the standard UCI
# wine.data (presumably 14 fields per row, class label first), pandas would
# treat the unnamed leading field as the index rather than a feature -
# confirm against the file before relying on this.
target_column = None
categorical_columns = None
column_names = [
    "Alcohol",
    "Malic_acid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "OD280/OD315_of_diluted_wines",
    "Proline",
]
file_path = "../dataset wine/wine.data"

train_kmeans_model(
    file_path,
    column_names=column_names,
    categorical_columns=categorical_columns,
    target_column=target_column,
)

Optimal number of clusters (k): 3
Silhouette Score: 0.28
Calinski-Harabasz Index: 48.09
Davies-Bouldin Index: 1.42
