In [4]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import mlflow
import mlflow.sklearn

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import (
    silhouette_score, silhouette_samples,
    adjusted_rand_score, normalized_mutual_info_score,
    davies_bouldin_score, calinski_harabasz_score
)



In [5]:

# Location of the input CSV. The original absolute path is kept as the default
# so existing setups keep working; set the CLUSTERING_DATA_PATH environment
# variable to run the notebook on another machine without editing this cell.
# `or` (rather than a .get default) also falls back when the variable is set
# but empty.
DATA_PATH = (
    os.environ.get("CLUSTERING_DATA_PATH")
    or "C:/AI workforce/mlops/mlops-clustering/data/data.csv"
)

df = pd.read_csv(DATA_PATH)

# The two coordinate columns are the clustering features; "color" is used
# further down as the reference labelling for ARI / NMI.
X = df[["x", "y"]]
true_labels = df["color"]

# Standardize the features. The fitted scaler is kept because the KMeans cell
# uses it to map cluster centres back to the original units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
def log_plot(fig, name):
    """Save a figure as ``<name>.png`` and log it as an MLflow artifact.

    Parameters
    ----------
    fig
        Object exposing ``savefig`` (the Matplotlib figures created with
        ``plt.subplots`` in this notebook).
    name : str
        File stem; the image is written to ``f"{name}.png"``, i.e. relative
        to the current working directory unless ``name`` contains a directory.

    Notes
    -----
    The figure is closed in a ``finally`` clause so that it is released even
    when saving or artifact logging raises; the exception still propagates to
    the caller.
    """
    path = f"{name}.png"
    try:
        fig.savefig(path, bbox_inches="tight")
        mlflow.log_artifact(path)
    finally:
        # Always release the figure, otherwise a failed save/upload leaves it
        # open in the kernel.
        plt.close(fig)

In [None]:
with mlflow.start_run(run_name="KMeans"):
    # Single source of truth for the hyperparameters so the fitted model, the
    # logged parameter and the plot title cannot drift apart.
    n_clusters = 3
    random_state = 42

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df["KMeans"] = kmeans.fit_predict(X_scaled)

    mlflow.log_param("algorithm", "KMeans")
    mlflow.log_param("n_clusters", n_clusters)

    # Internal metrics (Silhouette, DBI, CHI) are computed on the scaled
    # features; external metrics (ARI, NMI) compare against `true_labels`.
    mlflow.log_metric("Silhouette", silhouette_score(X_scaled, df["KMeans"]))
    mlflow.log_metric("ARI", adjusted_rand_score(true_labels, df["KMeans"]))
    mlflow.log_metric("NMI", normalized_mutual_info_score(true_labels, df["KMeans"]))
    mlflow.log_metric("DBI", davies_bouldin_score(X_scaled, df["KMeans"]))
    mlflow.log_metric("CHI", calinski_harabasz_score(X_scaled, df["KMeans"]))

    # Cluster centres live in scaled space; map them back to the original
    # units before exporting and plotting them over the raw data.
    centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
    pd.DataFrame(centers_original, columns=["x", "y"]).to_csv("kmeans_centers.csv", index=False)
    mlflow.log_artifact("kmeans_centers.csv")

    # Scatter of the assignments with the centres overlaid.
    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(df["x"], df["y"], c=df["KMeans"], cmap="Set1", s=40, edgecolor="k", alpha=0.7)
    ax.scatter(centers_original[:,0], centers_original[:,1], c="yellow", s=300, marker="*", edgecolor="k", label="Centers")
    ax.set_title(f"KMeans Clustering (k={n_clusters})")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    # Without a legend the "Centers" label above is never rendered.
    ax.legend()
    log_plot(fig, "kmeans_clusters")

    # Model-selection sweep: inertia (elbow) and silhouette for k = 2..10,
    # using the same seed as the main model.
    k_values = range(2, 11)
    inertias, silhouettes = [], []
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=random_state)
        labels = km.fit_predict(X_scaled)
        inertias.append(km.inertia_)
        silhouettes.append(silhouette_score(X_scaled, labels))

    fig, axes = plt.subplots(1,2, figsize=(12,5))
    axes[0].plot(k_values, inertias, marker="o")
    axes[0].set_title("Elbow Method")
    axes[0].set_xlabel("Number of clusters (k)")
    axes[0].set_ylabel("Inertia")

    axes[1].plot(k_values, silhouettes, marker="o", color="orange")
    axes[1].set_title("Silhouette Scores")
    axes[1].set_xlabel("Number of clusters (k)")
    axes[1].set_ylabel("Silhouette Score")
    log_plot(fig, "kmeans_elbow_silhouette")

In [None]:

with mlflow.start_run(run_name="DBSCAN"):
    # Single source of truth for the hyperparameters so the estimator and the
    # logged parameters cannot drift apart.
    eps = 0.5
    min_samples = 5

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df["DBSCAN"] = dbscan.fit_predict(X_scaled)

    mlflow.log_param("algorithm", "DBSCAN")
    mlflow.log_param("eps", eps)
    mlflow.log_param("min_samples", min_samples)

    # silhouette_score raises ValueError unless
    # 2 <= number of distinct labels <= n_samples - 1, so guard both bounds
    # and fall back to the -1 sentinel otherwise. As before, the noise label
    # (-1) is counted as a label here.
    n_labels = int(df["DBSCAN"].nunique())
    if 2 <= n_labels <= len(df) - 1:
        mlflow.log_metric("Silhouette", silhouette_score(X_scaled, df["DBSCAN"]))
    else:
        mlflow.log_metric("Silhouette", -1)
    mlflow.log_metric("ARI", adjusted_rand_score(true_labels, df["DBSCAN"]))
    mlflow.log_metric("NMI", normalized_mutual_info_score(true_labels, df["DBSCAN"]))

    # DBSCAN chooses the number of clusters itself and labels noise as -1;
    # record both so the scores above can be interpreted.
    n_noise = int((df["DBSCAN"] == -1).sum())
    n_clusters_found = n_labels - (1 if n_noise else 0)
    mlflow.log_metric("n_clusters_found", n_clusters_found)
    mlflow.log_metric("n_noise_points", n_noise)

    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(df["x"], df["y"], c=df["DBSCAN"], cmap="Set1", s=40, edgecolor="k", alpha=0.7)
    ax.set_title("DBSCAN Clustering")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    log_plot(fig, "dbscan_clusters")


In [9]:
with mlflow.start_run(run_name="Agglomerative"):
    # Fit hierarchical clustering on the standardized features and store the
    # assignments next to the raw data.
    agg = AgglomerativeClustering(n_clusters=3)
    df["Agglomerative"] = agg.fit_predict(X_scaled)
    assignments = df["Agglomerative"]

    # Run parameters, logged in a fixed order.
    for param_name, param_value in (("algorithm", "Agglomerative"), ("n_clusters", 3)):
        mlflow.log_param(param_name, param_value)

    # Each entry is (metric name, scoring function, first argument). Scores
    # are computed and logged one at a time, in this order: Silhouette uses
    # the scaled features, ARI / NMI compare against the reference labels.
    metric_table = (
        ("Silhouette", silhouette_score, X_scaled),
        ("ARI", adjusted_rand_score, true_labels),
        ("NMI", normalized_mutual_info_score, true_labels),
    )
    for metric_name, scorer, reference in metric_table:
        mlflow.log_metric(metric_name, scorer(reference, assignments))

    # Scatter of the raw coordinates coloured by assigned cluster.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(
        df["x"],
        df["y"],
        c=assignments,
        cmap="Set1",
        s=40,
        edgecolor="k",
        alpha=0.7,
    )
    ax.set_title("Agglomerative Clustering")
    log_plot(fig, "agg_clusters")
