In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import (
    silhouette_score, silhouette_samples,
    adjusted_rand_score, normalized_mutual_info_score,
    davies_bouldin_score, calinski_harabasz_score
)
import mlflow, os


mlflow.set_experiment("Clustering_Pipeline_Notebook")

os.makedirs("processed", exist_ok=True)
os.makedirs("plots", exist_ok=True)


In [31]:
with mlflow.start_run(run_name="preprocessing"):

   
    df = pd.read_csv("C:/AI workforce/mlops/mlops-clustering/data/data.csv")

 
    mlflow.log_param("rows_before", df.shape[0])
    mlflow.log_param("cols_before", df.shape[1])
    mlflow.log_metric("null_values_before", df.isnull().sum().sum())

    
    df_cleaned = df.dropna()

    
    mlflow.log_param("rows_after", df_cleaned.shape[0])
    mlflow.log_param("cols_after", df_cleaned.shape[1])
    mlflow.log_metric("null_values_after", df_cleaned.isnull().sum().sum())

  
    scaler = StandardScaler()
    X = df_cleaned[["x", "y"]]
    X_scaled = scaler.fit_transform(X)

    mlflow.log_param("scaled_columns", ["x", "y"])
    mlflow.log_metric("scaled_mean_x", X_scaled[:,0].mean())
    mlflow.log_metric("scaled_mean_y", X_scaled[:,1].mean())

    
    processed_path = "processed/data_scaled.csv"
    df_cleaned[["x", "y"]] = X_scaled
    df_cleaned.to_csv(processed_path, index=False)

    mlflow.log_artifact(processed_path)


In [32]:
with mlflow.start_run(run_name="analysis"):
    X_scaled = np.load("processed/X_scaled.npy")
    df = pd.read_csv("processed/data_with_labels.csv")

    true_labels = df["color"]
    metrics = {}

    kmeans = KMeans(n_clusters=3, random_state=42)
    df["KMeans"] = kmeans.fit_predict(X_scaled)

    metrics["KMeans"] = {
        "Silhouette": silhouette_score(X_scaled, df["KMeans"]),
        "ARI": adjusted_rand_score(true_labels, df["KMeans"]),
        "NMI": normalized_mutual_info_score(true_labels, df["KMeans"]),
        "DBI": davies_bouldin_score(X_scaled, df["KMeans"]),
        "CHI": calinski_harabasz_score(X_scaled, df["KMeans"])
    }

    
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    df["DBSCAN"] = dbscan.fit_predict(X_scaled)
    if len(set(df["DBSCAN"])) > 1:
        sil_db = silhouette_score(X_scaled, df["DBSCAN"])
    else:
        sil_db = None
    metrics["DBSCAN"] = {
        "Silhouette": sil_db,
        "ARI": adjusted_rand_score(true_labels, df["DBSCAN"]),
        "NMI": normalized_mutual_info_score(true_labels, df["DBSCAN"])
    }

  
    agg = AgglomerativeClustering(n_clusters=3)
    df["Agglomerative"] = agg.fit_predict(X_scaled)
    metrics["Agglomerative"] = {
        "Silhouette": silhouette_score(X_scaled, df["Agglomerative"]),
        "ARI": adjusted_rand_score(true_labels, df["Agglomerative"]),
        "NMI": normalized_mutual_info_score(true_labels, df["Agglomerative"])
    }

    df.to_csv("processed/data_with_clusters.csv", index=False)

   
    for algo, vals in metrics.items():
        for k, v in vals.items():
            if v is not None:
                mlflow.log_metric(f"{algo}_{k}", v)

    mlflow.log_artifact("processed/data_with_clusters.csv")

pd.DataFrame(metrics).T


Unnamed: 0,Silhouette,ARI,NMI,DBI,CHI
KMeans,0.610956,0.92272,0.881411,0.539443,766.872749
DBSCAN,0.486498,0.562213,0.665856,,
Agglomerative,0.610956,0.92272,0.881411,,
