In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# ----------------------------
# Load the preprocessed table
# ----------------------------
# CSV produced by Task 1.
df = pd.read_csv("iris_processed.csv")
print("Data shape:", df.shape)

# Names of the one-hot encoded label columns.
species_cols = ["species_0", "species_1", "species_2"]

# Integer class label per row: position of the largest value among the
# one-hot columns.
y = df[species_cols].values.argmax(axis=1)

# Feature matrix: everything except the one-hot label columns.
X = df.drop(columns=species_cols)

# ----------------------------
# K-Means with three clusters
# ----------------------------
# fit() returns the fitted estimator, so construction and fitting are chained.
kmeans_3 = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X)
clusters_3 = kmeans_3.labels_

# Agreement between the cluster assignments and the reconstructed class
# labels, measured with the Adjusted Rand Index.
ari_3 = adjusted_rand_score(y, clusters_3)
print(f"Adjusted Rand Index (k=3): {ari_3:.4f}")

# ----------------------------
# Compare ARI for k = 2, 3 and 4
# ----------------------------
# k=3 is refitted here as well so that ari_scores holds all three values.
ari_scores = {}
for k in (2, 3, 4):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    preds = km.labels_
    ari_scores[k] = adjusted_rand_score(y, preds)
    ari = ari_scores[k]
    print(f"Adjusted Rand Index (k={k}): {ari:.4f}")

# ----------------------------
# Elbow curve: inertia as a function of k
# ----------------------------
# Fit one model per k in 1..9 and record its inertia.
inertia_values = []
K_range = range(1, 10)
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertia_values.append(km.inertia_)

# Draw on an explicit figure/axes pair, write it to disk, then close it so
# the figure is only saved, not left open.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K_range, inertia_values, marker="o")
ax.set_title("Elbow Curve for Optimal k")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia")
ax.grid(True)
fig.savefig("elbow_curve.png")
plt.close(fig)
print("Elbow curve saved as elbow_curve.png")

# ----------------------------
# Scatter plot of the k=3 clusters (petal length vs width)
# ----------------------------
# NOTE(review): columns are picked by position; indices 2 and 3 are assumed
# to be petal length and petal width - confirm against the CSV column order.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=df.iloc[:, 2],
    y=df.iloc[:, 3],
    hue=clusters_3,  # colour each point by its k=3 cluster label
    palette="viridis",
    ax=ax,
)
ax.set_title("K-Means Clusters (k=3)")
ax.set_xlabel("Petal Length")
ax.set_ylabel("Petal Width")
ax.legend(title="Cluster")
fig.savefig("clusters_k3.png")
plt.close(fig)
print("Cluster plot saved as clusters_k3.png")

# ----------------------------
# Persist the ARI results
# ----------------------------
# One row per (k, ARI) pair collected in ari_scores.
ari_table = pd.DataFrame(list(ari_scores.items()), columns=["k", "ARI"])
ari_table.to_csv("ari_scores.csv", index=False)
print("ARI scores saved as ari_scores.csv")


Data shape: (150, 7)
Adjusted Rand Index (k=3): 0.7163
Adjusted Rand Index (k=2): 0.5681
Adjusted Rand Index (k=3): 0.7163
Adjusted Rand Index (k=4): 0.6231
Elbow curve saved as elbow_curve.png
Cluster plot saved as clusters_k3.png
ARI scores saved as ari_scores.csv
