# Lab Sheet-6

# K-Means Clustering on the Iris Dataset

**To implement and demonstrate K-Means clustering for grouping
similar types of flowers using the Iris dataset. (The code uses scikit-learn's `KMeans`; K-Nearest Neighbors is a supervised classifier, not a clustering algorithm.) We will also visualize the clusters and
evaluate model performance using silhouette scores.**

**Dataset: https://www.kaggle.com/datasets/arshid/iris-flower-dataset/code**

# Import Important Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the Iris CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/knn-clustering-dataset/IRIS.csv')

In [None]:
# Preview the first rows to see which columns the dataset has and what the
# values look like (the bare expression is rendered as a table by Jupyter).
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# (number of rows, number of columns)
df.shape

**Distribution of Species**

In [None]:
# Count the rows per species.
# The species labels and their counts are taken from the same value_counts()
# result, so every count stays paired with its own label. Hard-coding the
# names next to `.values` mislabels rows whenever the count order differs, and
# the previous integer-keyed rename could only match once the column had been
# integer-encoded, which happens in a later cell.
species_count = df['species'].value_counts()
species_count.rename_axis('Species').reset_index(name='Count')

In [None]:
# Reshape to long form (one row per species / feature / value) so that the
# bars can be grouped by feature within each species.
feature_columns = ['sepal_length', 'petal_length', 'petal_width']
df_melted = df.melt(
    id_vars='species',
    value_vars=feature_columns,
    var_name='Feature',
    value_name='Measurement',
)

# Draw on an explicit Axes instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_melted, x='species', y='Measurement', hue='Feature', palette='Set2', ax=ax)
ax.set_title('Comparison of Sepal Length, Petal Length, and Petal Width Across Species')
ax.set_xlabel('Species')
ax.set_ylabel('Measurement (cm)')
ax.legend(title='Features')
plt.show()

In [None]:
# Pairwise scatter plots of every feature pair, coloured by species.
sns.set_style("darkgrid")
# `size` was renamed to `height` in seaborn 0.9 and has since been removed;
# passing `size=` raises a TypeError on current seaborn releases.
sns.pairplot(df, hue="species", height=3)
plt.show()

In [None]:
# Look at the rows once more before the species column is label-encoded
df.head()

In [None]:
# Replace the species names with integer class ids.
# NOTE: despite its name, `scaler` is a LabelEncoder, not a feature scaler.
scaler = LabelEncoder()
species_codes = scaler.fit_transform(df['species'])
df['species'] = species_codes

In [None]:
# Target: the integer-encoded species, kept only to evaluate the clusters.
y = df['species']
# Features: every column except the target. Leaving `species` in the feature
# matrix would feed the ground-truth label to the clustering algorithm, so the
# clusters (and any score computed against `y`) would be driven by the label.
x = df.drop(columns=['species'])

In [None]:
# Inspect the target vector (encoded species ids)
y

In [None]:
# Inspect the frame that will be scaled and clustered
x

In [None]:
# Keep the column labels: the scaler below returns a bare array, so they are
# needed to rebuild a labelled DataFrame afterwards.
cols = x.columns

In [None]:
# Min-max scale every column of `x`: learn the per-column range first, then
# apply it. The result `X` is a NumPy array without column labels.
ms = MinMaxScaler()
ms.fit(x)
X = ms.transform(x)

In [None]:
# Re-attach the original column labels to the scaled array.
# Pass `cols` directly: wrapping it in a list (`columns=[cols]`) makes pandas
# build a one-level MultiIndex, so `X['some_column']` would return a DataFrame
# instead of a Series and the column labels would be displayed as tuples.
X = pd.DataFrame(X, columns=cols)

In [None]:
# Check the scaled features and their column labels
X.head()

**The elbow method is used to determine the optimal number of clusters in KMeans by plotting the sum of squared distances (inertia) from each point to its assigned cluster center for different numbers of clusters**

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (`inertia_`) of each fit.
cluster_range = range(1, 11)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# A figure has to stand on its own: mark each k and label the axes so the
# "elbow" can be read off directly.
plt.plot(cluster_range, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)')
plt.xticks(list(cluster_range))
plt.show()

**3 clusters according to the Elbow method**

In [None]:
# Fit the 3-cluster model and get the cluster id of every sample.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
# fit_predict() already fits the estimator and returns the labels; fitting a
# second time only repeated the same work.
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster), expressed in
# the same scaled feature space as `X`
kmeans.cluster_centers_

In [None]:
# Within-cluster sum of squared distances of the fitted model
kmeans.inertia_

In [None]:
# K-Means numbers its clusters arbitrarily, so comparing `y == kmeans.labels_`
# only counts matches when a cluster id happens to equal the encoded species
# id. Instead, assign every cluster to its most common species (row-wise
# maximum of the cluster x species contingency table) and count the samples
# that agree with that assignment.
labels = pd.crosstab(kmeans.labels_, y.to_numpy()).max(axis=1).sum()

In [None]:
# Share of matching samples, as a percentage.
# `.2f` prints two decimals; the previous `02f` was a width specifier (minimum
# width 2, zero padded) and therefore printed the default six decimals.
print('Accuracy score: {0:.2f}'.format(labels / float(y.size) * 100))

In [None]:
plt.figure(figsize=(8, 6))

# Plot each cluster with its own colour, using feature columns 2 and 3 of X
for cluster_id, colour in enumerate(['red', 'blue', 'green']):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(X.iloc[in_cluster, 2], X.iloc[in_cluster, 3],
                s=100, c=colour, label='Cluster {}'.format(cluster_id + 1))

# Plot the centroids (same two feature columns)
plt.scatter(kmeans.cluster_centers_[:, 2], kmeans.cluster_centers_[:, 3],
            s=300, c='black', label='Centroids', marker='X')

# The `label=` arguments are only shown once a legend is drawn; axis labels
# come from the original column names, since X holds the scaled values.
plt.title('K-Means clusters (k = 3)')
plt.xlabel('{} (min-max scaled)'.format(cols[2]))
plt.ylabel('{} (min-max scaled)'.format(cols[3]))
plt.legend()
plt.show()

**Trying out with number of clusters = 2**

In [None]:
# Fit the 2-cluster model and get the cluster id of every sample.
kmeans_i = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
# The labels must come from `kmeans_i` itself. Calling `kmeans.fit_predict`
# here refitted the 3-cluster model and returned three-cluster labels, which
# do not belong to the 2-cluster centroids used below.
# fit_predict() also fits the estimator, so no separate fit() call is needed.
y_kmeans_i = kmeans_i.fit_predict(X)

In [None]:
# Within-cluster sum of squared distances of the 2-cluster model
kmeans_i.inertia_

In [None]:
# Cluster ids are arbitrary, so `y == kmeans_i.labels_` is not a meaningful
# match count. Assign every cluster to its most common species (row-wise
# maximum of the cluster x species contingency table) and count the samples
# that agree. With fewer clusters than species, at least one species can never
# be matched, so this count is bounded well below the number of samples.
labels_i = pd.crosstab(kmeans_i.labels_, y.to_numpy()).max(axis=1).sum()

In [None]:
# Share of matching samples, as a percentage.
# `.2f` prints two decimals; the previous `02f` was a width specifier (minimum
# width 2, zero padded) and therefore printed the default six decimals.
print('Accuracy score: {0:.2f}'.format(labels_i / float(y.size) * 100))

In [None]:
plt.figure(figsize=(8, 6))

# Plot each cluster with its own colour, using feature columns 2 and 3 of X
for cluster_id, colour in enumerate(['red', 'blue']):
    in_cluster = y_kmeans_i == cluster_id
    plt.scatter(X.iloc[in_cluster, 2], X.iloc[in_cluster, 3],
                s=100, c=colour, label='Cluster {}'.format(cluster_id + 1))

# Plot the centroids (same two feature columns)
plt.scatter(kmeans_i.cluster_centers_[:, 2], kmeans_i.cluster_centers_[:, 3],
            s=300, c='black', label='Centroids', marker='X')

# The `label=` arguments are only shown once a legend is drawn; axis labels
# come from the original column names, since X holds the scaled values.
plt.title('K-Means clusters (k = 2)')
plt.xlabel('{} (min-max scaled)'.format(cols[2]))
plt.ylabel('{} (min-max scaled)'.format(cols[3]))
plt.legend()
plt.show()

# Computing each centroid as the mean of the points assigned to its cluster

In [None]:
def compute_centroids(X, labels, n_clusters):
    """Return the mean of the samples assigned to each cluster.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates. Anything `np.asarray` can convert is accepted
        (NumPy array, nested list, DataFrame).
    labels : array-like of shape (n_samples,)
        Cluster id of each sample; row ``i`` of the result collects the
        samples whose label equals ``i``.
    n_clusters : int
        Number of rows in the result.

    Returns
    -------
    numpy.ndarray of shape (n_clusters, n_features)
        Row ``i`` is the feature-wise mean of the samples labelled ``i``, or
        NaN in every column when no sample carries that label.
    """
    # Coerce to arrays so the boolean mask below is element-wise; with a plain
    # list, `labels == i` would evaluate to a single bool.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)

    # Start from NaN so an empty cluster is reported explicitly rather than via
    # a "mean of empty slice" RuntimeWarning.
    centroids = np.full((n_clusters, X.shape[1]), np.nan)
    for i in range(n_clusters):
        members = X[labels == i]
        if len(members):
            centroids[i, :] = members.mean(axis=0)
    return centroids
# Recompute the centroids by hand from the `y_kmeans_i` assignment
manual_centroids = compute_centroids(X=X.to_numpy(), labels=y_kmeans_i, n_clusters=2)

In [None]:
plt.figure(figsize=(8, 6))

# Plot each cluster with its own colour, using feature columns 2 and 3 of X
for cluster_id, colour in enumerate(['red', 'blue']):
    in_cluster = y_kmeans_i == cluster_id
    plt.scatter(X.iloc[in_cluster, 2], X.iloc[in_cluster, 3],
                s=100, c=colour, label='Cluster {}'.format(cluster_id + 1))

# Overlay the manually computed centroids (same two feature columns)
plt.scatter(manual_centroids[:, 2], manual_centroids[:, 3],
            s=300, c='black', label='Manual Centroids', marker='x')

# The `label=` arguments are only shown once a legend is drawn; axis labels
# come from the original column names, since X holds the scaled values.
plt.title('K-Means clusters (k = 2) with manually computed centroids')
plt.xlabel('{} (min-max scaled)'.format(cols[2]))
plt.ylabel('{} (min-max scaled)'.format(cols[3]))
plt.legend()
plt.show()