In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Read the raw table from disk
data = pd.read_csv('knn dataset.csv')

# Standardize every column before clustering: K-Means is distance-based,
# so unscaled columns with large ranges would dominate the result
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)

# Shared K-Means settings used by both the elbow sweep and the final fit.
# n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto' in 1.4): leaving it implicit makes the inertia curve and the
# final labels depend on the installed version and triggers a FutureWarning
# on 1.2/1.3.
KMEANS_RANDOM_STATE = 42
KMEANS_N_INIT = 10

# Determine the optimal number of clusters using the elbow method:
# fit one model per candidate k and record its inertia
inertia = []
cluster_range = range(1, 11)

for k in cluster_range:
    kmeans = KMeans(
        n_clusters=k,
        n_init=KMEANS_N_INIT,
        random_state=KMEANS_RANDOM_STATE,
    )
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve on an explicit Axes so the figure is self-contained
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(cluster_range, inertia, marker='o')
elbow_ax.set_title("Elbow Method for Optimal Number of Clusters")
elbow_ax.set_xlabel("Number of Clusters")
elbow_ax.set_ylabel("Inertia")
elbow_ax.set_xticks(cluster_range)
elbow_ax.grid()
plt.show()

# Optimal number of clusters (chosen by eye from the elbow plot above)
optimal_clusters = 3

# Apply K-Means clustering with the same settings as the elbow sweep
kmeans = KMeans(
    n_clusters=optimal_clusters,
    n_init=KMEANS_N_INIT,
    random_state=KMEANS_RANDOM_STATE,
)
clusters = kmeans.fit_predict(scaled_data)

# Add cluster labels to the dataset.
# NOTE: this mutates `data`; scaling `data` again without reloading the CSV
# would feed the 'Cluster' column into the scaler as if it were a feature.
data['Cluster'] = clusters

# 2D view of the clustering on the Feature_1 / Feature_2 columns;
# cluster membership is encoded by both color and marker shape
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Feature_1',
    y='Feature_2',
    hue='Cluster',
    style='Cluster',
    palette="viridis",
    s=100,
    alpha=0.8,
    ax=ax,
)
ax.set_title("2D Scatter Plot of Clusters")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
ax.legend(title="Cluster")
ax.grid()
plt.show()

# Silhouette score for cluster evaluation: the mean silhouette coefficient
# over all samples, ranging from -1 to 1 (higher = better-separated clusters).
# It is computed on the scaled features, i.e. the same space K-Means was fit in.
silhouette_avg = silhouette_score(scaled_data, clusters)
print(f"Silhouette Score: {silhouette_avg}")

In [None]:
#Elbow Method: The elbow method plot suggests that the optimal number of clusters is 3, as the curve starts to flatten beyond this point.

#Cluster Visualization: A scatter plot of the first two features shows the clusters formed by the K-Means algorithm. Each cluster is distinguished by different colors and markers.

#Silhouette Score: The silhouette score for the clustering is 0.1878 (on a scale from -1 to 1, where higher is better), indicating weak separation between clusters. This suggests that the clusters overlap or that the data has no strong cluster structure.