In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the clustering dataset from its CSV file (relative path).
df = pd.read_csv("../dataset/student_clustering.csv")

# Feature matrix used by k-means: the 'cgpa' and 'iq' columns as a NumPy array,
# one row per record.
X = df.loc[:, ['cgpa', 'iq']].to_numpy()

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point, a NumPy array of coordinates.
        x2: Second point, broadcast-compatible with ``x1``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    delta = x1 - x2
    squared_norm = np.sum(delta ** 2)
    return np.sqrt(squared_norm)

def initialize_centroids(X, k):
    """Pick ``k`` starting centroids from the rows of ``X``.

    Seeds are drawn uniformly at random from the *distinct* rows of ``X``
    without replacement, so two centroids never start on the same point.
    Identical seeds would leave one cluster empty after the first assignment
    step.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        k: Number of centroids to pick.

    Returns:
        Float array of shape (k, n_features) holding the chosen seeds.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when ``X`` has no
            rows and ``k > 0``.
    """
    # De-duplicate first: distinct row indices can still hold identical points.
    distinct_rows = np.unique(X, axis=0)
    # Distinct seeds are only possible when at least k distinct points exist;
    # otherwise fall back to sampling with replacement as a best effort.
    allow_repeats = k > len(distinct_rows)
    picks = np.random.choice(len(distinct_rows), size=k, replace=allow_repeats)
    # Always hand back a float copy so later in-place updates never touch X.
    return distinct_rows[picks].astype(float)

def assign_clusters(X, centroids):
    """Label every sample with the index of its nearest centroid.

    Args:
        X: Iterable of samples (rows of the feature matrix).
        centroids: Iterable of centroid coordinates.

    Returns:
        1-D float array of length ``len(X)``; entry ``i`` is the index of the
        centroid closest (by ``euclidean_distance``) to ``X[i]``. Ties go to
        the lowest index, as with ``np.argmin``.
    """
    def nearest_centroid(sample):
        # Distance from this sample to each centroid, in centroid order.
        gaps = [euclidean_distance(sample, center) for center in centroids]
        return np.argmin(gaps)

    labels = [nearest_centroid(sample) for sample in X]
    # Labels are stored as floats, so callers cast with int() before indexing.
    return np.array(labels, dtype=float)

def update_centroids(X, clusters, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    A cluster with no assigned samples is re-seeded at a randomly chosen row
    of ``X`` instead of being averaged. The mean of an empty selection is NaN,
    and a NaN centroid corrupts every later nearest-centroid lookup.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        clusters: 1-D array of length n_samples with each sample's cluster
            index (compared with ``==`` against 0..k-1).
        k: Number of clusters.

    Returns:
        Float array of shape (k, n_features) with the updated centroids.
    """
    n_samples, n_features = X.shape
    centroids = np.zeros((k, n_features))
    for label in range(k):
        members = X[clusters == label]
        if len(members) == 0 and n_samples > 0:
            # Empty cluster: restart it from a random data point.
            centroids[label] = X[np.random.randint(n_samples)]
        else:
            centroids[label] = np.mean(members, axis=0)
    return centroids

def kmeans(X, k, max_iters=100):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Alternates between assigning samples to their nearest centroid and
    recomputing centroids, stopping when the centroids no longer change or
    after ``max_iters`` rounds.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        k: Number of clusters.
        max_iters: Maximum number of assign/update rounds. With a value of 0
            or less, the samples are simply labelled against the initial
            centroids.

    Returns:
        Tuple ``(clusters, centroids)``: the per-sample labels as returned by
        ``assign_clusters`` and the (k, n_features) centroid array.
    """
    centroids = initialize_centroids(X, k)
    clusters = None
    for _ in range(max_iters):
        clusters = assign_clusters(X, centroids)
        prev_centroids = centroids
        centroids = update_centroids(X, clusters, k)
        # NaN never compares equal under ==, so a centroid containing NaN would
        # keep the loop running until max_iters; treat matching NaNs as equal.
        if np.array_equal(prev_centroids, centroids, equal_nan=True):
            break
    if clusters is None:
        # The loop body never ran (max_iters <= 0); still return valid labels
        # rather than failing on an unbound name.
        clusters = assign_clusters(X, centroids)
    return clusters, centroids

# Elbow method: within-cluster sum of squares (WCSS) for k = 1..10.
wcss = []
for i in range(1, 11):
    clusters, centroids = kmeans(X, i)
    # Squared distance from every sample to the centroid of its own cluster.
    wcss.append(
        np.sum([
            euclidean_distance(point, centroids[int(label)]) ** 2
            for point, label in zip(X, clusters)
        ])
    )

# plt.plot(range(1, 11), wcss, marker='o')
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

# Final clustering with k=4 (presumably the elbow of the WCSS curve — confirm).
# This rebinds `clusters` and `centroids` left over from the elbow loop above.
clusters, centroids = kmeans(X, 4)

# plt.figure(figsize=(10, 6))
# for i in range(4):
#     plt.scatter(X[clusters == i][:, 0], X[clusters == i][:, 1], label=f'Cluster {i}', alpha=0.6, s=100)
# plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red', label='Centroids')
# plt.xlabel('CGPA')
# plt.ylabel('IQ')
# plt.title('K-means Clustering')
# plt.legend()
# plt.show()

In [2]:
from collections import defaultdict

# Group the samples by cluster label; keys appear in order of first occurrence.
clustered_data = defaultdict(list)
for point, cluster_label in zip(X, clusters):
    # Labels are stored as floats, so cast to int for clean dictionary keys.
    clustered_data[int(cluster_label)].append(point)

# Print every cluster followed by its member points, one point per line.
for cluster_label in clustered_data:
    print(f"Cluster {cluster_label}:")
    for point in clustered_data[cluster_label]:
        print(point)
    print()


Cluster 1:
[ 5.13 88.  ]
[ 4.6 86. ]
[ 5. 88.]
[ 4.86 86.  ]
[ 4.78 87.  ]
[ 4.96 88.  ]
[ 4.86 87.  ]
[ 5.44 84.  ]
[ 5.34 85.  ]
[ 5.31 86.  ]
[ 5.14 83.  ]
[ 4.95 86.  ]
[ 5.21 87.  ]
[ 4.91 85.  ]
[ 5.28 83.  ]
[ 5.15 88.  ]
[ 4.9 85. ]
[ 4.89 88.  ]
[ 5.05 86.  ]
[ 4.98 91.  ]
[ 5.01 86.  ]
[ 4.95 88.  ]
[ 4.96 89.  ]
[ 4.85 86.  ]
[ 4.76 90.  ]
[ 4.98 87.  ]
[ 4.78 87.  ]
[ 5.2 85. ]
[ 5.05 87.  ]
[ 5.01 83.  ]
[ 4.77 86.  ]
[ 4.68 87.  ]
[ 4.81 85.  ]
[ 5.03 87.  ]
[ 4.98 87.  ]
[ 5.32 88.  ]
[ 4.86 88.  ]
[ 4.89 85.  ]
[ 4.88 86.  ]
[ 5.01 86.  ]
[ 4.67 86.  ]
[ 5.15 85.  ]
[ 4.97 88.  ]
[ 4.87 88.  ]
[ 5.2 89. ]
[ 4.99 88.  ]
[ 4.79 88.  ]
[ 4.76 89.  ]
[ 4.78 85.  ]
[ 4.68 89.  ]

Cluster 2:
[  5.9 113. ]
[  5.45 110.  ]
[  5.88 109.  ]
[  5.79 110.  ]
[  6.1 110. ]
[  5.71 108.  ]
[  5.5 111. ]
[  6.05 111.  ]
[  5.84 113.  ]
[  5.43 106.  ]
[  6.01 112.  ]
[  5.32 106.  ]
[  5.91 108.  ]
[  5.57 113.  ]
[  6.4 108. ]
[  5.67 109.  ]
[  6.05 108.  ]
[  5.85 111.  ]
[  5.87 1