<a href="https://colab.research.google.com/github/sharvaniyadav/Assignment-1-ECE1513-Intro-To-ML/blob/main/Assignment_1_ECE1513_IntroToML_SharvaniYadav.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Course: ECE1513 Introduction to Machine Learning (Fall 2025)
# Assignment 1
# Student Name: Sharvani Yadav
# Student Number: 1008289870

# Part 1: Clustering with k-means
# The code written below for Part 1 implements k-means clustering from scratch using only NumPy.
# It uses an initialization method discussed in class (centroids sampled from the dataset) with a fixed seed for reproducibility.
# It runs the algorithm for k = 2..7, computes the distortion, and plots an elbow curve.

# -------------------------------------------------------------------------------------------------
# Part 1.1: Implemented k-means using a single initialization method (sampling points from dataset)
# -------------------------------------------------------------------------------------------------

# Import needed libraries
# numpy: for math with arrays
# sklearn.datasets: to load the Breast Cancer dataset

# -----------------------------
# STEP 1: Loaded the dataset
# -----------------------------
from sklearn.datasets import load_breast_cancer

# Load the UCI ML Breast Cancer dataset bundled with scikit-learn.
# The returned object exposes the feature matrix through its `.data` attribute.
data = load_breast_cancer()

# Clustering is unsupervised, so only the features (X) are kept; the labels are not used.
# astype(float) gives a floating-point array for the mean/std arithmetic done later.
X = data.data.astype(float)

# Quick sanity printout of what was loaded, before any preprocessing.
print("=== DATASET LOADED ===")
print("Shape of X (rows, columns):", X.shape)   # should be 569 rows × 30 features
print("First row of raw data (before standardization):")
print(X[0])  # shows the raw feature values, which sit on very different scales
print()

# -----------------------------
# STEP 2: Made all features the same scale (standardized)
# -----------------------------

# Imported NumPy for math operations
import numpy as np

# Per-feature centre (mean) and spread (population standard deviation, ddof=0)
feature_means = X.mean(axis=0)
feature_stds = X.std(axis=0, ddof=0)

# A constant feature has zero spread; use 1 there so the division below is safe
feature_stds = np.where(feature_stds == 0, 1.0, feature_stds)

# Standardize: shift every feature to mean 0, then rescale it to spread ~1
Xs = X - feature_means
Xs /= feature_stds

print("=== AFTER STANDARDIZATION ===")
print("First row after standardization (values were now near 0):")
print(Xs[0].round(3))  # three decimals keeps the row readable

# Sanity check: the first few features should report mean ~0 and std ~1
print("\nMean of first 5 features:", Xs[:, :5].mean(axis=0).round(3))
print("Std of first 5 features:", Xs[:, :5].std(axis=0, ddof=0).round(3))




=== DATASET LOADED ===
Shape of X (rows, columns): (569, 30)
First row of raw data (before standardization):
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]

=== AFTER STANDARDIZATION ===
First row after standardization (values were now near 0):
[ 1.097 -2.073  1.27   0.984  1.568  3.284  2.653  2.532  2.218  2.256
  2.49  -0.565  2.833  2.488 -0.214  1.317  0.724  0.661  1.149  0.907
  1.887 -1.359  2.304  2.001  1.308  2.617  2.11   2.296  2.751  1.937]

Mean of first 5 features: [-0. -0. -0. -0.  0.]
Std of first 5 features: [1. 1. 1. 1. 1.]


In [10]:
# ----------------------------------------------------
# STEP 3: Made the small functions needed for k-means
# ----------------------------------------------------

def init_centroids_from_data(X, k, seed=42):
    """Pick k distinct rows of X to use as the starting centroids.

    Parameters
    ----------
    X : array whose rows are the data points.
    k : number of centroids to draw; rows are sampled without replacement,
        so the same row is never picked twice.
    seed : seed for NumPy's default_rng, so re-running the notebook selects
        the same rows every time.

    Returns
    -------
    A copy of the k selected rows (modifying it does not touch X).
    """
    n_samples = X.shape[0]
    generator = np.random.default_rng(seed)
    chosen_rows = generator.choice(n_samples, size=k, replace=False)
    starting_points = X[chosen_rows]
    return starting_points.copy()

def assign_labels(X, centroids):
    """Return, for every row of X, the index of its nearest centroid.

    Distances are compared as squared Euclidean distances; skipping the
    square root does not change which centroid is closest. When two centroids
    are equally close, the one with the lower index is chosen (argmin rule).

    Parameters
    ----------
    X : 2-D array of points, one per row.
    centroids : 2-D array of centroids, one per row, same number of columns as X.

    Returns
    -------
    1-D integer array with one centroid index per point.
    """
    # Broadcasting pairs every point with every centroid: (n, 1, d) - (k, d) -> (n, k, d)
    deltas = X[:, np.newaxis] - centroids
    squared_dist = (deltas * deltas).sum(axis=-1)
    return squared_dist.argmin(axis=-1)

def recompute_centroids(X, labels, k):
    """Move each centroid to the mean of the points currently assigned to it.

    Parameters
    ----------
    X : 2-D array of points, one per row.
    labels : cluster index (0..k-1) for each row of X.
    k : number of clusters.

    Returns
    -------
    Array of shape (k, n_features) holding the updated centroids. A cluster
    with no assigned points falls back to the mean of the whole dataset so
    the algorithm can keep running. Floating-point (or complex) input keeps
    its dtype; any other dtype (e.g. integers) yields float64 centroids.
    """
    labels = np.asarray(labels)
    n_features = X.shape[1]

    # Means are fractional in general. Allocating the output with an integer
    # dtype would silently truncate them, so only inexact dtypes are reused.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
    new_centroids = np.empty((k, n_features), dtype=out_dtype)

    global_mean = X.mean(axis=0)
    for j in range(k):
        members = X[labels == j]
        if members.shape[0] > 0:
            new_centroids[j] = members.mean(axis=0)
        else:
            # Empty cluster: fall back to the overall data mean
            new_centroids[j] = global_mean
    return new_centroids


In [13]:
# -----------------------------
# STEP 4: Put the pieces together (full k-means)
# -----------------------------

def kmeans(X, k, *, max_iter=300, tol=1e-4, seed=42):
    """Run k-means clustering on X and return the centroids and point labels.

    Starting from k centroids sampled from the data, the loop alternates:
      1) assign every point to its closest centroid
      2) move each centroid to the average of its assigned points
    and stops once the centroids barely move, or after max_iter rounds.

    Parameters
    ----------
    X : 2-D array of points, one per row.
    k : number of clusters.
    max_iter : upper bound on the number of assign/update rounds.
    tol : stop early when the overall centroid movement (norm of the
        difference between consecutive centroid arrays) drops below this.
    seed : seed forwarded to the centroid initialization for reproducibility.

    Returns
    -------
    (centroids, labels): the final centroids and, for every point, the index
    of the closest of those final centroids.
    """
    # Start from k centroids picked from the data itself
    centroids = init_centroids_from_data(X, k, seed)

    for _ in range(max_iter):
        # Step 1: label each point with its nearest current centroid
        current_labels = assign_labels(X, centroids)

        # Step 2: recompute the centroids from those labels
        new_centroids = recompute_centroids(X, current_labels, k)

        # How far the centroids moved in this round
        shift = np.linalg.norm(new_centroids - centroids)

        centroids = new_centroids

        # Movement below the tolerance means the solution has settled
        if shift < tol:
            break

    # Label the points against the centroids that are actually returned.
    # Inside the loop the labels are computed BEFORE the centroid update, so
    # without this step they could disagree with the final centroids when the
    # loop ends on max_iter or on a small non-zero shift. It also guarantees
    # labels exist when max_iter is 0.
    labels = assign_labels(X, centroids)

    return centroids, labels