In [None]:
# Load image
# Reshape image to 2D array of pixels (each pixel = [R, G, B])

# Choose number of clusters (k)

# Initialize k cluster centers randomly

# Repeat until convergence or max iterations:
    # Assign each pixel to the nearest cluster center
    # Update cluster centers as the mean of assigned pixels

# Replace each pixel with its cluster center color

# Reshape back to original image shape and display/save result

In [4]:
%pip install torch torchvision torchaudio

Collecting torch
  Using cached torch-2.7.0-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.7.0-cp313-none-macosx_11_0_arm64.whl (68.6 MB)
Downloading torchvision-0.22.0-cp313-cp313-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision.transforms import PILToTensor, ToTensor
from torchvision.datasets import VOCSegmentation
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt


#transformation class
class ToTensorPair:
    def __call__(self, image, target):
        image = ToTensor()(image)
        target = torch.from_numpy(np.array(target)).long()
        return image, target
    

data_root = '/Users/sarayumum/Desktop/ucdavis/ecs171/subject-isolation/dataset'
transform = ToTensorPair()
train_dataset = VOCSegmentation(
    root=data_root,
    year="2012",
    image_set="train",
    download=True, 
    transforms=transform
)

val_dataset = VOCSegmentation(
    root=data_root,
    year="2012",
    image_set="val",
    download=True, 
    transforms=transform
)

BATCH_SIZE = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

print("Data Loaded")

Data Loaded


In [2]:
image, mask = train_dataset[0]
print("Image shape:", image.shape) 
print("Mask shape:", mask.shape) 

Image shape: torch.Size([3, 281, 500])
Mask shape: torch.Size([281, 500])


In [3]:
image_perumated = image.permute(1, 2, 0)
print("Image shape:", image_perumated)

Image shape: tensor([[[0.5137, 0.7569, 0.8157],
         [0.5137, 0.7569, 0.8157],
         [0.5137, 0.7569, 0.8157],
         ...,
         [0.7137, 0.9098, 0.9451],
         [0.7137, 0.9098, 0.9451],
         [0.7137, 0.9098, 0.9451]],

        [[0.5176, 0.7608, 0.8196],
         [0.5176, 0.7608, 0.8196],
         [0.5216, 0.7647, 0.8235],
         ...,
         [0.7176, 0.9137, 0.9490],
         [0.7176, 0.9137, 0.9490],
         [0.7137, 0.9098, 0.9451]],

        [[0.5216, 0.7647, 0.8235],
         [0.5216, 0.7647, 0.8235],
         [0.5255, 0.7686, 0.8275],
         ...,
         [0.7176, 0.9137, 0.9490],
         [0.7176, 0.9137, 0.9490],
         [0.7137, 0.9098, 0.9451]],

        ...,

        [[0.0235, 0.0588, 0.0549],
         [0.0196, 0.0549, 0.0510],
         [0.0275, 0.0510, 0.0510],
         ...,
         [0.0627, 0.1333, 0.1490],
         [0.0627, 0.1333, 0.1490],
         [0.0627, 0.1333, 0.1490]],

        [[0.0196, 0.0549, 0.0510],
         [0.0353, 0.0706, 0.0667],

In [4]:
image_reshaped = image_perumated.reshape(-1, 3)
print("Image shape:", image_reshaped)

Image shape: tensor([[0.5137, 0.7569, 0.8157],
        [0.5137, 0.7569, 0.8157],
        [0.5137, 0.7569, 0.8157],
        ...,
        [0.0627, 0.1294, 0.1569],
        [0.0471, 0.1137, 0.1412],
        [0.0667, 0.1333, 0.1608]])


In [5]:
train_pixels = []
val_pixels = []

def pixels(dataset):
    pixels = []
    for image, _ in dataset:
        image_perumated = image.permute(1, 2, 0)
        image_reshaped = image_perumated.reshape(-1, 3)
        pixels.append(image_reshaped)
    return pixels

train_pixels = pixels(train_dataset)
val_pixels = pixels(val_dataset)
print(train_pixels[1])

tensor([[0.6431, 0.6471, 0.5765],
        [0.6667, 0.6706, 0.6000],
        [0.6706, 0.6824, 0.6000],
        ...,
        [0.2863, 0.3059, 0.3216],
        [0.2784, 0.2980, 0.3216],
        [0.2667, 0.2863, 0.3098]])


In [10]:
print(train_pixels[0].shape)

torch.Size([140500, 3])


In [6]:
#concatenate the train and val pixels
total_train_pixels = torch.cat(train_pixels, dim  = 0)
total_train_pixels = np.float32(total_train_pixels)

classes = 21 #number of classes + background
num_train_pixels = len(total_train_pixels)
indices = np.random.choice(num_train_pixels, classes, replace=False)
centers = total_train_pixels[indices]
print(centers)
print(total_train_pixels)

[[0.8980392  0.74509805 0.48235294]
 [0.04313726 0.01176471 0.00392157]
 [0.47843137 0.47058824 0.16862746]
 [0.94509804 0.9607843  0.972549  ]
 [1.         0.92941177 0.7921569 ]
 [0.3647059  0.47058824 0.6784314 ]
 [0.9843137  0.99215686 0.9882353 ]
 [0.0627451  0.03921569 0.04705882]
 [0.6431373  0.7176471  0.78431374]
 [0.14117648 0.12941177 0.10980392]
 [0.06666667 0.08235294 0.07843138]
 [0.65882355 0.67058825 0.44705883]
 [0.85490197 0.827451   0.654902  ]
 [0.69411767 0.74509805 0.8117647 ]
 [0.627451   0.6745098  0.7294118 ]
 [0.3254902  0.53333336 0.74509805]
 [0.22745098 0.24313726 0.25490198]
 [0.70980394 0.7254902  0.49411765]
 [0.24705882 0.3764706  0.34117648]
 [0.43137255 0.41568628 0.41960785]
 [0.07450981 0.1254902  0.15686275]]
[[0.5137255  0.75686276 0.8156863 ]
 [0.5137255  0.75686276 0.8156863 ]
 [0.5137255  0.75686276 0.8156863 ]
 ...
 [0.12941177 0.30588236 0.3254902 ]
 [0.04705882 0.34117648 0.3647059 ]
 [0.14901961 0.29803923 0.33333334]]


In [8]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

pixels = np.uint8(total_train_pixels * 255)
k = 21  
kmeans = MiniBatchKMeans(n_clusters=21, batch_size=10000, random_state=0)
kmeans.fit(pixels)
print("KMeans training complete.")

# Segment a sample image using fitted KMeans
def segment_image_with_kmeans(image_tensor, kmeans_model):
    h, w = image_tensor.shape[1], image_tensor.shape[2]
    image_np = image_tensor.permute(1, 2, 0).reshape(-1, 3).numpy().astype(np.float32)
    labels = kmeans_model.predict(image_np)
    segmented_pixels = kmeans_model.cluster_centers_[labels]
    segmented_image = segmented_pixels.reshape(h, w, 3)
    return segmented_image

# Visualize results for first few val images
for i in range(3):
    image_tensor, _ = val_dataset[i]
    segmented = segment_image_with_kmeans(image_tensor, kmeans)

    # Show original and segmented side-by-side
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.imshow(image_tensor.permute(1, 2, 0).numpy())
    plt.title("Original")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.imshow(segmented)
    plt.title("Segmented with KMeans")
    plt.axis("off")
    plt.show()

KMeans training complete.


ValueError: Buffer dtype mismatch, expected 'const float' but got 'double'