<a href="https://colab.research.google.com/github/sangho24/sogang/blob/main/EC5320_2024_2_Week10_KMEANS_v2_20200572.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EC5320 Week 10 code: K-Means to group colors from an image

2024.11.3.<br>

Author: Hyunjoo Yang (hyang@sogang.ac.kr)<br><br>

This notebook uses scikit-learn's K-Means and Mini-Batch K-Means algorithms.<br><br>

Data source:<br>
ESA Sentinel-2 satellite images from 2020 <br><br>

The code is from a paper by Jeong and Yang:<br>
https://www.researchgate.net/publication/357418184_Using_maps_to_predict_economic_activity <br><br>

For information on K-Means, refer to:<br>
https://scikit-learn.org/stable/modules/clustering.html <br><br>

Watch the iteration process of K-Means:<br>
https://www.youtube.com/watch?v=5I3Ei69I40s

# 1. Upload files

In [None]:
# Folder the archive is extracted into; the unzip and glob cells both read it.
dir_nm = './pop_data'

# File name of the zip archive that contains the images.
fn = 'clipped_127_37.zip'

In [None]:
!unzip {fn} -d {dir_nm}

In [None]:
!ls pop_data

# 2. Prepare file paths and ground truth

In [None]:
import glob

# Collect the paths of all PNG files in dir_nm.
# glob does not guarantee any ordering (it depends on the file system), and
# later cells pick the sample image by position (img_list[1]), so sort the
# result to get the same sample on every run and every machine.
img_list = sorted(glob.glob(dir_nm + '/*.png'))
img_list[0:5]

In [None]:
import os

# Sanity check on one path: drop the directory part, then the extension,
# leaving the bare file name that serves as the image id.
os.path.splitext(os.path.split(img_list[1])[1])[0]

In [None]:
# Build the id list: one id per image path, taken from the file name
# without its directory or extension.

file_gid = [os.path.splitext(os.path.basename(path))[0] for path in img_list]
file_gid[:5]

In [None]:
import pandas as pd

# Table with one row per image: its file path and the id derived from it.

df_img = pd.DataFrame({'img_path': img_list, 'gid': file_gid})
df_img

In [None]:
# Column dtypes of the path/id table built in the previous cell.
df_img.dtypes

# 3. Read an image as tabular data (R, G, B)

In [None]:
# show a sample image

import cv2
from google.colab.patches import cv2_imshow  # Colab-only display helper

# The second entry of img_list is used as the sample throughout the notebook.
img_sample_path = img_list[1]

# cv2.imread returns None instead of raising when a file cannot be read,
# so fail here with a clear message rather than with an opaque error later.
img_to_show = cv2.imread(img_sample_path)
if img_to_show is None:
    raise FileNotFoundError(f"cv2.imread could not read {img_sample_path!r}")

cv2_imshow(img_to_show)

In [None]:
# Read the sample image as an array.
# The image is loaded with cv2.imread and then converted with COLOR_BGR2RGB,
# so the channel order matches the r, g, b column names used further down.

# Alternative read kept from the original notebook (not used):
#img_array = cv2.imread(img_sample_path, cv2.IMREAD_UNCHANGED)
img_array = cv2.cvtColor(cv2.imread(img_sample_path), cv2.COLOR_BGR2RGB)
img_array

In [None]:
# Array dimensions; the reshape in the next cell multiplies the first two
# axes and treats the last one as 3 colour channels.
img_array.shape

In [None]:
# Flatten the image grid into a long table: one row per pixel, 3 channel columns.

# Pixel count = all elements divided by the channel-axis length,
# which equals rows * columns for this 3-axis array.
image_tot_rows = img_array.size // img_array.shape[2]
img_array_reshaped = img_array.reshape((image_tot_rows, 3))

In [None]:
# Should be (image_tot_rows, 3) after the flattening above.
img_array_reshaped.shape

In [None]:
# Wrap the flattened pixels in a DataFrame with one column per colour channel.

df_img_raw = pd.DataFrame(data=img_array_reshaped, columns=list('rgb'))
df_img_raw

In [None]:
# Scale the channel values by dividing by 255.
# NOTE: this rebinds df_img, which until this cell held the image path/id
# table from section 2; from here on df_img is the scaled pixel table.

img_array_scaled = img_array_reshaped / 255.0
df_img = pd.DataFrame(data=img_array_scaled, columns=list('rgb'))
df_img

# 4. K-means to cluster images

In [None]:
# Settings shared by both clustering runs below.
my_seed = 42     # passed as random_state
max_iter = 300   # passed as max_iter
n_init = 30      # passed as n_init
k = 20           # passed as n_clusters

## 4.1 K-Means using Scikit Learn

In [None]:
%%time

from sklearn.cluster import KMeans

kmeans_sk = KMeans(
    n_clusters=k, init='random',
    n_init=n_init, max_iter=max_iter,
    random_state=my_seed
)

kmeans_sk.fit(df_img)

colors_sk = kmeans_sk.predict(df_img)

In [None]:
# Cluster labels predicted by K-Means for the rows of df_img.
colors_sk

In [None]:
# Shape of the K-Means label array (labels were predicted for df_img).
colors_sk.shape

In [None]:
# Cluster centres of the fitted K-Means model; they are in the same scaled
# r, g, b feature space as df_img.
centroids_sk  = kmeans_sk.cluster_centers_
centroids_sk

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Show the centroid colours as a single-row strip: copying the centroid
# array and adding a leading axis gives imshow an image with one row and
# one pixel per centroid.

palette = np.array(centroids_sk)[np.newaxis, ...]

fig, axes = plt.subplots(figsize=(8, 2))

axes.imshow(palette);

In [None]:
# Keep the K-Means labels as a named Series, then show each cluster's
# share of the pixels.

df_y_sk = pd.Series(data=colors_sk.squeeze(), name='kmeans')
df_y_sk.value_counts(normalize=True)

## 4.2 Mini Batch KMeans using Scikit Learn

In [None]:
%%time

from sklearn.cluster import MiniBatchKMeans

kmeans_mini = MiniBatchKMeans(
    n_clusters=k, init='random',
    n_init=n_init, max_iter=max_iter,
    random_state=my_seed
)

kmeans_mini.fit(df_img)

colors_mini = kmeans_mini.predict(df_img)

In [None]:
# Cluster labels predicted by Mini-Batch K-Means for the rows of df_img.
colors_mini

In [None]:
# Shape of the Mini-Batch K-Means label array (labels were predicted for df_img).
colors_mini.shape

In [None]:
# Cluster centres of the fitted Mini-Batch K-Means model; they are in the
# same scaled r, g, b feature space as df_img.
centroids_mini  = kmeans_mini.cluster_centers_
centroids_mini

In [None]:
import matplotlib.pyplot as plt

# Show the Mini-Batch centroid colours as a single-row strip: adding a
# leading axis to the centroid array gives imshow an image with one row
# and one pixel per centroid.

palette = np.array(centroids_mini)[np.newaxis, :, :]

fig, axes = plt.subplots(1, 1, figsize=(8, 2))

# Trailing semicolon suppresses the AxesImage repr in the cell output,
# consistent with the K-Means palette cell in section 4.1.
axes.imshow(palette);

In [None]:
# Keep the Mini-Batch labels as a named Series, then show each cluster's
# share of the pixels.

df_y_mini = pd.Series(data=colors_mini.squeeze(), name='mini batch')
df_y_mini.value_counts(normalize=True)

# 5. Compare results

In [None]:
# Put the raw pixel values side by side with both sets of cluster labels.

df_final = pd.concat(objs=[df_img_raw, df_y_sk, df_y_mini], axis='columns')
df_final

# 6. Show clustered images

In [None]:
# The centroid lookup tables were never defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel. Build them here: the centroids
# live in the scaled feature space (pixel values divided by 255), so multiply
# back by 255, round, and store as uint8 so imshow treats them as 0-255 RGB.
y_sk_centroids_conv_to_rgb = np.clip(np.rint(centroids_sk * 255), 0, 255).astype(np.uint8)
y_mini_centroids_conv_to_rgb = np.clip(np.rint(centroids_mini * 255), 0, 255).astype(np.uint8)

# Replace every pixel by the colour of its cluster centroid (indexing the
# lookup table with the per-pixel labels), then fold the long pixel table
# back into the shape of the image it was flattened from.
X_sk_seg = y_sk_centroids_conv_to_rgb[colors_sk].reshape(img_array.shape) # actual RGB vals -> centroid vals
X_mini_seg = y_mini_centroids_conv_to_rgb[colors_mini].reshape(img_array.shape) # actual RGB vals -> centroid vals

In [None]:
plt.rcParams['figure.figsize'] = [14, 14]
img_to_show = cv2.cvtColor(cv2.imread(img_sample_path), cv2.COLOR_BGR2RGB)

# One (grid slot, image, title) entry per panel of the 2 x 2 grid:
# the original image followed by the two segmented versions.
for slot, panel, label in (
    (1, img_to_show, 'ACTUAL'),
    (2, X_sk_seg, 'Kmeans'),
    (3, X_mini_seg, 'Mini Batch'),
):
    plt.subplot(2, 2, slot)
    plt.imshow(panel)
    plt.title(label)


plt.tight_layout()
plt.show();