In [97]:
from pathlib import Path
from math import ceil
import joblib
import json
import shutil

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from tqdm.notebook import tqdm

from utils.config import *
from utils.utils import *

# Seed NumPy's legacy global RNG so any np.random-based randomness in this
# notebook is repeatable. SEED is not defined in this file; presumably it comes
# from the `utils.config` star import — TODO confirm.
np.random.seed(SEED)

In [86]:
# Read the list of image paths: one entry per line of the text file.
img_paths = (
    Path('../path/resize_train_imgs_path.txt')
    .read_text(encoding='utf-8')
    .splitlines()
)

# Output layout: <ROOT_ARTIFACTS>/clusters/models and <ROOT_ARTIFACTS>/clusters/results.
dst_artifacts = ROOT_ARTIFACTS / 'clusters'
dst_models, dst_results = dst_artifacts / 'models', dst_artifacts / 'results'
for out_dir in (dst_models, dst_results):
    # Create the directory (and any missing parents); no error if it already exists.
    out_dir.mkdir(parents=True, exist_ok=True)

# Tunables for the clustering run.
K_arr = [3, 5, 7, 9, 11]
chunk_size = 500

In [140]:
# Partition the directories under ROOT_RESIZE_TRAIN_IMAGES by how many '*.jpg'
# files each one directly contains:
#   * more than `min_folder_imgs` images -> collected in `folders`;
#   * 1..`min_folder_imgs` images        -> the image folder and its sibling
#     label folder are moved to '*_out_of_samples' locations.
# Directories with no jpg files are ignored.
min_folder_imgs = 50

folders = []

# Snapshot the directory listing before touching the tree: folders are moved
# inside the loop, and mutating the tree while the lazy recursive glob is still
# walking it can make the walk raise or skip entries.
candidate_dirs = list(Path(ROOT_RESIZE_TRAIN_IMAGES).glob('**'))

for folder in candidate_dirs:
    if not folder.is_dir():
        # Already relocated together with a parent folder earlier in this loop.
        continue

    imgs = list(folder.glob('*.jpg'))
    n_imgs = len(imgs)

    if n_imgs > min_folder_imgs:
        folders.append(folder)
    elif n_imgs > 0:
        # NOTE(review): str.replace rewrites EVERY occurrence of the substring
        # in the path, not just one directory component — verify that
        # ROOT_RESIZE_TRAIN_IMAGES contains 'images' (and no 'labels') exactly
        # where intended.
        dst_img_dir = str(folder).replace('images', 'images_out_of_samples')
        src_label_dir = str(folder).replace('images', 'labels')
        dst_label_dir = src_label_dir.replace('labels', 'labels_out_of_samples')

        Path(dst_img_dir).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(folder), dst_img_dir)

        # Only move labels that actually exist; otherwise shutil.move would
        # raise after the images were already moved, leaving a half-moved state.
        if Path(src_label_dir).is_dir():
            Path(dst_label_dir).parent.mkdir(parents=True, exist_ok=True)
            shutil.move(src_label_dir, dst_label_dir)
        else:
            print(f'[warn] label folder not found, moved images only: {src_label_dir}')

In [None]:
# For every folder in `folders`:
#   1. load its '*.jpg' images as flattened float16 vectors,
#   2. fit one MiniBatchKMeans per candidate K (incrementally, chunk by chunk),
#   3. compute a size-weighted mean of per-chunk silhouette scores,
#   4. save each fitted model with joblib and the scores as JSON.
# Chunks with this many samples or fewer are not scored.
min_chunk_for_score = 50

for folder in tqdm(folders, total=len(folders)):
    folder_name = folder.stem
    imgs = list(folder.glob('*.jpg'))
    n_imgs = len(imgs)
    if not n_imgs:
        # Nothing to cluster in this folder.
        continue

    # read_kor_path_img is a project helper; presumably it returns the decoded
    # image as an array of identical shape for every file (np.stack requires
    # equal shapes) — TODO confirm.
    img_vectors = np.stack(
        [read_kor_path_img(img_path).flatten().astype(np.float16) for img_path in imgs]
    )

    silhouette_scores = {}
    for K in tqdm(K_arr, total=len(K_arr)):
        kmeans = MiniBatchKMeans(n_clusters=K, random_state=SEED, batch_size=32)

        # Materialise the chunk indices: they are iterated twice below (fit,
        # then score), which would silently yield nothing on the second pass
        # if the helper returned a one-shot iterator.
        img_idx_chunks = list(get_partial_idxs(n_imgs, chunk_size))

        # Pass 1: incremental fit over all chunks.
        for idx_chunks in img_idx_chunks:
            kmeans.partial_fit(img_vectors[idx_chunks])

        # Pass 2: silhouette per sufficiently large chunk.
        chunk_s_scores = []
        weights = []
        for idx_chunks in img_idx_chunks:
            if len(idx_chunks) <= min_chunk_for_score:
                continue
            img_chunk = img_vectors[idx_chunks]
            labels = kmeans.predict(img_chunk)
            if np.unique(labels).size < 2:
                # silhouette_score raises ValueError unless at least two
                # distinct labels are present; skip degenerate chunks.
                continue
            chunk_s_scores.append(
                silhouette_score(img_chunk, labels, random_state=SEED)
            )
            weights.append(len(idx_chunks))

        if weights:
            # Weight by chunk size; cast to a plain float so the value is
            # always JSON-serialisable.
            silhouette_scores[K] = float(np.average(chunk_s_scores, weights=weights))
        else:
            # No chunk could be scored; record null instead of letting
            # np.average fail on empty weights.
            silhouette_scores[K] = None

        dst_kmeans = dst_models / f'{folder_name}_K{K}.joblib'
        joblib.dump(kmeans, str(dst_kmeans))

    result = {
        'dir_name': str(folder),
        'silhouette_scores' : silhouette_scores
    }
    dst_result = dst_results / f'{folder_name}.json'
    with open(dst_result, 'w', encoding='utf-8') as j:
        json.dump(result, j, indent=4)