In [1]:
import json
import shutil
import joblib
from collections import Counter, defaultdict
from pathlib import Path
from math import ceil

import numpy as np

from utils.config import *
from utils.utils import *


# Seed NumPy's global RNG so that later draws made through `np.random.*`
# are repeatable between runs.
# NOTE(review): SEED is not defined in this cell; presumably it comes from the
# `utils.config` star import above — confirm.
np.random.seed(SEED)

In [5]:
# Directory layout for the clustering stage: one root folder with a sub-folder
# for the fitted models and one for the per-directory result JSON files.
dst_artifacts = ROOT_ARTIFACTS / 'clusters'
dst_models, dst_results = dst_artifacts / 'models', dst_artifacts / 'results'

# Sampling rate (kept as a string key, converted with float() where it is used)
# mapped to the text file associated with that rate.
sampling_rates = dict([
    ('0.1', '../path/resize_sample010_train_imgs_path.txt'),
    ('0.25', '../path/resize_sample025_train_imgs_path.txt'),
    ('0.5', '../path/resize_sample050_train_imgs_path.txt'),
])

In [7]:
# Stratified sampling of images by KMeans cluster.
#
# For every result JSON in `dst_results`:
#   1. pick the K with the highest silhouette score,
#   2. load the matching fitted KMeans model from `dst_models`,
#   3. assign every *.jpg in the result's `dir_name` to a cluster,
#   4. for each configured sampling rate, draw ceil(rate * cluster_size)
#      distinct images from every cluster.
# The sampled paths for each rate are collected across all result files and
# written to that rate's destination file, one path per line.
#
# NOTE(review): destination files are now rewritten from scratch on every run
# (mode 'w'). Previously they were opened in append mode, so re-running the
# cell kept adding duplicate copies of the sample. If these files are also
# populated by other code, switch the mode back to 'a'.

# Buffer of output lines per destination file; written once at the end.
lines_by_dst = {dst: [] for dst in sampling_rates.values()}

# glob() yields entries in filesystem-dependent order; sorting keeps the
# sequence of RNG draws (and therefore the sample) stable across runs.
results_path = sorted(dst_results.glob('*.json'))
for result_path in results_path:
    with open(result_path, 'r') as j:
        result = json.load(j)
    dir_name = result['dir_name']
    s_scores = result['silhouette_scores']
    # Keys are K values stored as strings; on ties the first key wins.
    K = int(max(s_scores, key=s_scores.get))

    model_name = Path(dir_name).stem
    model_path = dst_models / f'{model_name}_K{K}.joblib'
    # joblib.load unpickles the file: only load artifacts you produced yourself.
    kmeans = joblib.load(model_path)

    imgs_path = sorted(Path(dir_name).glob('*.jpg'))
    imgs_by_labels = defaultdict(list)
    for img_path in imgs_path:
        # NOTE(review): float16 + flatten presumably mirrors the preprocessing
        # used when the model was fitted — confirm before changing.
        img_arr = read_kor_path_img(img_path).flatten().astype(np.float16)
        label, *_ = kmeans.predict(img_arr[np.newaxis])
        imgs_by_labels[label].append(img_path)

    for sampling_rate, dst_samples_path in sampling_rates.items():
        rate = float(sampling_rate)
        for imgs_in_label in imgs_by_labels.values():
            n_in_label = len(imgs_in_label)
            # Cap at the cluster size so that drawing without replacement
            # stays valid even if a rate above 1 is configured.
            n_sample = min(ceil(n_in_label * rate), n_in_label)
            # replace=False: each image may be selected at most once. The
            # default (replace=True) could emit the same path several times.
            picked = np.random.choice(n_in_label, n_sample, replace=False)
            lines_by_dst[dst_samples_path].extend(
                f'{imgs_in_label[idx]}\n' for idx in picked
            )

# One write per destination file instead of one open() per cluster.
for dst_samples_path, sample_lines in lines_by_dst.items():
    with open(dst_samples_path, 'w', encoding='utf-8') as f:
        f.writelines(sample_lines)