In [7]:
pip install --upgrade threadpoolctl scikit-learn


Collecting threadpoolctl
  Obtaining dependency information for threadpoolctl from https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/f4/5a/ba91b8c57aa37dbd80d5ff958576a9a8c14317b04b671ae7f0d09b00993a/scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Downloading scikit_learn-1.7.0-cp311-cp311-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.7 MB 2.6 MB/s eta 0:00:05
    --------------------------------------- 0.2/10.7 MB 2.9 MB/s eta 0:00:04
   - -------------------------

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\91879\\anaconda3\\Lib\\site-packages\\~klearn\\.libs\\msvcp140.dll'
Consider using the `--user` option or check the permissions.



In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
from sklearn.cluster import KMeans
from tqdm import tqdm
import os
import shutil
import threadpoolctl

# Detector loaded from the "yolov8n.pt" checkpoint; used below to predict boxes.
model = YOLO("yolov8n.pt")

# Confidence cut-off handed to model.predict() for every image.
conf_threshold = 0.4

# Directory layout: source images, raw predicted labels, filtered dataset.
unlabeled_data_path = "augmented_images"
output_label_path = "pseudo_labels"
clean_data_path = "clean_dataset"
for out_dir in (output_label_path, clean_data_path):
    os.makedirs(out_dir, exist_ok=True)

# Accumulators filled by the prediction loop:
#   box_data     - one dict per predicted box, in prediction order
#   img_to_boxes - image file name -> list of that image's box dicts
box_data = []
img_to_boxes = {}

# Generate pseudo-labels: run the detector on every image in
# `unlabeled_data_path`, write one label file per image into
# `output_label_path` (one "<class> <x> <y> <w> <h>" line per box, using the
# normalized xywh values), and record every box in `box_data` / `img_to_boxes`
# for the filtering step.
#
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the order of `box_data` (and therefore the clustering input) reproducible.
# Filtering by extension up front also gives the progress bar an accurate total.
image_files = sorted(
    name for name in os.listdir(unlabeled_data_path)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)

for img_file in tqdm(image_files):
    img_path = os.path.join(unlabeled_data_path, img_file)
    results = model.predict(img_path, conf=conf_threshold, verbose=False)

    label_file = os.path.splitext(img_file)[0] + ".txt"
    label_path = os.path.join(output_label_path, label_file)

    # The label file is created even when nothing is detected (it is then empty).
    with open(label_path, 'w') as f:
        for box in results[0].boxes:
            cls = int(box.cls)
            xywh = box.xywhn[0].tolist()  # normalized [x, y, w, h]
            box_line = f"{cls} {' '.join(map(str, xywh))}\n"
            f.write(box_line)

            box_entry = {
                'image': img_file,
                'label': label_file,
                'xywh': xywh,
                'width_height': [xywh[2], xywh[3]],
                'box_line': box_line
            }
            box_data.append(box_entry)
            img_to_boxes.setdefault(img_file, []).append(box_entry)

# Filter noisy pseudo-labels by clustering box shapes.
#
# Every predicted box is described by its normalized (width, height). K-means
# groups these shapes; a box whose distance to its nearest cluster centre is at
# or above the KEEP_PERCENTILE-th percentile of all such distances is treated
# as noise and dropped. Images that keep at least one box are copied, together
# with a label file containing only the kept boxes, into `clean_data_path`.
MIN_BOXES_FOR_CLUSTERING = 10  # clustering only runs with more boxes than this
N_BOX_CLUSTERS = 8             # number of K-means clusters over (width, height)
KEEP_PERCENTILE = 90           # distance percentile used as the noise cut-off

if len(box_data) > MIN_BOXES_FOR_CLUSTERING:
    boxes_array = np.array([b['width_height'] for b in box_data])

    # threadpool_limits applies its limit on construction and only restores the
    # previous setting when used as a context manager. Scoping it with `with`
    # keeps BLAS single-threaded for the clustering only, instead of leaving it
    # capped for the rest of the kernel session.
    with threadpoolctl.threadpool_limits(limits=1, user_api='blas'):
        kmeans = KMeans(n_clusters=N_BOX_CLUSTERS, random_state=42).fit(boxes_array)
        distances = kmeans.transform(boxes_array)

    # Distance from each box to its closest cluster centre.
    nearest_distance = distances.min(axis=1)
    keep_threshold = np.percentile(nearest_distance, KEEP_PERCENTILE)
    # Strict comparison: boxes exactly at the threshold are dropped as well.
    clean_mask = nearest_distance < keep_threshold

    for entry, is_clean in zip(box_data, clean_mask):
        entry['keep'] = bool(is_clean)

    kept_images = set()
    for img_file, boxes in img_to_boxes.items():
        clean_lines = [b['box_line'] for b in boxes if b.get('keep')]
        if not clean_lines:
            # Every box of this image was filtered out; skip the image entirely.
            continue

        kept_images.add(img_file)

        cleaned_label_file = os.path.splitext(img_file)[0] + ".txt"
        with open(os.path.join(clean_data_path, cleaned_label_file), 'w') as f:
            f.writelines(clean_lines)

        shutil.copy(
            os.path.join(unlabeled_data_path, img_file),
            os.path.join(clean_data_path, img_file)
        )

    num_filtered = len(box_data) - int(np.count_nonzero(clean_mask))
    print(f"Filtered {num_filtered} noisy boxes")
    print(f"Saved {len(kept_images)} clean images with cleaned labels in '{clean_data_path}'")

    # Save a scatter plot of the box shapes coloured by cluster, with the
    # cluster centres marked as red crosses.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(boxes_array[:, 0], boxes_array[:, 1], c=kmeans.labels_, alpha=0.6)
    ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
               c='red', marker='X', s=200)
    ax.set_xlabel("Normalized Width")
    ax.set_ylabel("Normalized Height")
    ax.set_title("Bounding Box Clusters (K-means)")
    fig.savefig("box_clusters.png", dpi=300)
    plt.close(fig)  # release the figure; it is only needed on disk
else:
    print("Not enough boxes for K-means filtering")



 94%|█████████████████████████████████████████████████████████████████████████     | 2761/2946 [07:39<00:27,  6.67it/s]