In [6]:
import os
import cv2
import numpy as np

In [14]:
# --- CONFIG ---
# Raw strings keep the Windows backslashes literal. In a normal string,
# sequences such as "\S" or "\l" are invalid escapes and raise a SyntaxWarning
# on recent Python versions.
dataset_root = r"D:\SOP_BIO\Parasitized-Annotation"  # path to the folder containing all 150 subfolders
label_output_root = r"D:\SOP_BIO\labels"  # root folder that receives the generated label files
os.makedirs(label_output_root, exist_ok=True)
class_id = 0  # parasite class

In [15]:
def detect_red_circles_yolo_format(image_path, label_path):
    """Find red regions in an image and write their bounding boxes as a label file.

    The image is thresholded in HSV space for red, cleaned with a morphological
    opening, and every external contour of the resulting mask is converted to
    one line of the form ``<class_id> <x_center> <y_center> <width> <height>``,
    with all four box values normalised by the image width/height.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        label_path: Path of the text file to write. Its parent directory is
            created when it does not exist yet.

    Returns:
        None. If the image cannot be read, an error is printed and no label
        file is written. If no contour is found, an empty label file is
        written.

    Note:
        The class index is taken from the module-level ``class_id`` variable.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"Error reading {image_path}")
        return

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Red is matched with two hue bands (0-10 and 160-180), one at each end of
    # the hue axis; both require saturation and value of at least 100.
    lower_red1 = np.array([0, 100, 100])
    upper_red1 = np.array([10, 255, 255])
    lower_red2 = np.array([160, 100, 100])
    upper_red2 = np.array([180, 255, 255])
    mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
    mask2 = cv2.inRange(hsv, lower_red2, upper_red2)
    mask = cv2.bitwise_or(mask1, mask2)

    # Opening (erode then dilate) with a 3x3 kernel to drop isolated mask pixels.
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

    # findContours returns either 2 or 3 values depending on the OpenCV
    # version; the contour list is the first of 2 or the second of 3.
    contour_data = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = contour_data[0] if len(contour_data) == 2 else contour_data[1]

    h, w = img.shape[:2]
    yolo_lines = []
    for cnt in contours:
        x, y, bw, bh = cv2.boundingRect(cnt)
        x_center = (x + bw / 2) / w
        y_center = (y + bh / 2) / h
        bw_norm = bw / w
        bh_norm = bh / h
        yolo_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {bw_norm:.6f} {bh_norm:.6f}")

    # A bare filename has an empty dirname, and os.makedirs("") raises, so only
    # create the parent directory when there is one.
    label_dir = os.path.dirname(label_path)
    if label_dir:
        os.makedirs(label_dir, exist_ok=True)
    with open(label_path, "w") as f:
        f.write("\n".join(yolo_lines))

In [16]:
# Visit every folder below dataset_root and generate one label file per image.
for dirpath, _dirnames, filenames in os.walk(dataset_root):
    for filename in filenames:
        is_image = filename.lower().endswith((".jpg", ".jpeg", ".png"))
        if not is_image:
            continue

        image_path = os.path.join(dirpath, filename)
        # Mirror the image's location relative to dataset_root under
        # label_output_root, swapping the extension for ".txt".
        relative_stem, _extension = os.path.splitext(os.path.relpath(image_path, dataset_root))
        label_path = os.path.join(label_output_root, relative_stem + ".txt")

        detect_red_circles_yolo_format(image_path, label_path)
        print(f"Labeled: {image_path}")

Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404151312.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404151511.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404151641.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404152216.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404152626.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404152850.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404153102.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404153307.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404153544.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404153725.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404154005.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404154352.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404154509.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1\20190404154605.jpg
Labeled: D:\SOP_BIO\Parasitized-Annotation\PvTk1

In [18]:

import random
import shutil

# --- CONFIG ---
# Raw strings keep the Windows backslashes literal ("\S", "\l" and "\d" are
# invalid escape sequences in a normal string literal).
images_root = r"D:\SOP_BIO\Parasitized-Annotation"
labels_root = r"D:\SOP_BIO\labels"
output_dir = r"D:\SOP_BIO\dataset"
train_ratio = 0.8  # 80% train, 20% val
split_seed = 42  # fixed seed so re-running the cell reproduces the same split

# --- OUTPUT FOLDERS ---
for split in ['train', 'val']:
    os.makedirs(os.path.join(output_dir, "images", split), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "labels", split), exist_ok=True)

# --- COLLECT IMAGE-LABEL PAIRS ---
# Only the direct subfolders of images_root are scanned (one level deep).
# Each entry is (image path, label path, original image filename).
image_label_pairs = []

# os.listdir returns entries in arbitrary order; sorting gives the seeded
# shuffle below a stable starting order on every machine and run.
for subfolder in sorted(os.listdir(images_root)):
    image_subdir = os.path.join(images_root, subfolder)
    label_subdir = os.path.join(labels_root, subfolder)

    # Skip plain files and subfolders that have no matching label folder.
    if not os.path.isdir(image_subdir) or not os.path.isdir(label_subdir):
        continue

    for fname in sorted(os.listdir(image_subdir)):
        if fname.lower().endswith((".jpg", ".jpeg", ".png")):
            img_path = os.path.join(image_subdir, fname)
            label_path = os.path.join(label_subdir, os.path.splitext(fname)[0] + ".txt")

            # Images without a label file are left out of the dataset.
            if os.path.exists(label_path):
                image_label_pairs.append((img_path, label_path, fname))  # include original name

# --- SHUFFLE AND SPLIT ---
# A dedicated seeded generator keeps the split reproducible without touching
# the global random state. Without it, each re-run would assign images
# differently and, since earlier copies are not removed from output_dir, the
# same image could end up in both train and val.
random.Random(split_seed).shuffle(image_label_pairs)
split_index = int(len(image_label_pairs) * train_ratio)
train_pairs = image_label_pairs[:split_index]
val_pairs = image_label_pairs[split_index:]

# --- COPY TO YOLO FORMAT ---
def copy_pairs(pairs, split, dest_root=None):
    """Copy image/label pairs into ``<dest_root>/images/<split>`` and ``<dest_root>/labels/<split>``.

    Args:
        pairs: Iterable of ``(img_path, lbl_path, fname)`` tuples, where
            ``fname`` is the image's original filename (with extension).
        split: Name of the split subfolder, e.g. ``"train"`` or ``"val"``.
        dest_root: Root of the output dataset. Defaults to the module-level
            ``output_dir`` when omitted.

    Returns:
        None.

    Notes:
        The pairs come from many source folders but land in one flat folder,
        so two images sharing a filename would silently overwrite each other.
        The first occurrence keeps its original name; later ones are saved with
        their source folder name as a prefix, and a message is printed. The
        label file always uses the same stem as its image.
    """
    if dest_root is None:
        dest_root = output_dir

    img_dir = os.path.join(dest_root, "images", split)
    lbl_dir = os.path.join(dest_root, "labels", split)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)

    # Stems already written by this call. Compared via normcase so that names
    # differing only by case also count as clashes on case-insensitive
    # filesystems. Stems (not full names) are tracked because "a.jpg" and
    # "a.png" would both map to the label file "a.txt".
    used_stems = set()

    for img_path, lbl_path, fname in pairs:
        base_name, ext = os.path.splitext(fname)

        stem = base_name
        if os.path.normcase(stem) in used_stems:
            parent = os.path.basename(os.path.dirname(img_path))
            stem = f"{parent}_{base_name}"
            counter = 1
            while os.path.normcase(stem) in used_stems:
                counter += 1
                stem = f"{parent}_{base_name}_{counter}"
            print(f"Name collision for {fname}: saving as {stem}{ext}")
        used_stems.add(os.path.normcase(stem))

        dst_img = os.path.join(img_dir, stem + ext)
        dst_lbl = os.path.join(lbl_dir, stem + ".txt")

        shutil.copy2(img_path, dst_img)
        shutil.copy2(lbl_path, dst_lbl)

# Write both splits to disk (train first, then val), then report their sizes.
for split_name, split_pairs in (("train", train_pairs), ("val", val_pairs)):
    copy_pairs(split_pairs, split_name)

print(f"✅ Done. {len(train_pairs)} training, {len(val_pairs)} validation samples — filenames preserved.")



✅ Done. 2411 training, 603 validation samples — filenames preserved.
