In [53]:
from pathlib import Path
from PIL import Image
import cv2
import numpy as np
from ultralytics import YOLO
from collections import Counter
import pandas as pd
import random
import shutil
import os
from collections import defaultdict
import yaml
import json

# Every path used in this notebook is resolved relative to the launch directory.
project_root = Path.cwd()
print(f"directory: {project_root}")

# Folders created up front: the image and CSV inputs, the generated label
# files, the train/val dataset tree, and a folder for test results.
dirs_to_create = [
    project_root.joinpath(*parts)
    for parts in (
        ("Images",),
        ("Annotation_CSV_Files",),
        ("labels",),
        ("dataset", "images", "train"),
        ("dataset", "images", "val"),
        ("dataset", "labels", "train"),
        ("dataset", "labels", "val"),
        ("test_results",),
    )
]

# exist_ok=True keeps the cell safe to re-run.
for directory in dirs_to_create:
    directory.mkdir(parents=True, exist_ok=True)
print("structure created")

directory: /Users/larasabha/Desktop/Candy-object-detection
structure created


In [54]:
def analyze_csv_data(csv_folder=project_root / "Annotation_CSV_Files"):
    """Summarise the candy classes found in the annotation CSV files.

    Every ``*.csv`` file in ``csv_folder`` is read with pandas. For files that
    have a ``region_attributes`` column, each value is parsed as JSON, its
    ``candy_type`` entry is normalised with ``normalize_class_name`` and
    counted. All rows contribute to the statistics; only the first few rows of
    each file are echoed so the raw format can be inspected.

    Args:
        csv_folder: Directory containing the annotation CSV files.

    Returns:
        False when the folder does not exist or contains no CSV files,
        True otherwise (after printing the summary).
    """
    sample_rows = 3  # rows per file that are echoed for inspection

    if not csv_folder.exists():
        print(f"CSV folder not found: {csv_folder}")
        return False

    # sorted() makes the report order independent of directory listing order
    csv_files = sorted(csv_folder.glob("*.csv"))
    if not csv_files:
        print("no CSV files found")
        return False

    total_annotations = 0
    parse_failures = 0
    class_counts = Counter()

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            if df.empty:
                continue

            print(f"\nanalyzing {csv_file.name}:")
            print(f"  columns: {list(df.columns)}")
            print(f"  rows: {len(df)}")

            if 'region_attributes' not in df.columns:
                continue

            print("  sample region_attributes:")
            for i, attr in enumerate(df['region_attributes']):
                show = i < sample_rows
                try:
                    parsed = json.loads(attr)
                    candy_type = parsed.get('candy_type', 'Unknown')
                    normalized = normalize_class_name(candy_type)
                except Exception as e:
                    # e.g. NaN cells or malformed JSON; keep going with the next row
                    parse_failures += 1
                    if show:
                        print(f"    {i+1}, error parsing: {attr} - {e}")
                    continue

                if show:
                    print(f"    {i+1}. raw: '{candy_type}' -> normalized: '{normalized}'")

                # Count every row, not just the echoed samples, so the
                # distribution below describes the whole data set.
                class_counts[normalized] += 1
                total_annotations += 1

        except Exception as e:
            print(f"error reading {csv_file.name}: {e}")

    print(f"total annotations: {total_annotations}")
    if parse_failures:
        print(f"unparseable region_attributes rows: {parse_failures}")
    print(f"unique classes found: {len(class_counts)}")
    print(f"classes: {sorted(class_counts)}")

    print(f"\nclass distribution:")
    for class_name, count in class_counts.most_common():
        percentage = count / total_annotations * 100 if total_annotations > 0 else 0
        print(f"  {class_name}: {count} ({percentage:.1f}%)")

    return True

In [55]:
def _move_new_files(source_folder, dest_folder, suffixes):
    """Move files whose lower-cased suffix is in ``suffixes`` into ``dest_folder``.

    A file is left in place when the destination already has an entry with the
    same name. Returns the number of files that were moved.
    """
    dest_folder.mkdir(parents=True, exist_ok=True)
    moved = 0
    # Snapshot the listing first: entries are renamed out of the folder as we go.
    for file in list(source_folder.iterdir()):
        if file.suffix.lower() not in suffixes:
            continue
        dst = dest_folder / file.name
        if not dst.exists():
            file.rename(dst)
            moved += 1
    return moved


def _count_files(folder, suffixes):
    """Count entries in ``folder`` whose lower-cased suffix is in ``suffixes``."""
    if not folder.is_dir():
        return 0
    return sum(1 for entry in folder.iterdir() if entry.suffix.lower() in suffixes)


def organize_dataset_files(root_dir=project_root):
    """Collect the dataset's images and annotation CSVs into fixed folders.

    Images -> Images/
    CSV annotation files -> Annotation_CSV_Files/

    The sources are ``Images`` and ``Annotation CSV Files`` inside the first
    sub-directory of ``root_dir`` (in name order) whose name contains "candy";
    when there is no such directory, the same two folder names are looked up
    directly under ``root_dir``.

    Args:
        root_dir: Project directory that holds (or will hold) the data folders.

    Returns:
        True when, after moving, at least one image and one CSV file are
        present in the destination folders; False otherwise.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    csv_suffixes = {'.csv'}

    print("organizing dataset files")

    # sorted() makes the choice deterministic when several folders match
    archive_folders = sorted(
        d for d in root_dir.iterdir() if d.is_dir() and 'candy' in d.name.lower()
    )
    if archive_folders:
        archive_folder = archive_folders[0]
        print(f"found archive folder: {archive_folder}")
        source_img_folder = archive_folder / "Images"
        source_csv_folder = archive_folder / "Annotation CSV Files"
    else:
        print("no archive folder found")
        source_img_folder = root_dir / "Images"
        source_csv_folder = root_dir / "Annotation CSV Files"

    dest_img_folder = root_dir / "Images"
    dest_csv_folder = root_dir / "Annotation_CSV_Files"

    if source_img_folder.exists():
        moved_images = _move_new_files(source_img_folder, dest_img_folder, image_suffixes)
        print(f"moved {moved_images} images to {dest_img_folder}")
    else:
        print(f"images folder not found at {source_img_folder}")

    if source_csv_folder.exists():
        moved_csvs = _move_new_files(source_csv_folder, dest_csv_folder, csv_suffixes)
        print(f"moved {moved_csvs} CSV files to {dest_csv_folder}")
    else:
        print(f"CSV folder not found at {source_csv_folder}")

    # Count with the same case-insensitive suffix rule that is used for moving,
    # so e.g. ".PNG" files that were moved are also counted.
    final_images = _count_files(dest_img_folder, image_suffixes)
    final_csvs = _count_files(dest_csv_folder, csv_suffixes)

    print(f"final counts: {final_images} images, {final_csvs} CSV files")
    return final_images > 0 and final_csvs > 0

In [56]:
def normalize_class_name(class_name):
    """Map a raw candy label to its canonical class name.

    Surrounding whitespace is removed, runs of internal whitespace are
    collapsed, and the lower-cased result is looked up in a table of known
    spellings. Names that are not in the table keep their original casing,
    with spaces replaced by underscores.

    Args:
        class_name: Raw label taken from the annotations. Anything that is not
            a non-blank string (None, NaN, numbers, "", "   ") is treated as
            missing.

    Returns:
        The canonical class name, or 'Unknown' when the label is missing.
    """
    # pandas hands us NaN (a float) for empty cells; only real strings carry a name
    if not isinstance(class_name, str):
        return 'Unknown'

    # strip the ends and collapse internal whitespace runs to a single space
    cleaned = ' '.join(class_name.split())
    if not cleaned:
        return 'Unknown'

    # known spellings, keyed by lower-cased name
    name_mapping = {
        # actual classes from CSV
        'crunch': 'Crunch',
        'baby_ruth': 'Baby_Ruth',
        'babyruth': 'Baby_Ruth',
        'baby ruth': 'Baby_Ruth',
        'butterfingers': 'Butterfinger',
        'butterfinger': 'Butterfinger',
        '3_musketeers': '3_Musketeers',
        '3 musketeers': '3_Musketeers',
        'three musketeers': '3_Musketeers',
        'milky_way': 'Milky_Way',
        'milky way': 'Milky_Way',
        'milkyway': 'Milky_Way',

        # additional common variations
        'snickers': 'Snickers',
        'twix': 'Twix',
        '100 grand': '100_Grand',
        '100grand': '100_Grand',
        '100_grand': '100_Grand',
        'midnight milky way': 'Midnight_Milky_Way',
        'midnight_milky_way': 'Midnight_Milky_Way',
        'unknown': 'Unknown'
    }

    # Fall back to the cleaned (not the raw) name so stray whitespace never
    # turns into leading/trailing underscores.
    return name_mapping.get(cleaned.lower(), cleaned.replace(' ', '_'))

In [57]:
def _box_conflicts(candidate, placed, padding=0.05):
    """Return True when ``candidate`` is within ``padding`` of any box in ``placed``.

    Boxes are ``(x_center, y_center, width, height)`` in normalised units. Two
    boxes conflict when their centres are closer than the sum of their
    half-sizes plus ``padding`` on both axes.
    """
    cx, cy, cw, ch = candidate
    for ex_x, ex_y, ex_w, ex_h in placed:
        if (abs(cx - ex_x) < (cw + ex_w) / 2 + padding and
                abs(cy - ex_y) < (ch + ex_h) / 2 + padding):
            return True
    return False


def create_non_overlapping_boxes(existing_boxes, img_w, img_h, num_boxes=3, min_size=0.15, max_size=0.3):
    """Randomly place boxes that overlap neither ``existing_boxes`` nor each other.

    For each requested box up to 50 random candidates with side lengths in
    ``[min_size, max_size]`` are tried. If none fits and ``min_size`` is above
    0.08, another 50 candidates with side lengths in ``[0.08, min_size]`` are
    tried. A box that still cannot be placed is skipped, so the result may
    contain fewer than ``num_boxes`` entries.

    Args:
        existing_boxes: Sequence of ``(x_center, y_center, width, height)``
            tuples in normalised [0, 1] units that must be avoided.
        img_w: Unused; kept for interface compatibility.
        img_h: Unused; kept for interface compatibility.
        num_boxes: Number of boxes to try to place.
        min_size: Smallest normalised side length for the first round.
        max_size: Largest normalised side length for the first round.

    Returns:
        List of ``(x_center, y_center, width, height)`` tuples, each fully
        inside the unit square.
    """
    max_attempts = 50
    fallback_min_size = 0.08

    # First the requested size range, then (if possible) a smaller one.
    size_ranges = [(min_size, max_size)]
    if min_size > fallback_min_size:
        size_ranges.append((fallback_min_size, min_size))

    new_boxes = []
    for _ in range(num_boxes):
        occupied = list(existing_boxes) + new_boxes
        chosen = None

        for low, high in size_ranges:
            for _attempt in range(max_attempts):
                w_norm = random.uniform(low, high)
                h_norm = random.uniform(low, high)
                # keep the whole box inside the image
                x_center = random.uniform(w_norm / 2, 1 - w_norm / 2)
                y_center = random.uniform(h_norm / 2, 1 - h_norm / 2)

                candidate = (x_center, y_center, w_norm, h_norm)
                if not _box_conflicts(candidate, occupied):
                    chosen = candidate
                    break
            if chosen is not None:
                break

        # Only boxes that passed the overlap check are returned; the smaller
        # fallback box is checked as well instead of being placed blindly.
        if chosen is not None:
            new_boxes.append(chosen)

    return new_boxes

In [58]:
def create_unknown_training_data(image_folder, df, output_labels, class_to_id, max_unknown_samples=20):
    """Append randomly placed "Unknown" boxes to the label files of annotated images.

    The file names in ``df['filename']`` are shuffled and visited one by one.
    For each image up to five random boxes are tried; the first one that keeps
    a 0.1 margin from every box already listed in the image's label file is
    appended to that file with the "Unknown" class id. Processing stops once
    ``max_unknown_samples`` boxes have been added.

    Args:
        image_folder: Directory containing the images named in ``df``.
        df: DataFrame with a ``filename`` column.
        output_labels: Directory holding one YOLO ``.txt`` label file per image.
        class_to_id: Mapping from class name to class id; must contain "Unknown".
        max_unknown_samples: Maximum number of Unknown boxes to add in total.

    Returns:
        None. Label files are modified in place and progress is printed.
    """
    if "Unknown" not in class_to_id:
        print("Unknown class not found in class mapping")
        return

    unknown_id = class_to_id["Unknown"]

    # unique() keeps first-appearance order; going through a set would make the
    # shuffle below irreproducible even when `random` is seeded.
    annotated_sample = list(df['filename'].unique())

    print(f"Using {len(annotated_sample)} annotated images to generate Unknown regions")

    unknown_count = 0
    random.shuffle(annotated_sample)  # randomize selection

    for img_name in annotated_sample:
        if unknown_count >= max_unknown_samples:
            break

        try:
            img_path = image_folder / img_name
            # Opening the image only confirms it exists and is readable; the
            # context manager closes the file handle again right away.
            with Image.open(img_path):
                pass

            label_path = output_labels / (img_path.stem + ".txt")

            # read existing annotations
            existing_boxes = []
            if label_path.exists():
                with open(label_path, "r") as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) == 5:
                            _, x_c, y_c, w, h = map(float, parts)
                            existing_boxes.append((x_c, y_c, w, h))

            # try to add non-overlapping unknown box
            for attempt in range(5):
                w_norm = random.uniform(0.1, 0.25)
                h_norm = random.uniform(0.1, 0.25)
                # Pick the centre after the size so the whole box stays inside
                # the normalised [0, 1] image area.
                x_center = random.uniform(w_norm / 2, 1 - w_norm / 2)
                y_center = random.uniform(h_norm / 2, 1 - h_norm / 2)

                # check for overlap (with a 0.1 safety margin on both axes)
                overlap = False
                for ex_x, ex_y, ex_w, ex_h in existing_boxes:
                    if (abs(x_center - ex_x) < (w_norm + ex_w)/2 + 0.1 and
                        abs(y_center - ex_y) < (h_norm + ex_h)/2 + 0.1):
                        overlap = True
                        break

                if not overlap:
                    with open(label_path, "a") as f:
                        f.write(f"{unknown_id} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}\n")

                    unknown_count += 1
                    print(f"added Unknown region to annotated image: {img_name}")
                    break

        except Exception as e:
            print(f"error augmenting {img_name} with Unknown class: {e}")

    print(f"created {unknown_count} Unknown class training examples")

In [59]:
def create_default_unknown_box():
    """Return a random placeholder box as ``(x_center, y_center, width, height)``.

    Both centre coordinates are drawn uniformly from [0.2, 0.4] and both side
    lengths uniformly from [0.15, 0.25]; all values are normalised units.
    """
    center_range = (0.2, 0.4)
    size_range = (0.15, 0.25)

    # Draw order (x, y, width, height) is kept so seeded runs are unchanged.
    cx = random.uniform(*center_range)
    cy = random.uniform(*center_range)
    box_w = random.uniform(*size_range)
    box_h = random.uniform(*size_range)
    return cx, cy, box_w, box_h

In [60]:
def fix_overlapping_annotations_in_existing_labels(labels_dir, min_distance=0.05):
    """Push apart boxes that sit too close together in YOLO label files.

    Each ``*.txt`` file in ``labels_dir`` is read; only lines with exactly five
    fields are used. For every pair of boxes (earlier one as anchor, later one
    as mover) whose centres are closer than the sum of their half-sizes plus
    ``min_distance`` on both axes, the mover's centre is relocated diagonally
    away from the anchor and then clamped so the box stays inside [0, 1].
    Box sizes are never changed. A file is rewritten only when at least one
    box was moved.

    Args:
        labels_dir: Directory containing the label files.
        min_distance: Extra normalised gap required between two boxes.

    Returns:
        None. Progress and the number of rewritten files are printed.
    """
    files_rewritten = 0

    for label_file in labels_dir.glob("*.txt"):
        try:
            # load every well-formed annotation as [class_id, x, y, w, h]
            boxes = []
            with open(label_file, "r") as handle:
                for raw_line in handle:
                    fields = raw_line.strip().split()
                    if len(fields) != 5:
                        continue
                    cls, cx, cy, bw, bh = map(float, fields)
                    boxes.append([int(cls), cx, cy, bw, bh])

            if len(boxes) < 2:
                continue

            changed = False

            # Boxes are mutated in place, so later pairs see earlier moves.
            for anchor_idx, anchor in enumerate(boxes):
                for mover in boxes[anchor_idx + 1:]:
                    # required centre distance on each axis
                    need_x = (anchor[3] + mover[3]) / 2 + min_distance
                    need_y = (anchor[4] + mover[4]) / 2 + min_distance

                    gap_x = abs(anchor[1] - mover[1])
                    gap_y = abs(anchor[2] - mover[2])

                    if not (gap_x < need_x and gap_y < need_y):
                        continue

                    # move away from the anchor, keeping the side the mover is on
                    target_x = anchor[1] - need_x if mover[1] < anchor[1] else anchor[1] + need_x
                    target_y = anchor[2] - need_y if mover[2] < anchor[2] else anchor[2] + need_y

                    # clamp so the moved box stays within the image
                    half_w = mover[3] / 2
                    half_h = mover[4] / 2
                    mover[1] = max(half_w, min(1 - half_w, target_x))
                    mover[2] = max(half_h, min(1 - half_h, target_y))
                    changed = True

                    print(f"fixed overlap in {label_file.name}")

            if changed:
                with open(label_file, "w") as handle:
                    handle.writelines(
                        f"{cls} {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}\n"
                        for cls, cx, cy, bw, bh in boxes
                    )
                files_rewritten += 1

        except Exception as e:
            print(f"error processing {label_file.name}: {e}")

    print(f"fixed overlapping annotations in {files_rewritten} label files")

In [36]:
def _validate_annotation_row(row, filename, img_w, img_h, class_to_id):
    """Check one annotation row and convert its box to normalised YOLO values.

    Args:
        row: Row with ``region_shape_attributes`` (JSON text) and ``class``.
        filename: Image file name, used only in messages.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        class_to_id: Mapping from class name to class id.

    Returns:
        A ``(status, payload)`` tuple:
        ``("ok", (label, x_center, y_center, w_norm, h_norm))`` for a usable box,
        ``("skip", message)`` when the row's class is not in ``class_to_id``,
        ``("unknown", message)`` when the geometry is missing or invalid.

    Raises:
        Exception: anything raised while parsing (bad JSON, non-numeric
        values, ...) propagates; the caller treats it as an invalid box.
    """
    shape_str = row['region_shape_attributes']

    # check if dimensions are missing or invalid
    if pd.isna(shape_str) or shape_str == '{}' or shape_str.strip() == '':
        return "unknown", f"no dimensions found for annotation in {filename}, marking as Unknown"

    shape = json.loads(shape_str)
    label = row['class']

    if label not in class_to_id:
        return "skip", f"skipping unknown class '{label}' in {filename}"

    # check if required dimension fields are present and valid
    required_fields = ['x', 'y', 'width', 'height']
    missing_fields = [field for field in required_fields if field not in shape or shape[field] is None]
    if missing_fields:
        return "unknown", f"missing dimension fields {missing_fields} in {filename}, marking as Unknown"

    x, y, w, h = shape['x'], shape['y'], shape['width'], shape['height']

    # zero/negative size or negative origin
    if w <= 0 or h <= 0 or x < 0 or y < 0:
        return "unknown", f"invalid dimensions in {filename}: x={x}, y={y}, w={w}, h={h}, marking as Unknown"

    # Negative x/y were rejected above, so only the right/bottom edges can
    # still overshoot; up to 5 px of overshoot is tolerated.
    if x + w > img_w + 5 or y + h > img_h + 5:
        return "unknown", (
            f"bounding box outside image bounds in {filename}: ({x},{y},{w},{h}) "
            f"for image {img_w}x{img_h}, marking as Unknown"
        )

    if w <= 5 or h <= 5:
        return "unknown", f"box too small in {filename}: width={w}, height={h}, marking as Unknown"

    # convert to YOLO format: centre and size as fractions of the image
    x_center = (x + w / 2) / img_w
    y_center = (y + h / 2) / img_h
    w_norm = w / img_w
    h_norm = h / img_h

    if not (0 <= x_center <= 1 and 0 <= y_center <= 1 and 0 < w_norm <= 1 and 0 < h_norm <= 1):
        return "unknown", (
            f"normalized coordinates out of bounds in {filename}: "
            f"{x_center}, {y_center}, {w_norm}, {h_norm}, marking as Unknown"
        )

    return "ok", (label, x_center, y_center, w_norm, h_norm)


def convert_csv_to_yolo(csv_folder=project_root / "Annotation_CSV_Files",
                       image_folder=project_root / "Images",
                       output_labels=project_root / "labels"):
    """Convert the annotation CSV files into one YOLO label file per image.

    All CSV files in ``csv_folder`` are concatenated. The class of each row is
    the normalised ``candy_type`` from its ``region_attributes`` JSON
    ('Unknown' when that cannot be parsed). The class list (known classes
    first, 'Unknown' last) is written to ``classes.txt`` in the project root.
    For every image that can be opened, each row is written as
    ``class_id x_center y_center width height`` in normalised units. Rows
    whose geometry is missing or invalid are written as an 'Unknown' box
    produced by ``create_default_unknown_box`` instead of being dropped.

    Args:
        csv_folder: Directory containing the annotation CSV files.
        image_folder: Directory containing the annotated images.
        output_labels: Directory the ``.txt`` label files are written to
            (created if necessary).

    Returns:
        True when at least one label file was written, False otherwise
        (including when no CSV could be loaded or class extraction failed).
    """
    print("converting CSV annotations to YOLO format")

    output_labels.mkdir(parents=True, exist_ok=True)

    if not csv_folder.exists():
        print(f"CSV folder not found: {csv_folder}")
        return False

    # sorted() keeps the load order independent of directory listing order
    csv_files = sorted(csv_folder.glob("*.csv"))
    if not csv_files:
        print("no CSV files found")
        return False

    print(f"found {len(csv_files)} CSV files")

    df_list = []
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            if df.empty or df.columns.size == 0:
                print(f"skipping empty file: {csv_file.name}")
                continue
            df_list.append(df)
            print(f"loaded {csv_file.name}")
        except pd.errors.EmptyDataError:
            print(f"skipping empty file: {csv_file.name}")
            continue
        except Exception as e:
            print(f"error loading {csv_file.name}: {e}")

    if not df_list:
        print("no valid CSV files loaded")
        return False

    df = pd.concat(df_list, ignore_index=True)
    print(f"total annotations: {len(df)}")

    try:
        # class extraction with normalization
        def extract_and_normalize_class(region_attr):
            try:
                parsed = json.loads(region_attr)
                raw_class = parsed.get('candy_type', 'Unknown')
                return normalize_class_name(raw_class)
            except Exception:
                # NaN cells, malformed JSON or non-dict JSON all end up here
                return 'Unknown'

        df['class'] = df['region_attributes'].apply(extract_and_normalize_class)

        # remove any empty or invalid classes
        df = df[df['class'].notna() & (df['class'] != '')]

        actual_classes = sorted(df['class'].unique())
        print(f"normalized classes found: {actual_classes}")

        # Preferred id order: the classes known from the CSV data first, then
        # other common candy names, then anything else in sorted order.
        preferred_order = [
            'Crunch', 'Baby_Ruth', 'Butterfinger', '3_Musketeers', 'Milky_Way',
            'Snickers', 'Twix', '100_Grand', 'Midnight_Milky_Way'
        ]
        class_names = [cls for cls in preferred_order if cls in actual_classes]
        class_names += [
            cls for cls in actual_classes
            if cls not in class_names and cls != 'Unknown'
        ]

        # Unknown as the last class
        class_names.append('Unknown')

        class_to_id = {name: i for i, name in enumerate(class_names)}

        print(f"final {len(class_names)} classes: {class_names}")
        print(f"class mapping: {class_to_id}")

        with open(project_root / "classes.txt", "w") as f:
            for name in class_names:
                f.write(name + "\n")

        print("created classes.txt")

    except Exception as e:
        print(f"error extracting classes: {e}")
        return False

    converted_count = 0
    skipped_count = 0
    unknown_dimension_count = 0
    class_distribution = Counter()

    # groupby(sort=False) visits images in first-appearance order, selects each
    # image's rows once, and leaves out rows without a filename.
    for filename, rows in df.groupby('filename', sort=False):
        img_path = image_folder / filename
        label_path = output_labels / (img_path.stem + ".txt")

        if not img_path.exists():
            print(f"image not found: {img_path}")
            skipped_count += 1
            continue

        try:
            # context manager releases the file handle once the size is known
            with Image.open(img_path) as img:
                img_w, img_h = img.size
        except Exception as e:
            print(f"failed to open image {filename}: {e}")
            skipped_count += 1
            continue

        label_lines = []
        for _, row in rows.iterrows():
            try:
                status, payload = _validate_annotation_row(row, filename, img_w, img_h, class_to_id)
            except Exception as e:
                # when there is any error, create Unknown annotation
                status = "unknown"
                payload = f"error processing annotation in {filename}: {e}, marking as Unknown"

            if status == "ok":
                label, x_center, y_center, w_norm, h_norm = payload
            else:
                print(payload)
                if status == "skip":
                    continue
                label = 'Unknown'
                x_center, y_center, w_norm, h_norm = create_default_unknown_box()
                unknown_dimension_count += 1

            # Count each written line exactly once, under the class it was
            # actually written with.
            class_distribution[label] += 1
            label_lines.append(
                f"{class_to_id[label]} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}\n"
            )

        if label_lines:
            with open(label_path, "w") as f:
                f.writelines(label_lines)
            converted_count += 1
        elif label_path.exists():
            # nothing to write: make sure no stale label file is left behind
            label_path.unlink()

    print(f"converted {converted_count} image-label pairs, skipped {skipped_count}")
    print(f"created {unknown_dimension_count} Unknown class annotations from missing/invalid dimensions")
    print(f"class distribution: {dict(class_distribution)}")
    return converted_count > 0

In [62]:
# function to check for missing or unusable dimensions in the annotation CSVs
def analyze_csv_for_dimension_issues(csv_folder=project_root / "Annotation_CSV_Files"):
    """Report annotations whose bounding-box geometry is missing or unusable.

    Each row's ``region_shape_attributes`` is classified as empty, missing
    fields / invalid JSON, invalid values, or valid. The value checks use the
    same pixel rules as ``convert_csv_to_yolo`` (non-positive size, negative
    origin, side length of 5 px or less). Boxes that extend beyond the image
    are not detected here because the images are not opened.

    Args:
        csv_folder: Directory containing the annotation CSV files.

    Returns:
        False when the folder does not exist or contains no CSV files,
        True otherwise (after printing the report).
    """
    if not csv_folder.exists():
        print(f"CSV folder not found: {csv_folder}")
        return False

    # sorted() keeps the report order independent of directory listing order
    csv_files = sorted(csv_folder.glob("*.csv"))
    if not csv_files:
        print("no CSV files found")
        return False

    total_annotations = 0
    missing_dimensions = 0
    invalid_dimensions = 0
    empty_shape_attrs = 0
    valid_dimensions = 0

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            if df.empty:
                continue

            print(f"\nanalyzing {csv_file.name}:")

            # row_number is the 1-based data row within this file, so a
            # reported row can be looked up directly in the CSV named above
            for row_number, (_, row) in enumerate(df.iterrows(), start=1):
                total_annotations += 1

                # check region_shape_attributes
                shape_str = row.get('region_shape_attributes', '')

                try:
                    if pd.isna(shape_str) or shape_str == '{}' or shape_str.strip() == '':
                        empty_shape_attrs += 1
                        print(f"  empty shape attributes in row {row_number}")
                        continue

                    shape = json.loads(shape_str)

                    # check for required fields
                    required_fields = ['x', 'y', 'width', 'height']
                    missing_fields = [field for field in required_fields if field not in shape]

                    if missing_fields:
                        missing_dimensions += 1
                        print(f"  missing fields {missing_fields} in row {row_number}")
                        continue

                    # check for invalid values
                    x, y, w, h = shape['x'], shape['y'], shape['width'], shape['height']

                    if any(val is None for val in [x, y, w, h]):
                        invalid_dimensions += 1
                        print(f"  null dimension values in row {row_number}")
                        continue

                    if w <= 0 or h <= 0 or x < 0 or y < 0:
                        invalid_dimensions += 1
                        print(f"  invalid dimensions (x={x}, y={y}, w={w}, h={h}) in row {row_number}")
                        continue

                    # boxes of 5 px or less are also rejected by the converter
                    if w <= 5 or h <= 5:
                        invalid_dimensions += 1
                        print(f"  box too small (w={w}, h={h}) in row {row_number}")
                        continue

                    valid_dimensions += 1

                except json.JSONDecodeError:
                    missing_dimensions += 1
                    print(f"  invalid JSON in shape attributes, row {row_number}")
                except Exception as e:
                    # e.g. a non-string cell or non-numeric dimension values;
                    # counted per row instead of aborting the whole file
                    invalid_dimensions += 1
                    print(f"  error parsing dimensions in row {row_number}: {e}")

        except Exception as e:
            print(f"error reading {csv_file.name}: {e}")

    print(f"total annotations: {total_annotations}")
    print(f"valid dimensions: {valid_dimensions}")
    print(f"empty shape attributes: {empty_shape_attrs}")
    print(f"missing dimension fields: {missing_dimensions}")
    print(f"invalid dimension values: {invalid_dimensions}")

    issues_found = empty_shape_attrs + missing_dimensions + invalid_dimensions
    print(f"total issues that will be marked as Unknown: {issues_found}")

    # issues_found > 0 implies total_annotations > 0, so the division is safe
    if issues_found > 0:
        percentage = (issues_found / total_annotations) * 100
        print(f"percentage of annotations that will become Unknown: {percentage:.1f}%")

    return True

In [63]:
def create_train_val_split(train_ratio=0.8, root_dir=project_root):
    """Copy matched image/label pairs into disjoint train and validation folders.

    Images in ``<root_dir>/Images`` are paired with label files in
    ``<root_dir>/labels`` by file stem. Each pair is assigned to exactly one
    of the two splits, class by class, so that roughly ``train_ratio`` of the
    files containing each class end up in training. Pairs whose label file is
    empty or unreadable are split by the same ratio afterwards.

    Args:
        train_ratio: fraction of files per class that should go to training.
        root_dir: project directory containing ``Images``, ``labels`` and
            ``dataset``.

    Returns:
        True when at least one pair was copied into each split, False when the
        input directories are missing, no pairs match, or one split is empty.
    """
    print("creating train/validation split with class balancing")

    images_dir = root_dir / "Images"
    labels_dir = root_dir / "labels"
    train_img_dir = root_dir / "dataset/images/train"
    val_img_dir = root_dir / "dataset/images/val"
    train_label_dir = root_dir / "dataset/labels/train"
    val_label_dir = root_dir / "dataset/labels/val"

    if not images_dir.exists() or not labels_dir.exists():
        print("images or labels directory not found")
        return False

    # file matching logic: index both folders by stem. Sorting makes the
    # seeded split independent of the filesystem's listing order.
    image_files = {
        img_file.stem: img_file.name
        for img_file in sorted(images_dir.glob("*"))
        if img_file.suffix.lower() in (".jpg", ".jpeg", ".png")
    }
    label_files = {
        label_file.stem: label_file.name
        for label_file in sorted(labels_dir.glob("*.txt"))
    }

    print(f"found {len(image_files)} images and {len(label_files)} labels")

    # match pairs and record which pairs contain each class
    matched_pairs = []
    class_to_files = {}

    for stem, image_name in image_files.items():
        label_name = label_files.get(stem)
        if label_name is None:
            continue

        label_path = labels_dir / label_name
        classes_in_image = set()
        try:
            with open(label_path, 'r') as f:
                for line in f:
                    parts = line.split()
                    if parts:
                        classes_in_image.add(int(parts[0]))
        except Exception as e:
            # keep the pair, but treat it as having no known classes
            print(f"Error reading {label_path}: {e}")
            classes_in_image = set()

        pair_index = len(matched_pairs)
        matched_pairs.append((image_name, label_name, classes_in_image))
        for class_id in classes_in_image:
            class_to_files.setdefault(class_id, []).append(pair_index)

    print(f"found {len(matched_pairs)} matching image-label pairs")
    print(f"class distribution in files: {[(k, len(v)) for k, v in class_to_files.items()]}")

    if len(matched_pairs) == 0:
        print("no matching pairs found")
        return False

    # A private generator keeps the split reproducible without reseeding the
    # module-level RNG that other code may rely on.
    rng = random.Random(42)
    train_indices = set()
    val_indices = set()

    # Rarest classes first: they have the fewest files to choose from, so they
    # get first pick. A file that was already placed by an earlier class keeps
    # its split, which guarantees train and val never share an image.
    class_order = sorted(class_to_files, key=lambda c: (len(class_to_files[c]), c))
    for class_id in class_order:
        file_indices = class_to_files[class_id]
        target_val = len(file_indices) - int(len(file_indices) * train_ratio)
        already_val = sum(1 for i in file_indices if i in val_indices)

        unassigned = [
            i for i in file_indices
            if i not in train_indices and i not in val_indices
        ]
        rng.shuffle(unassigned)

        need_val = max(0, target_val - already_val)
        val_indices.update(unassigned[:need_val])
        train_indices.update(unassigned[need_val:])

    # distribute pairs that contain no class at all
    remaining_list = sorted(set(range(len(matched_pairs))) - train_indices - val_indices)
    rng.shuffle(remaining_list)
    split_point = int(len(remaining_list) * train_ratio)
    train_indices.update(remaining_list[:split_point])
    val_indices.update(remaining_list[split_point:])

    train_pairs = [matched_pairs[i] for i in sorted(train_indices)]
    val_pairs = [matched_pairs[i] for i in sorted(val_indices)]

    print(f"stratified split: {len(train_pairs)} training, {len(val_pairs)} validation")

    # clear existing files first (creating the folders if needed and leaving
    # any subdirectories alone, since unlink() cannot remove them)
    for dir_path in [train_img_dir, val_img_dir, train_label_dir, val_label_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)
        for file in dir_path.glob("*"):
            if file.is_file():
                file.unlink()

    def _copy_pairs(pairs, img_dir, label_dir, split_name):
        """Copy each image/label pair and return how many pairs succeeded."""
        copied = 0
        for img_file, label_file, _ in pairs:
            try:
                shutil.copy2(images_dir / img_file, img_dir / img_file)
                shutil.copy2(labels_dir / label_file, label_dir / label_file)
                copied += 1
            except Exception as e:
                print(f"failed to copy {split_name} pair {img_file}: {e}")
        return copied

    train_success = _copy_pairs(train_pairs, train_img_dir, train_label_dir, "training")
    val_success = _copy_pairs(val_pairs, val_img_dir, val_label_dir, "validation")

    print(f"successfully created {train_success} training and {val_success} validation pairs")
    return train_success > 0 and val_success > 0

In [64]:
def create_data_yaml(root_dir):
    """Write ``data.yaml`` for training from the names listed in ``classes.txt``.

    Args:
        root_dir: project directory (``Path`` or ``str``) that contains
            ``classes.txt`` and the ``dataset`` folder; ``data.yaml`` is
            written into it.

    Returns:
        True when ``data.yaml`` was written, False when ``classes.txt`` is
        missing or lists no class names.
    """
    print("creating data.yaml")

    root_dir = Path(root_dir)
    classes_file = root_dir / "classes.txt"

    try:
        with open(classes_file, "r") as f:
            # one class name per line; blank lines are ignored
            class_names = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"{classes_file} not found!")
        return False

    # A config with zero classes cannot describe a usable dataset, so report
    # failure instead of writing nc: 0 and claiming success.
    if not class_names:
        print(f"{classes_file} contains no class names")
        return False

    data_content = {
        'path': str(root_dir / 'dataset'),
        'train': 'images/train',
        'val': 'images/val',
        'nc': len(class_names),
        'names': class_names
    }

    output_yaml = root_dir / 'data.yaml'
    with open(output_yaml, 'w') as f:
        # sort_keys=False keeps the key order defined above
        yaml.dump(data_content, f, default_flow_style=False, sort_keys=False)

    print(f"created data.yaml with {len(class_names)} classes")
    print(f"classes: {class_names}")
    return True

In [65]:
def train_model(epochs=100, img_size=640, batch_size=16, root_dir=project_root):
    """Fine-tune the pretrained ``yolov8s.pt`` weights on ``<root_dir>/data.yaml``.

    Args:
        epochs: number of training epochs.
        img_size: training image size, passed as ``imgsz``.
        batch_size: batch size, passed as ``batch``.
        root_dir: project directory that contains ``data.yaml``.

    Returns:
        True when training finished, False when ``data.yaml`` is missing or
        training raised an exception.
    """
    print("starting model training")

    data_yaml = root_dir / "data.yaml"

    if not data_yaml.exists():
        print(f"{data_yaml} not found")
        return False

    try:
        # YOLOv8s for better accuracy than nano
        model = YOLO('yolov8s.pt')

        # training parameters
        model.train(
            data=str(data_yaml),
            epochs=epochs,
            imgsz=img_size,
            batch=batch_size,
            name='candy_detection',
            patience=40,
            save=True,
            plots=True,
            verbose=True,
            # learning rate schedule
            lr0=0.01,
            lrf=0.01,
            # regularization
            weight_decay=0.0005,
            momentum=0.937,
            # data augmentation for better generalization
            hsv_h=0.015,
            hsv_s=0.7,
            hsv_v=0.4,
            degrees=10.0,
            translate=0.1,
            scale=0.5,
            shear=0.0,
            perspective=0.0,
            flipud=0.0,
            fliplr=0.5,
            mosaic=1.0,
            mixup=0.1,
            copy_paste=0.1,
            # loss gains (weights of the classification and box loss terms).
            # NOTE(review): Ultralytics documents a default of 7.5 for `box`;
            # 0.05 looks like a YOLOv5-era value -- confirm this is intended.
            cls=0.5,
            box=0.05,
            # NOTE(review): conf/iou are prediction/validation thresholds;
            # presumably they only affect the validation pass here, not which
            # training examples are used -- confirm.
            conf=0.001,
            iou=0.6,
        )

        # Report the directory the trainer actually used when it is available,
        # since the hard-coded path is only right for one particular run name.
        trainer = getattr(model, "trainer", None)
        save_dir = getattr(trainer, "save_dir", None)

        print("training completed")
        if save_dir is not None:
            print(f"results saved in: {save_dir}")
        else:
            print(f"results saved in: runs/detect/candy_detection/")
        return True

    except Exception as e:
        print(f"training failed: {e}")
        return False

In [66]:
# check for existing trained model
def find_best_model(runs_dir="runs/detect"):
    """Locate ``weights/best.pt`` inside the most recently modified run folder.

    Args:
        runs_dir: directory whose immediate children are training runs.

    Returns:
        The path to ``best.pt`` as a string, or None when ``runs_dir`` does
        not exist or no child contains ``weights/best.pt``.
    """
    root = Path(runs_dir)
    if not root.exists():
        print(f"no runs directory found at {root}")
        return None

    # keep only the entries that actually hold trained weights
    candidates = []
    for entry in root.iterdir():
        if (entry / "weights" / "best.pt").exists():
            candidates.append(entry)

    if not candidates:
        print("no trained model found")
        return None

    # newest run = largest modification time of the run folder itself
    newest_run = max(candidates, key=lambda entry: entry.stat().st_mtime)
    weights_file = newest_run / "weights" / "best.pt"
    print(f"Found model: {weights_file}")
    return str(weights_file)

In [67]:
def test_single_image(image_path, model_path=None, conf_threshold=0.25, unknown_threshold=0.5):
    
    print(f"testing detection on: {image_path}")
    
    if model_path is None:
        model_path = find_best_model()
        if model_path is None:
            return False
    
    if not os.path.exists(image_path):
        print(f"image not found: {image_path}")
        return False
    
    try:
        model = YOLO(model_path)
        print(f"model loaded: {model_path}")
        
        # class names
        class_names = model.names
        print(f"model classes: {class_names}")
        
    except Exception as e:
        print(f"error loading model: {e}")
        return False
    
    try:
        image = cv2.imread(image_path)
        if image is None:
            print("could not load image")
            return False
        
        height, width = image.shape[:2]
        print(f"image size: {width}x{height}")
        
    except Exception as e:
        print(f"error loading image: {e}")
        return False
    
    try:
        # run detection with multiple confidence levels for comparison
        results_low = model(image, conf=0.1)  # very low for comprehensive detection
        results_medium = model(image, conf=conf_threshold)
        results_high = model(image, conf=unknown_threshold)
        
        # process results with confidence-based classification
        final_detections = []
        result_image = image.copy()
        
        # use medium confidence results as base
        for result in results_medium:
            if hasattr(result, 'boxes') and result.boxes is not None:
                for box in result.boxes:
                    if box.xyxy is None or box.conf is None or box.cls is None:
                        continue
                    
                    x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                    conf = float(box.conf[0].cpu().numpy())
                    class_id = int(box.cls[0].cpu().numpy())
                    class_name = class_names.get(class_id, f"Class_{class_id}")
                    
                    # unknown detection logic
                    display_name = class_name
                    is_uncertain = False
                    color = (0, 255, 0)  # default green
                    
                    if class_name == "Unknown":
                        display_name = "Unknown"
                        color = (0, 0, 255)  # red for unknown
                    elif conf < unknown_threshold and class_name != "Unknown":
                        display_name = f"Uncertain_{class_name}"
                        is_uncertain = True
                        color = (0, 165, 255)  # orange for uncertain
                    else:
                        # high confidence known class
                        colors = [(0, 255, 0), (255, 0, 0), (0, 255, 255), (255, 255, 0), 
                                 (255, 0, 255), (128, 0, 128), (255, 165, 0), (0, 128, 255)]
                        color = colors[class_id % len(colors)]
                    
                    final_detections.append({
                        'class': display_name,
                        'original_class': class_name,
                        'confidence': conf,
                        'box': [x1, y1, x2, y2],
                        'uncertain': is_uncertain,
                        'class_id': class_id
                    })
                    
                    # draw bounding box
                    cv2.rectangle(result_image, (x1, y1), (x2, y2), color, 3)
                    
                    # label with better positioning
                    label = f"{display_name} {conf:.2f}"
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    font_scale = 0.6
                    thickness = 2
                    
                    (text_width, text_height), baseline = cv2.getTextSize(label, font, font_scale, thickness)
                    
                    # position label above box, or below if near top
                    label_y = y1 - 10 if y1 - text_height - 10 > 0 else y2 + text_height + 10
                    
                    # draw background rectangle for text
                    cv2.rectangle(result_image, 
                                (x1, label_y - text_height - 5), 
                                (x1 + text_width, label_y + 5), 
                                color, -1)
                    
                    # draw text
                    cv2.putText(result_image, label, (x1, label_y), 
                              font, font_scale, (255, 255, 255), thickness)
        
        # analysis
        if len(final_detections) == 0:
            print("no detections found:")
            print("1. try lowering confidence threshold")
            print("2. check if model was trained on similar candy types")
            print("3. image may contain only unknown/new candy types")
        else:
            print(f"\nfound {len(final_detections)} detections:")
            
            # group by class for better analysis
            class_counts = {}
            for det in final_detections:
                key = det['class']
                if key not in class_counts:
                    class_counts[key] = []
                class_counts[key].append(det['confidence'])
            
            for class_name, confidences in class_counts.items():
                avg_conf = sum(confidences) / len(confidences)
                print(f"  {class_name}: {len(confidences)} instances (avg conf: {avg_conf:.3f})")
            
            # detailed detection list
            print("\ndetailed detections:")
            for i, det in enumerate(final_detections):
                status = " [UNCERTAIN]" if det.get('uncertain', False) else ""
                print(f"  {i+1}. {det['class']} ({det['confidence']:.3f}){status}")

        # result
        os.makedirs("test_results", exist_ok=True)
        output_path = Path("test_results") / f"detection_{Path(image_path).stem}.jpg"
        cv2.imwrite(str(output_path), result_image)
        print(f"\nresult saved to: {output_path.resolve()}")
        
        return True, final_detections, result_image
        
    except Exception as e:
        print(f"error during detection: {e}")
        return False, [], None

In [68]:
def analyze_model_classes(model_path=None):
    """Print the id-to-name class mapping stored in a trained model.

    Args:
        model_path: weights file to load; when None, ``find_best_model()`` is
            used.

    Returns:
        The model's ``names`` mapping, or False when no model is available or
        loading it fails.
    """
    if model_path is None:
        model_path = find_best_model()
    if model_path is None:
        return False

    try:
        names_by_id = YOLO(model_path).names

        print(f"\nmodel classes analysis:")
        print(f"total classes: {len(names_by_id)}")
        print("class mapping:")
        for idx, label in names_by_id.items():
            print(f"  {idx}: {label}")
    except Exception as e:
        print(f"error analyzing model: {e}")
        return False

    return names_by_id

In [72]:
def test_multiple_images(image_folder="Images", model_path=None, conf_threshold=0.25):    
    image_folder = Path(image_folder)
    
    print(f"testing detection on images in: {image_folder}")
    
    if model_path is None:
        model_path = find_best_model()
        if model_path is None:
            return False
    
    try:
        model = YOLO(model_path)
        class_names = model.names
        print(f"model loaded with {len(class_names)} classes: {list(class_names.values())}")
    except Exception as e:
        print(f"error loading model: {e}")
        return False
    
    if not image_folder.exists() or not image_folder.is_dir():
        print(f"image folder does not exist: {image_folder}")
        return False
    
    # all image files
    image_files = []
    for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
        image_files.extend(image_folder.glob(f"*{ext}"))
    
    if not image_files:
        print(f"no images found in {image_folder}")
        return False
    
    print(f"found {len(image_files)} images")
    
    # analysis
    total_detections = 0
    processed_count = 0
    class_counts = Counter()
    confidence_scores = []
    images_with_detections = 0
    
    # test on more images for better statistics
    test_images = image_files[:50] if len(image_files) > 50 else image_files
    
    for img_path in test_images:
        try:
            image = cv2.imread(str(img_path))
            if image is None:
                print(f"could not load image {img_path}")
                continue
            
            results = model(image, conf=conf_threshold)
            
            image_detections = 0
            image_classes = []
            
            for result in results:
                if hasattr(result, 'boxes') and result.boxes is not None:
                    for box in result.boxes:
                        class_id = int(box.cls[0].cpu().numpy())
                        class_name = class_names.get(class_id, f"Class_{class_id}")
                        conf = float(box.conf[0].cpu().numpy())
                        
                        # Apply same uncertainty logic as single image test
                        if class_name != "Unknown" and conf < 0.5:
                            display_name = f"Uncertain_{class_name}"
                        else:
                            display_name = class_name
                        
                        image_classes.append(f"{display_name}({conf:.2f})")
                        class_counts[display_name] += 1
                        confidence_scores.append(conf)
                        image_detections += 1
            
            total_detections += image_detections
            processed_count += 1
            
            if image_detections > 0:
                images_with_detections += 1
                print(f"{img_path.name}: {image_detections} detections - {', '.join(image_classes)}")
            else:
                print(f"{img_path.name}: no detections")
        
        except Exception as e:
            print(f"error processing {img_path.name}: {e}")
    
    # summary
    print(f"images processed: {processed_count}")
    print(f"images with detections: {images_with_detections}")
    print(f"total detections: {total_detections}")
    print(f"average detections per image: {total_detections/processed_count:.2f}")
    print(f"detection rate: {images_with_detections/processed_count*100:.1f}%")
    
    if confidence_scores:
        avg_conf = sum(confidence_scores) / len(confidence_scores)
        print(f"average confidence: {avg_conf:.3f}")
        print(f"confidence range: {min(confidence_scores):.3f} - {max(confidence_scores):.3f}")
    
    print(f"\nclass distribution:")
    for class_name, count in class_counts.most_common():
        percentage = count / total_detections * 100 if total_detections > 0 else 0
        print(f"  {class_name}: {count} ({percentage:.1f}%)")
    
    return True

In [73]:
def validate_training_data(root_dir=project_root):
    """Print file counts and the training class distribution of the dataset.

    Checks that the four ``dataset/{images,labels}/{train,val}`` directories
    exist, counts images and label files, and tallies the class ids found in
    the training labels against the names in ``<root_dir>/classes.txt``.

    Args:
        root_dir: project directory containing ``dataset`` and
            ``classes.txt``.

    Returns:
        False when one of the dataset directories is missing, otherwise True.
        A failure while analysing the class distribution is printed and does
        not change the return value.
    """
    train_img_dir = root_dir / "dataset/images/train"
    train_label_dir = root_dir / "dataset/labels/train"
    val_img_dir = root_dir / "dataset/images/val"
    val_label_dir = root_dir / "dataset/labels/val"

    # directories exist
    for dir_path, name in [(train_img_dir, "train images"), (train_label_dir, "train labels"),
                          (val_img_dir, "val images"), (val_label_dir, "val labels")]:
        if not dir_path.exists():
            print(f"error: {name} directory not found: {dir_path}")
            return False

    def _count_images(directory):
        """Count image files only, so stray non-image files are not included."""
        return sum(
            1 for path in directory.glob("*")
            if path.suffix.lower() in (".jpg", ".jpeg", ".png")
        )

    # count files
    train_imgs = _count_images(train_img_dir)
    train_labels = len(list(train_label_dir.glob("*.txt")))
    val_imgs = _count_images(val_img_dir)
    val_labels = len(list(val_label_dir.glob("*.txt")))

    print(f"training: {train_imgs} images, {train_labels} labels")
    print(f"validation: {val_imgs} images, {val_labels} labels")

    # class distribution
    try:
        with open(root_dir / "classes.txt", "r") as f:
            # skip blank lines so line position still equals class id
            class_names = [line.strip() for line in f if line.strip()]

        # One counter slot per class id, so classes that never occur are
        # still represented with a count of zero.
        counts_by_id = [0] * len(class_names)
        out_of_range_ids = Counter()
        total_annotations = 0

        for label_file in train_label_dir.glob("*.txt"):
            with open(label_file, "r") as f:
                for line in f:
                    parts = line.split()
                    if not parts:
                        continue
                    class_id = int(parts[0])
                    # the lower bound matters: a negative id would otherwise
                    # index the list from its end
                    if 0 <= class_id < len(class_names):
                        counts_by_id[class_id] += 1
                    else:
                        out_of_range_ids[class_id] += 1
                    total_annotations += 1

        print(f"\ntraining class distribution ({total_annotations} total annotations):")
        for class_name, count in zip(class_names, counts_by_id):
            percentage = count / total_annotations * 100 if total_annotations > 0 else 0
            print(f"  {class_name}: {count} ({percentage:.1f}%)")

        if out_of_range_ids:
            print(f"  warning: {sum(out_of_range_ids.values())} annotations use class ids "
                  f"not listed in classes.txt: {sorted(out_of_range_ids)}")

        # check class imbalance across ALL listed classes
        if any(counts_by_id):
            max_count = max(counts_by_id)
            min_count = min(counts_by_id)
            imbalance_ratio = max_count / min_count if min_count > 0 else float('inf')

            print(f"\nclass balance analysis:")
            print(f"  max/min ratio: {imbalance_ratio:.2f}")

            missing_classes = [name for name, count in zip(class_names, counts_by_id) if count == 0]
            if missing_classes:
                print(f"  classes with no training annotations: {missing_classes}")

            if imbalance_ratio > 10:
                print("  significant class imbalance detected")
            elif imbalance_ratio > 5:
                print("  moderate class imbalance detected")

    except Exception as e:
        print(f"error analyzing class distribution: {e}")

    return True

In [74]:
if __name__ == "__main__":
    # End-to-end driver: prepare the dataset, then test with the newest
    # trained model if there is one. Training itself is left commented out.

    # step 0 is informational only; its result does not stop the pipeline
    print("step 0: analyzing CSV data")
    analyze_csv_data()

    # Steps that must succeed, as (banner, action, failure message).
    # Lambdas defer each call until its banner has been printed.
    required_steps = [
        ("\nstep 1: organizing dataset files",
         lambda: organize_dataset_files(project_root),
         "failed to organize dataset files"),
        ("\nstep 2: converting CSV to YOLO format",
         lambda: convert_csv_to_yolo(),
         "failed to convert CSV to YOLO format"),
        ("\nstep 3: creating train/validation split",
         lambda: create_train_val_split(),
         "failed to create train/val split"),
        ("\nstep 4: creating data.yaml",
         lambda: create_data_yaml(project_root),
         "failed to create data.yaml"),
    ]

    for banner, run_step, failure_message in required_steps:
        print(banner)
        if not run_step():
            print(failure_message)
            exit(1)

    # step 5 only reports; its return value is not checked
    print("\nstep 5: validating training data")
    validate_training_data(project_root)

    print("\ntraining section")
    # print("to train the model, uncomment:")
    # print("# train_model(epochs=100)")
    # train_model(epochs=100)

    print("\ntesting section")
    model_path = find_best_model()
    if not model_path:
        print("no trained model found")
    else:
        # analyze model classes
        analyze_model_classes(model_path)

        # test single image with detection (only if the sample image exists)
        desktop_image = str(Path.home() / "Desktop" / "image1.jpg")
        if os.path.exists(desktop_image):
            print(f"\ntesting single image: {desktop_image}")
            test_single_image(desktop_image, model_path)

        # test multiple images with analysis
        if Path("Images").exists():
            print(f"\ntesting multiple images...")
            test_multiple_images("Images", model_path)

step 0: analyzing CSV data

analyzing Candy_Project_1930-1981_csv.csv:
  columns: ['filename', 'file_size', 'file_attributes', 'region_count', 'region_id', 'region_shape_attributes', 'region_attributes']
  rows: 201
  sample region_attributes:
    1. raw: 'Crunch' -> normalized: 'Crunch'
    2. raw: 'Butterfingers' -> normalized: 'Butterfinger'
    3. raw: '100_Grand' -> normalized: '100_Grand'

analyzing Candy_Project_2039-2092_csv.csv:
  columns: ['filename', 'file_size', 'file_attributes', 'region_count', 'region_id', 'region_shape_attributes', 'region_attributes']
  rows: 200
  sample region_attributes:
    1. raw: 'Baby_Ruth' -> normalized: 'Baby_Ruth'
    2. raw: '3_Musketeers' -> normalized: '3_Musketeers'
    3. raw: 'Milky_Way' -> normalized: 'Milky_Way'

analyzing Candy_Project_2359-2415_csv.csv:
  columns: ['filename', 'file_size', 'file_attributes', 'region_count', 'region_id', 'region_shape_attributes', 'region_attributes']
  rows: 200
  sample region_attributes:
    1. ra