In [11]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Input paths for all image folders
image_folders = [
    "G:/Shahadath/Dataset1/images_part1",
    "G:/Shahadath/Dataset1/images_part2",
    "G:/Shahadath/Dataset1/images_part3",
    "G:/Shahadath/Dataset1/images_part4"
]
labels_folder = "G:/Shahadath/Dataset1/yolov5/labels"

# Output paths
output_base = "G:/Shahadath/Dataset"
train_images_folder = os.path.join(output_base, "train/images")
train_labels_folder = os.path.join(output_base, "train/labels")
val_images_folder = os.path.join(output_base, "val/images")
val_labels_folder = os.path.join(output_base, "val/labels")
test_images_folder = os.path.join(output_base, "test/images")
test_labels_folder = os.path.join(output_base, "test/labels")

# Ensure output directories exist
for folder in [train_images_folder, train_labels_folder, val_images_folder, val_labels_folder, test_images_folder, test_labels_folder]:
    if os.path.exists(folder):
        # Clear out existing files in the directory
        for file in os.listdir(folder):
            file_path = os.path.join(folder, file)
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)  # Remove file or symbolic link
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)  # Remove directory
    else:
        os.makedirs(folder, exist_ok=True)

# Random state for reproducibility
random_state = 28  # We fix a random state so that test and validation dataset have at least one instance of every class. You can change this value to generate a different random split

# Collect all image and label files
image_files = set()
for folder in image_folders:
    image_files.update({os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.png')})
label_files = set(f for f in os.listdir(labels_folder) if f.endswith('.txt'))

# Match image files with label files
valid_image_files = []
valid_label_files = []

for image_path in image_files:
    image_name = os.path.basename(image_path)
    label_name = image_name.replace('.png', '.txt')
    if label_name in label_files:
        valid_image_files.append(image_path)
        valid_label_files.append(os.path.join(labels_folder, label_name))

 
# Split into train (70%), temp (30%) for val and test
train_imgs, temp_imgs, train_lbls, temp_lbls = train_test_split(
    valid_image_files, valid_label_files, test_size=0.3, random_state=random_state
)
# Further split temp into val (20%) and test (10%) proportions
val_imgs, test_imgs, val_lbls, test_lbls = train_test_split(
    temp_imgs, temp_lbls, test_size=(1/3), random_state=random_state
)


# Helper function to copy files
def copy_files(file_list, dest_folder):
    for file_path in file_list:
        dest_path = os.path.join(dest_folder, os.path.basename(file_path))
        shutil.copy(file_path, dest_path)

# Copy files to their respective directories
copy_files(train_imgs, train_images_folder)
copy_files(train_lbls, train_labels_folder)
copy_files(val_imgs, val_images_folder)
copy_files(val_lbls, val_labels_folder)
copy_files(test_imgs, test_images_folder)
copy_files(test_lbls, test_labels_folder)

# Output summary
print(f"Train set: {len(train_imgs)} images, {len(train_lbls)} labels")
print(f"Validation set: {len(val_imgs)} images, {len(val_lbls)} labels")
print(f"Test set: {len(test_imgs)} images, {len(test_lbls)} labels")
print("Data split and copied successfully!")

import os
from collections import Counter

# Class name mapping (index corresponds to the digit in the first column)
class_names = [
    "boneanomaly",
    "bonelesion",
    "foreignbody",
    "fracture",
    "metal",
    "periostealreaction",
    "pronatorsign",
    "softtissue",
    "text"
]

def count_labels(labels_folder):
    # Initialize a counter for the first column
    digit_counts = Counter()

    # Iterate through all text files in the folder
    for file_name in os.listdir(labels_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(labels_folder, file_name)
            with open(file_path, 'r') as file:
                # Read lines and extract the first column
                for line in file:
                    first_column = line.split()[0]  # Split by whitespace and get the first column
                    digit_counts[int(first_column)] += 1

    # Output the counts with class names
    for digit, name in enumerate(class_names):
        print(f"{name}: {digit_counts[digit]} occurrences")

# Paths to the folders
folders = {
    "test": "G:/Shahadath/Dataset/test/labels",
    "val": "G:/Shahadath/Dataset/val/labels"
}

# Process each folder
for folder_name, folder_path in folders.items():
    print(f"\nCounts for {folder_name} folder:")
    count_labels(folder_path)

Train set: 14228 images, 14228 labels
Validation set: 4066 images, 4066 labels
Test set: 2033 images, 2033 labels
Data split and copied successfully!

Counts for test folder:
boneanomaly: 32 occurrences
bonelesion: 7 occurrences
foreignbody: 1 occurrences
fracture: 1799 occurrences
metal: 73 occurrences
periostealreaction: 351 occurrences
pronatorsign: 42 occurrences
softtissue: 53 occurrences
text: 2360 occurrences

Counts for val folder:
boneanomaly: 46 occurrences
bonelesion: 14 occurrences
foreignbody: 2 occurrences
fracture: 3606 occurrences
metal: 163 occurrences
periostealreaction: 693 occurrences
pronatorsign: 102 occurrences
softtissue: 82 occurrences
text: 4718 occurrences
