# In [11]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Where the raw dataset lives and where the split output is written
dataset_dir = r"C:\Users\HP\Desktop\mm\x\c"  # Update this to your dataset's path
base_dir = r"C:\Users\HP\Desktop\mm\x\allfileshere"  # Base directory for output

# Class names as per the Kaggle dataset (one sub-folder per name)
categories = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']

# Image destinations for the three splits
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, sub_path)
    for sub_path in ('images/train', 'images/val', 'images/test')
)

# Label destinations for the three splits
label_train_dir, label_val_dir, label_test_dir = (
    os.path.join(base_dir, sub_path)
    for sub_path in ('labels/train', 'labels/val', 'labels/test')
)

# Ensure each split folder contains one sub-folder per category
split_roots = (
    train_dir, val_dir, test_dir,
    label_train_dir, label_val_dir, label_test_dir,
)
for split_root in split_roots:
    for class_name in categories:
        target = os.path.join(split_root, class_name)
        os.makedirs(target, exist_ok=True)

# Function to create a label file for YOLO format
def create_label_files(image_file, class_id, output_dir, category):
    """
    Create a placeholder YOLO-format label file for a given image.

    The label file carries the image's base name with its final extension
    swapped for ``.txt`` and is written to ``output_dir/category`` (the
    folder is created if it does not exist yet). It contains a single line
    ``"<class_id> 0.5 0.5 1.0 1.0"``: a box centred on the image and
    covering it entirely. These are fixed placeholder values, not bounding
    boxes computed from the image.

    Parameters:
        image_file (str): The name (or path) of the image file; only its
            base name is used.
        class_id (int): The class ID written at the start of the label line.
        output_dir (str): Root directory for the label files of one split.
        category (str): Sub-folder of ``output_dir`` to write into.
    """
    # Swap only the final extension. A plain str.replace('.jpg', '.txt')
    # would also rewrite '.jpg' occurring in the middle of the name and
    # would leave names such as 'IMG.JPG' unchanged.
    stem, _ext = os.path.splitext(os.path.basename(image_file))

    label_dir = os.path.join(output_dir, category)
    os.makedirs(label_dir, exist_ok=True)
    label_output_path = os.path.join(label_dir, stem + '.txt')

    # Placeholder values for the center coordinates and dimensions
    with open(label_output_path, 'w') as label_file:
        label_file.write(f"{class_id} 0.5 0.5 1.0 1.0\n")

# Function to split and copy data into train, validation, and test sets
def split_data(category, source_dir, train_dir, val_dir, test_dir, label_train_dir, label_val_dir, label_test_dir):
    """
    Split the data for a given category into training, validation, and test sets.

    Lists the ``.jpg`` files in ``source_dir/category``, splits them
    80% / 10% / 10% with a fixed random seed, copies each image into
    ``<split image dir>/category`` and writes a matching label file via
    ``create_label_files`` into ``<split label dir>/category``. The class
    ID is the position of ``category`` in the module-level ``categories``
    list. The image destination folders are expected to exist already.

    Parameters:
        category (str): The category to process.
        source_dir (str): Directory where original images are stored.
        train_dir (str): Directory to save training images.
        val_dir (str): Directory to save validation images.
        test_dir (str): Directory to save test images.
        label_train_dir (str): Directory to save training labels.
        label_val_dir (str): Directory to save validation labels.
        label_test_dir (str): Directory to save test labels.

    Raises:
        ValueError: If ``category`` is not in ``categories``.
    """
    # Resolve the class ID once, before any file is copied, so an unknown
    # category fails without leaving a partially written split behind.
    class_id = categories.index(category)

    category_dir = os.path.join(source_dir, category)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # seeded split below reproducible across runs and machines.
    image_files = sorted(f for f in os.listdir(category_dir) if f.endswith('.jpg'))

    # Split into train (80%), temp (20%) -> then split temp into val (10%) and test (10%)
    train_files, temp_files = train_test_split(image_files, test_size=0.2, random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

    # Each split pairs its file list with its image and label destinations
    splits = (
        (train_files, train_dir, label_train_dir),
        (val_files, val_dir, label_val_dir),
        (test_files, test_dir, label_test_dir),
    )
    for file_set, img_dir, lbl_dir in splits:
        for file in file_set:
            # Copy image to the corresponding directory
            shutil.copy(os.path.join(category_dir, file), os.path.join(img_dir, category, file))

            # Create corresponding label file
            create_label_files(file, class_id, lbl_dir, category)

# Run the train/val/test split for every category in turn
for category in categories:
    split_data(
        category,
        dataset_dir,
        train_dir,
        val_dir,
        test_dir,
        label_train_dir,
        label_val_dir,
        label_test_dir,
    )

# Assemble the contents of dataset.yaml line by line
yaml_lines = [
    f"path: {base_dir}  # root directory of the dataset",
    "train: images/train",
    "val: images/val",
    "test: images/test  # optional",
    f"nc: {len(categories)}  # number of classes",
    f"names: {categories}  # list of class names",
]
yaml_content = "\n".join(yaml_lines) + "\n"

# Save the YAML file at the root of the output directory
yaml_path = os.path.join(base_dir, 'dataset.yaml')
with open(yaml_path, 'w') as yaml_file:
    yaml_file.write(yaml_content)

print("Data split, labels generated, and dataset.yaml created successfully.")


# Output: Data split, labels generated, and dataset.yaml created successfully.
