In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Dataset input locations and the working (output) root
data_dir = '/kaggle/working/'
images_dir = "/kaggle/input/fvsaugmented/images"
labels_dir = "/kaggle/input/fvsaugmented/labels"

# Output layout: every split maps to [images directory, labels directory]
split_dirs = {
    'train': ['/kaggle/working/train/images', '/kaggle/working/train/labels'],
    'test': ['/kaggle/working/test/images', '/kaggle/working/test/labels'],
    'valid': ['/kaggle/working/valid/images', '/kaggle/working/valid/labels']
}

# Flatten the layout once so creation and verification walk the same list,
# in the same order as the dictionary above.
output_dirs = [path for paths in split_dirs.values() for path in paths]

# Create each output directory; an already existing one is not an error
for path in output_dirs:
    os.makedirs(path, exist_ok=True)

# Report every output directory together with whether it exists now
print("Directories created:")
for path in output_dirs:
    print(path, os.path.exists(path))

def extract_class_from_label(label_file_path):
    """Return the class token of the first annotation in a label file.

    The file is read line by line. Blank (whitespace-only) lines are
    skipped, and the first whitespace-separated token of the first
    non-blank line is returned unchanged, as a string.

    Args:
        label_file_path: Path of the text label file to read.

    Returns:
        The first token of the first non-blank line, or ``None`` when the
        file is empty or contains only blank lines (previously this case
        raised ``IndexError``).
    """
    with open(label_file_path, 'r') as file:
        for line in file:
            tokens = line.split()
            if tokens:
                # Assuming class is the first element of an annotation line
                return tokens[0]
    # No annotation found anywhere in the file
    return None

# Collect all image/label pairs and group them by class.
# The class of a pair is whatever extract_class_from_label returns for its
# label file, so every pair lands in exactly one group.
class_files = defaultdict(list)

IMAGE_EXT = '.jpg'
LABEL_EXT = '.txt'
# Fixed seed so that re-running the cell reproduces the same split.
SPLIT_SEED = 42

# os.listdir returns names in arbitrary order; sorting keeps the input to the
# seeded split identical between runs.
for filename in sorted(os.listdir(images_dir)):
    if not filename.endswith(IMAGE_EXT):
        continue

    # Swap only the trailing extension. str.replace would also rewrite a
    # '.jpg' occurring earlier in the name (e.g. 'a.jpg.rf.123.jpg').
    label_name = filename[:-len(IMAGE_EXT)] + LABEL_EXT
    image_path = os.path.join(images_dir, filename)
    label_path = os.path.join(labels_dir, label_name)

    # Images without a matching label file are left out of every split
    if os.path.isfile(label_path):
        cls = extract_class_from_label(label_path)
        class_files[cls].append((image_path, label_path))

# Split every class separately so each class is spread over all three sets:
# 30% is held out first, then 2/3 of that hold-out becomes test and the rest
# validation (roughly 70% train / 10% valid / 20% test, up to rounding).
train_files, temp_files = [], []  # temp_files is unused; kept so the name still exists
valid_files, test_files = [], []

for cls, files in class_files.items():
    try:
        train, temp = train_test_split(files, test_size=0.3, random_state=SPLIT_SEED)
        valid, test = train_test_split(temp, test_size=2/3, random_state=SPLIT_SEED)
    except ValueError:
        # train_test_split raises ValueError when one side of a split would
        # be empty, which happens for classes with only a few samples.
        # Keep such a class entirely in train instead of aborting the run.
        print(f"Class {cls!r}: only {len(files)} sample(s), too few to split; keeping all in train.")
        train, valid, test = files, [], []

    train_files.extend(train)
    valid_files.extend(valid)
    test_files.extend(test)

# Copy image/label pairs into the directories of one split
def move_files(file_list, split):
    """Copy every (image, label) pair of ``file_list`` into a split's folders.

    Despite the name, the source files are copied with ``shutil.copy`` and
    stay where they are. Each image goes to ``split_dirs[split][0]`` and each
    label to ``split_dirs[split][1]``, keeping the original base name.

    Args:
        file_list: Iterable of ``(image_path, label_path)`` pairs.
        split: Key into ``split_dirs`` selecting the target split.
    """
    for src_image, src_label in file_list:
        # Image first, then its label, each under its own target directory
        image_target = os.path.join(split_dirs[split][0], os.path.basename(src_image))
        shutil.copy(src_image, image_target)

        label_target = os.path.join(split_dirs[split][1], os.path.basename(src_label))
        shutil.copy(src_label, label_target)

    print(f"{len(file_list)} files moved to {split}.")

# Populate the three splits in the order train, valid, test
for split_name, split_pairs in (
    ('train', train_files),
    ('valid', valid_files),
    ('test', test_files),
):
    move_files(split_pairs, split_name)

print("Data splitting complete.")

# Print directory structure and sample files
def print_directory_structure(base_dir, max_files=5):
    """Print an indented tree of ``base_dir`` with a few files per directory.

    Every directory found by ``os.walk`` is printed as ``name/``, indented by
    four spaces per level below ``base_dir``. Under each directory at most
    ``max_files`` file names are listed, followed by a
    ``... (N more files)`` line when files were left out.

    The depth is derived from the path relative to ``base_dir``, so a
    ``base_dir`` given with a trailing separator, or one whose text occurs
    again deeper in the tree, is indented and named correctly.

    Args:
        base_dir: Directory whose tree is printed.
        max_files: Maximum number of file names shown per directory.
            Negative values are treated as 0.
    """
    max_files = max(max_files, 0)
    for root, _dirs, files in os.walk(base_dir):
        # '.' means root is base_dir itself; otherwise one level per component
        rel_path = os.path.relpath(root, base_dir)
        level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator that would leave basename empty
        print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
        subindent = ' ' * 4 * (level + 1)
        for f in files[:max_files]:
            print(f"{subindent}{f}")
        if len(files) > max_files:
            print(f"{subindent}... ({len(files) - max_files} more files)")

# Show the image folder of every split: a heading, then its tree
for heading, split_images_path in (
    ("\nTrain Directory Structure:", '/kaggle/working/train/images'),
    ("\nValidation Directory Structure:", '/kaggle/working/valid/images'),
    ("\nTest Directory Structure:", '/kaggle/working/test/images'),
):
    print(heading)
    print_directory_structure(split_images_path)


Directories created:
/kaggle/working/train/images True
/kaggle/working/train/labels True
/kaggle/working/test/images True
/kaggle/working/test/labels True
/kaggle/working/valid/images True
/kaggle/working/valid/labels True
125383 files moved to train.
17904 files moved to valid.
35871 files moved to test.
Data splitting complete.

Train Directory Structure:
images/
    0078742369440-2-400_jpg.rf.b178f7119fecd625c23303bf3a3b81d6.jpg
    Image_57_jpg.rf.10050a6ffd853fe95dc2daf57cbbd6b5.jpg
    olivee-140-_jpg_1400_2547.jpg
    bell-pepper-rot_109_jpg.rf.9742e3699a6d6dfeb7e0f19af77db0e6.jpg
    C_Class-95-_jpg.rf.7475c9598c35917f053f1a618b4f6025.jpg
    ... (125378 more files)

Validation Directory Structure:
images/
    A_Class-413-_jpg.rf.5effa70d4c377954cfad614150d55a9e.jpg
    573_jpg.rf.d15622b75b94c0a1ef255f3217f7c5b4.jpg
    7610200066947-6-400_jpg.rf.095c098bd6e038da8a09550cc2a3f796.jpg
    621_5_chilli_wb_50_jpg.rf.30c53268e62972b9e3453fa05750ff85.jpg
    custardapple_282_jpg_298