In [2]:
from google.colab import drive
import os
import shutil
import math
from pathlib import Path

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Source tree (one sub-folder per class) and the tree that receives the split copy
src_base = '/content/drive/MyDrive/data_NG'
dst_base = '/content/drive/MyDrive/data_NG_split'

# Declared image count for each class folder
class_info = {
    'CN7_NG': 85,
    'BLDC-400-F_NG': 157,
    'BLDC-400-B_NG': 210,
    'G2_NG': 976
}

# Make sure <dst_base>/<split>/<class> exists for every split/class pair
splits = ['train', 'val', 'test']
for split in splits:
    for class_name in class_info:
        os.makedirs(os.path.join(dst_base, split, class_name), exist_ok=True)

# Calculate split counts for a 7:1:2 train/val/test ratio
def calculate_split_counts(total_count):
    """Return how many items go to each split for a 7:1:2 ratio.

    Train and val are rounded down; test receives the remainder, so the
    three counts always add up to ``total_count``.

    Args:
        total_count: Number of items in the class (non-negative integer).

    Returns:
        A dict with the keys ``'train'``, ``'val'`` and ``'test'``, inserted
        in that order.
    """
    # Integer arithmetic keeps the floor exact. The float form
    # math.floor(total_count * 0.7) under-counts for some inputs because
    # 0.7 is not exactly representable (90 * 0.7 == 62.99999999999999).
    train = total_count * 7 // 10
    val = total_count // 10
    test = total_count - train - val
    return {'train': train, 'val': val, 'test': test}

# Process each class: copy its files, in sorted name order, into
# consecutive train / val / test slices.
for class_name, total_count in class_info.items():
    src_dir = os.path.join(src_base, class_name)

    # Keep regular files only: os.listdir also returns sub-directories,
    # and shutil.copy2 would fail on one part-way through the copy.
    files = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )

    # Validate the declared count before copying anything, so a stale
    # class_info entry cannot leave a half-populated destination tree.
    if len(files) < total_count:
        raise ValueError(
            f"{src_dir} contains {len(files)} files, "
            f"but class_info declares {total_count}"
        )
    if len(files) > total_count:
        print(
            f"Warning: {src_dir} contains {len(files)} files; only the "
            f"first {total_count} (sorted by name) will be copied"
        )

    # Calculate split counts
    split_counts = calculate_split_counts(total_count)

    # Walk the sorted list once, handing each split the next `count` files
    start = 0
    for split, count in split_counts.items():
        dst_dir = os.path.join(dst_base, split, class_name)
        for name in files[start:start + count]:
            shutil.copy2(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        start += count

# Print distribution summary: for every split, the per-class file count,
# the class share declared in class_info, and the class share in that split.
print("\nDataset Distribution Summary:")
print("-" * 50)
# The declared total does not change between splits, so compute it once
original_total = sum(class_info.values())
for split in splits:
    print(f"\n{split.upper()} Set:")
    # List each class directory once per split instead of once per
    # (class, class) pair; directory listings are the expensive part here.
    counts = {
        c: len(os.listdir(os.path.join(dst_base, split, c)))
        for c in class_info
    }
    total_split = sum(counts.values())
    for class_name, count in counts.items():
        # Report a ratio of 0 rather than dividing by zero for an empty total
        original_ratio = class_info[class_name] / original_total if original_total else 0.0
        split_ratio = count / total_split if total_split else 0.0
        print(f"{class_name}: {count} images (Original ratio: {original_ratio:.3f}, Split ratio: {split_ratio:.3f})")
    print(f"Total {split} images: {total_split}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Dataset Distribution Summary:
--------------------------------------------------

TRAIN Set:
CN7_NG: 59 images (Original ratio: 0.060, Split ratio: 0.059)
BLDC-400-F_NG: 109 images (Original ratio: 0.110, Split ratio: 0.109)
BLDC-400-B_NG: 147 images (Original ratio: 0.147, Split ratio: 0.147)
G2_NG: 683 images (Original ratio: 0.683, Split ratio: 0.684)
Total train images: 998

VAL Set:
CN7_NG: 8 images (Original ratio: 0.060, Split ratio: 0.057)
BLDC-400-F_NG: 15 images (Original ratio: 0.110, Split ratio: 0.106)
BLDC-400-B_NG: 21 images (Original ratio: 0.147, Split ratio: 0.149)
G2_NG: 97 images (Original ratio: 0.683, Split ratio: 0.688)
Total val images: 141

TEST Set:
CN7_NG: 18 images (Original ratio: 0.060, Split ratio: 0.062)
BLDC-400-F_NG: 33 images (Original ratio: 0.110, Split ratio: 0.114)
BLDC-400-B_NG: 42 images (Original ratio: 0.147, Split 