In [None]:
import os
import random 
import pickle
import shutil
import yaml
from pathlib import Path

In [None]:
# download & unzip the dataset
# Fetches the VLM-Vac dataset archive from the Hugging Face Hub with wget and
# extracts it into the current working directory (`-d .`).
# NOTE(review): the configuration cell reads the data from ./VLM-Vac-dataset —
# presumably the archive unpacks into a folder of that name; confirm after extraction.
!wget https://huggingface.co/datasets/rmirjalili/VLM-Vac-dataset/resolve/main/VLM-Vac-dataset.zip
!unzip VLM-Vac-dataset.zip -d .

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================

data_bank_dir = Path.cwd() / 'VLM-Vac-dataset'  # path to VLM-Vac-dataset folder
days_dir = Path.cwd().parent / 'data'  # root folder that receives the per-day datasets
val_images_dir = f'{days_dir}/ValFolder/images'
val_labels_dir = f'{days_dir}/ValFolder/labels'

# Seed Python's `random` module so the per-day counts drawn below start from a
# known generator state and a "Restart & Run All" reproduces the same numbers.
# Set RANDOM_SEED to None to get a different draw on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# =============================================================================
# DAY CONFIGURATIONS - DEFINE DATASET COMPOSITION FOR EACH DAY
# =============================================================================

# Random variation parameters for dataset sizes
top_rand = 5
low_rand = -5

# Nominal image counts before the random variation is added.
COMMON_ITEM_BASE_COUNT = 50    # categories that appear on every surface
SURFACE_ITEM_BASE_COUNT = 30   # categories that are specific to one surface

# Category suffixes; the full category key is "<surface>_<item>".
COMMON_ITEMS = ("crumbs", "NONE", "sprinkles")
GRAY_CARPET_ITEMS = ("pet_feces", "ring", "chair_leg", "socks")
WOOD_ITEMS = ("puzzle", "paperclip", "USB", "nails")
ZIGZAG_CARPET_ITEMS = ("lego", "puzzle", "ring", "USB")


def build_day_config(surface, surface_items):
    """Build the ``{category: image_count}`` mapping for one day.

    Args:
        surface: Surface prefix of the category keys, e.g. ``"wood"``.
        surface_items: Item suffixes that are specific to this surface.

    Returns:
        A dict whose keys are ``f"{surface}_{item}"``. The common items come
        first (base count 50), followed by the surface-specific items (base
        count 30). Every count is the base plus an independent
        ``random.randint(low_rand, top_rand)`` draw, taken in key order.
    """
    config = {}
    for item in COMMON_ITEMS:
        config[f"{surface}_{item}"] = COMMON_ITEM_BASE_COUNT + random.randint(low_rand, top_rand)
    for item in surface_items:
        config[f"{surface}_{item}"] = SURFACE_ITEM_BASE_COUNT + random.randint(low_rand, top_rand)
    return config


# The three surfaces repeat in a fixed cycle. A "repeat" day uses the same
# categories as its predecessor, but its counts are drawn independently.

# Day 1: Gray carpet with various objects
day1 = build_day_config("gray_carpet", GRAY_CARPET_ITEMS)

# Day 2: Wood surface with various objects
day2 = build_day_config("wood", WOOD_ITEMS)

# Day 3: Zigzag carpet with various objects
day3 = build_day_config("zigzag_carpet", ZIGZAG_CARPET_ITEMS)

# Day 4: Repeat of Day 1 configuration (gray carpet)
day4 = build_day_config("gray_carpet", GRAY_CARPET_ITEMS)

# Day 5: Repeat of Day 2 configuration (wood surface)
day5 = build_day_config("wood", WOOD_ITEMS)

# Day 6: Repeat of Day 3 configuration (zigzag carpet)
day6 = build_day_config("zigzag_carpet", ZIGZAG_CARPET_ITEMS)

# Day 7: Repeat of Day 1 configuration (gray carpet)
day7 = build_day_config("gray_carpet", GRAY_CARPET_ITEMS)

# Day 8: Repeat of Day 2 configuration (wood surface)
day8 = build_day_config("wood", WOOD_ITEMS)

# Day 9: Repeat of Day 3 configuration (zigzag carpet)
day9 = build_day_config("zigzag_carpet", ZIGZAG_CARPET_ITEMS)


In [None]:
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def count_images_in_folder(folder_path):
    """Return how many entries of ``folder_path`` have an image file extension.

    Only the entry name is inspected: its extension is compared
    case-insensitively against a fixed list of image suffixes. The listing is
    not recursive.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp')
    return sum(
        1
        for entry_name in os.listdir(folder_path)
        if os.path.splitext(entry_name)[1].lower() in image_suffixes
    )


# Filled by main_folder_image_count(): category folder name -> number of images
dict_num_dataset = {}
def main_folder_image_count(main_folder_path):
    """Print and record the image count of every category in ``main_folder_path``.

    Each entry of ``main_folder_path`` is visited in sorted order. Entries that
    contain an ``images`` sub-directory have their image count printed and
    stored in the module-level ``dict_num_dataset``; all other entries are
    skipped. Nothing is returned.
    """
    for category in sorted(os.listdir(main_folder_path)):
        images_path = os.path.join(main_folder_path, category, 'images')

        # Skip anything that does not have an images/ directory
        if not os.path.isdir(images_path):
            continue

        image_total = count_images_in_folder(images_path)
        print(f"{category} : {image_total} ")
        dict_num_dataset[category] = image_total
            
# Execute the image counting function on the data bank: prints one
# "<category> : <count>" line per category folder and fills dict_num_dataset.
main_folder_image_count(data_bank_dir)

In [None]:
# =============================================================================
# DATASET ORGANIZATION AND SUMMARY CALCULATIONS
# =============================================================================

# Gather the day configurations in chronological order, with matching folder names
days = [
    day1, day2, day3,
    day4, day5, day6,
    day7, day8, day9,
]
day_names = [f"day{number}" for number in range(1, len(days) + 1)]

num_days = len(days)

# Total number of images requested per category, summed over all days
dict_days_sum = {}
for day_config in days:
    for category, requested in day_config.items():
        dict_days_sum[category] = dict_days_sum.get(category, 0) + requested


# =============================================================================
# DISPLAY DAY CONFIGURATIONS
# =============================================================================

# Show every day's category -> image-count mapping so the setup can be checked by eye
for day_label, day_config in zip(day_names, days):
    print('** ', day_label, '  **')
    for key, value in day_config.items():
        print(f"{key}: {value}")
    print('------------------------')



In [None]:
# =============================================================================
# CREATE DIRECTORY STRUCTURE
# =============================================================================
# Create main days directory and subdirectories for each day

def _remove_dir_contents(directory):
    """Delete everything directly inside ``directory`` but keep the directory itself.

    Sub-directories are removed recursively with ``shutil.rmtree``; every other
    entry is removed with ``os.remove``.
    """
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


# WARNING: the whole output root is deleted recursively and recreated, so any
# previous contents of days_dir are lost every time this cell runs.
if os.path.exists(days_dir):
    shutil.rmtree(days_dir)
os.makedirs(days_dir)

# Make sure both validation folders exist ...
validation_dirs = (val_images_dir, val_labels_dir)
for validation_dir in validation_dirs:
    os.makedirs(validation_dir, exist_ok=True)

# ... and are empty. This matters when they are configured outside days_dir;
# inside it they were just recreated and there is nothing to delete.
for validation_dir in validation_dirs:
    _remove_dir_contents(validation_dir)



# NOTE(review): next_day is incremented once per day but is not read anywhere
# in the visible notebook — confirm whether it is still needed.
next_day = 1
for day_name in day_names:
    next_day += 1

    # <days_dir>/<day>/ and its images/ sub-folder; both must not exist yet
    day_root = os.path.join(days_dir, day_name)
    os.makedirs(day_root)
    os.makedirs(os.path.join(day_root, "images"))

    # Dataset description written next to the images. The image paths are
    # relative: train/test use the day's own images folder, val uses the shared
    # ValFolder one level up.
    # NOTE(review): presumably consumed by the YOLO training mentioned later in
    # this notebook — confirm the expected schema.
    datayaml = {
        'train': "images",
        'val': "../ValFolder/images",
        'test': "images",
        'nc': 2,
        'names': ['avoid', 'suck'],
    }

    yaml_path = os.path.join(day_root, 'data.yaml')
    with open(yaml_path, 'w') as yaml_handle:
        yaml.dump(datayaml, yaml_handle, default_flow_style=None)

# =============================================================================
# BUILD IMAGE INVENTORY
# =============================================================================

# Inventory of the data bank: category folder name -> sorted list of image paths
items_dict = {}
for category in os.listdir(data_bank_dir):
    category_images_dir = os.path.join(data_bank_dir, category, 'images')

    # Entries without an images/ directory are not categories; skip them
    if not os.path.isdir(category_images_dir):
        continue

    # Keep only files with an image extension; sorting makes the list
    # independent of the order in which the filesystem enumerates the files
    items_dict[category] = sorted(
        os.path.join(category_images_dir, file_name)
        for file_name in os.listdir(category_images_dir)
        if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
    )

# =============================================================================
# RANDOM IMAGE SELECTION AND ASSIGNMENT
# =============================================================================

days_images = []  # not populated in this cell; kept so the name stays defined
all_days_imgs = []  # all_days_imgs[d] = list of source image paths selected for day d+1

# Process each day
for day_number, day in enumerate(days, start=1):
    day_imgs = []

    # Process each item category within the current day
    for item, item_num in day.items():
        item_images = items_dict.get(item, [])  # images of this category not yet assigned to a day

        # Randomly select the required number of images
        if len(item_images) >= item_num:
            random_images = random.sample(item_images, item_num)
        else:
            # Fall back to everything that is left. Without this assignment the
            # name would be undefined (first category) or would still hold the
            # previous category's selection, which would then be added twice.
            print(
                f"Not enough images for {item} in day{day_number}. "
                f"Requested {item_num}, using the {len(item_images)} available."
            )
            random_images = list(item_images)

        # Remove selected images from the available pool to avoid duplicates
        selected = set(random_images)
        items_dict[item] = [image for image in item_images if image not in selected]

        # Add selected images to the day's image list
        day_imgs.extend(random_images)

    all_days_imgs.append(day_imgs)

# =============================================================================
# COPY IMAGES TO DAY DIRECTORIES
# =============================================================================

# Copy every selected image into the images/ folder of the day it was assigned to
# NOTE(review): files are copied under their original base name, so two selected
# images with the same file name would overwrite each other — presumably names
# are unique across categories; confirm.
for day_index in range(num_days):
    target_dir = os.path.join(days_dir, day_names[day_index], "images")
    for source_path in all_days_imgs[day_index]:
        # copy2 also carries over file metadata such as timestamps
        shutil.copy2(source_path, target_dir)

# =============================================================================
# CREATE ADDITIONAL DIRECTORIES
# =============================================================================

# Ensure the validation folder and its images sub-folder exist (no-op when they already do)
for val_subpath in ("ValFolder", "ValFolder/images"):
    os.makedirs(f"{days_dir}/{val_subpath}", exist_ok=True)


# =============================================================================
# VALIDATION DATASET INITIALIZATION
# =============================================================================

# Copy a few random images from day1 to the validation folder for YOLO initialization
# NOTE(review): only image files are copied; nothing is written to val_labels_dir
# here — confirm the labels are produced elsewhere.
VAL_INIT_IMAGE_COUNT = 5  # number of day1 images copied into the validation folder

day0_val_source_dir = os.path.join(days_dir, 'day1', 'images')
source_dir = day0_val_source_dir
destination_dir = val_images_dir

# os.listdir returns entries in arbitrary order; sort them so the sample below
# depends only on the state of the random generator, not on the filesystem.
all_files = sorted(os.listdir(source_dir))
image_files = [file for file in all_files if file.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]

if len(image_files) < VAL_INIT_IMAGE_COUNT:
    print(f"Not enough images in the source directory. Found {len(image_files)} images.")
else:
    selected_images = random.sample(image_files, VAL_INIT_IMAGE_COUNT)
    for image in selected_images:
        shutil.copy(os.path.join(source_dir, image), destination_dir)