In [1]:
import os
import shutil
import re

def combine_datasets(source_dirs, destination_dir):
    """Merge several dataset folders into one combined dataset.

    Every entry of ``source_dirs`` is read at ``train/labels``,
    ``test/labels``, ``train/images`` and ``test/images``. For each split:

    * every label file is flattened into
      ``<destination_dir>/combined_<split>_labels.txt``: one row per non-blank
      label line, prefixed with ``<label file stem>.jpg``;
    * every file under the images folder is copied to
      ``<destination_dir>/<split>/images`` (relative sub-paths are kept and
      existing files are overwritten).

    Anything with ``.DS_Store`` in its name/path is ignored. Both combined
    label files are truncated and rewritten on every call.

    Args:
        source_dirs: Iterable of dataset root directories, processed in the
            given order (a later source overwrites same-named images copied
            from an earlier one).
        destination_dir: Output directory; created if it does not exist.

    Raises:
        FileNotFoundError: If a source has no ``train/labels`` or
            ``test/labels`` directory (raised by ``os.listdir``). A missing
            images directory is silently skipped because ``os.walk`` yields
            nothing for it.
    """
    os.makedirs(destination_dir, exist_ok=True)

    def copy_with_overwrite(source, destination):
        """Copy every file below ``source`` into ``destination``, overwriting."""
        for root, _dirs, files in os.walk(source):
            for file in files:
                source_path = os.path.join(root, file)
                if ".DS_Store" in source_path:  # Skip files with .DS_Store in their path
                    continue
                relative_path = os.path.relpath(source_path, source)
                target_path = os.path.join(destination, relative_path)
                # Named ``target_*`` so the outer ``destination_dir`` is not shadowed.
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                shutil.copy2(source_path, target_path)

    def merge_label_files(label_dir, combined_labels):
        """Append the rows of every label file in ``label_dir`` to ``combined_labels``."""
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # combined file reproducible between runs and machines.
        for label_file in sorted(os.listdir(label_dir)):
            label_path = os.path.join(label_dir, label_file)
            # Skip .DS_Store entries and anything that is not a regular file
            # (opening a sub-directory would raise).
            if ".DS_Store" in label_file or not os.path.isfile(label_path):
                continue
            image_name, _ = os.path.splitext(label_file)
            image_path = f"{image_name}.jpg"  # Image shares the label file's stem
            with open(label_path, 'r', encoding='latin-1') as f:
                for line in f:
                    annotation = line.strip()
                    # A blank line carries no label; writing it would produce a
                    # row holding only the image name and a trailing space.
                    if annotation:
                        combined_labels.write(f"{image_path} {annotation}\n")

    combined_train_labels_file = os.path.join(destination_dir, 'combined_train_labels.txt')
    combined_test_labels_file = os.path.join(destination_dir, 'combined_test_labels.txt')

    with open(combined_train_labels_file, 'w', encoding='utf-8') as combined_train_labels, \
         open(combined_test_labels_file, 'w', encoding='utf-8') as combined_test_labels:

        outputs = (('train', combined_train_labels), ('test', combined_test_labels))

        for source_dir in source_dirs:
            # Labels first, then images, for both splits of this source.
            for split, combined_labels in outputs:
                merge_label_files(os.path.join(source_dir, split, 'labels'), combined_labels)
            for split, _ in outputs:
                copy_with_overwrite(
                    os.path.join(source_dir, split, 'images'),
                    os.path.join(destination_dir, split, 'images'),
                )

    print("Dataset combination completed.")

# Example usage: merge the four source dataset folders into one combined folder.
source_dirs = [
    "../../Drowsey_Driver_DL_Data/data_objectclassificaiton/1/dl_project.v1i.yolov8",
    "../../Drowsey_Driver_DL_Data/data_objectclassificaiton/2/DrowsyDriverDetection.v1i.yolov8",
    "../../Drowsey_Driver_DL_Data/data_objectclassificaiton/3/dl_images.v1i.yolov8",
    "../../Drowsey_Driver_DL_Data/data_objectclassificaiton/4",
]
destination_dir = '../../Drowsey_Driver_DL_Data/combined_classification_dataset'
combine_datasets(source_dirs=source_dirs, destination_dir=destination_dir)


Dataset combination completed.


In [2]:
import os

def find_missing_paths(label_file, image_directory):
    """List image names referenced in a label file that are not usable files.

    The first whitespace-separated token of every non-blank line in
    ``label_file`` is taken as an image name and looked up relative to
    ``image_directory``.

    Args:
        label_file: Path of the UTF-8 text file to scan.
        image_directory: Directory the image names are resolved against.

    Returns:
        A list with one entry per offending line, in file order: the image
        name when nothing exists at that path, or the name followed by
        ``" (Not a file)"`` when the path exists but is not a regular file.
    """
    missing_paths = []
    with open(label_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: there is no image name to
                # check (indexing [0] here would raise IndexError).
                continue
            image_path = fields[0]  # Assuming the image path is the first item in each line
            image_file = os.path.join(image_directory, image_path)
            if not os.path.exists(image_file):
                missing_paths.append(image_path)
            elif not os.path.isfile(image_file):
                missing_paths.append(image_path + " (Not a file)")
    return missing_paths

# Example usage: run the same check for the test split, then the train split.
for label_file, image_directory, heading in (
    (
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/combined_test_labels.txt',
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/test/images',
        "Paths present in labels but not found among the test images:",
    ),
    (
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/combined_train_labels.txt',
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/train/images',
        "Paths present in labels but not found among the train images:",
    ),
):
    missing_paths = find_missing_paths(label_file, image_directory)
    print(heading)
    for path in missing_paths:
        print(path)

Paths present in labels but not found among the test images:
Paths present in labels but not found among the train images:


In [3]:
def compare_file_counts(image_directory, label_file):
    """Print whether the number of image files equals the number of label lines.

    Only regular files directly inside ``image_directory`` are counted, and
    names containing ``.DS_Store`` are ignored, matching how the rest of this
    notebook treats those entries. Blank lines in ``label_file`` are not
    counted as labels.

    Args:
        image_directory: Directory whose files are counted.
        label_file: Path of the UTF-8 label file whose lines are counted.

    Returns:
        None. The result is reported with ``print``.
    """
    # A sub-directory or a .DS_Store entry is not an image; counting it would
    # trigger a spurious mismatch warning.
    num_images = sum(
        1
        for entry in os.listdir(image_directory)
        if ".DS_Store" not in entry
        and os.path.isfile(os.path.join(image_directory, entry))
    )
    with open(label_file, 'r', encoding='utf-8') as f:
        num_labels = sum(1 for line in f if line.strip())
    if num_images != num_labels:
        print(f"Warning: Number of image files ({num_images}) in {image_directory} does not match the number of label lines ({num_labels}).")
    else:
        print("File counts match")
# Example usage: compare counts for the train split, then for the test split.
for image_directory, label_file in (
    (
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/train/images',
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/combined_train_labels.txt',
    ),
    (
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/test/images',
        '../../Drowsey_Driver_DL_Data/combined_classification_dataset/combined_test_labels.txt',
    ),
):
    compare_file_counts(image_directory, label_file)


File counts match
File counts match
