This notebook reads the output CSV of the apollo toilet app and removes rows with duplicate filenames. It then copies a user-defined number of randomly selected images that carry a specific label from an input folder to an output folder.

In [1]:
import os
import pandas as pd
import shutil
import random

In [2]:
def check_for_duplicates(csv_file):
    """Load a CSV and drop rows whose file basename repeats an earlier row.

    Only the last path component of each 'Filename' entry is compared, so two
    rows naming the same file in different folders count as duplicates. The
    first occurrence of every basename is kept; later ones are removed.

    Args:
        csv_file: Path of a CSV file that has a 'Filename' column.

    Returns:
        pandas.DataFrame: The CSV rows without the repeated entries. A column
        called 'basename' is used as scratch space and is never part of the
        result.

    Side effects:
        Prints whether (and how many) duplicate filenames were found.
    """
    # Attach the path-less file name as a temporary helper column
    table = pd.read_csv(csv_file).assign(
        basename=lambda frame: frame['Filename'].apply(os.path.basename)
    )

    # True for every row whose basename already appeared further up
    is_repeat = table['basename'].duplicated()
    num_duplicates = is_repeat.sum()

    message = (
        f"The CSV file contains {num_duplicates} duplicate filenames. Duplicates were deleted from df!!"
        if num_duplicates > 0
        else "The CSV file has no duplicate filenames."
    )
    print(message)

    # Keep first occurrences only and discard the helper column again
    return table.loc[~is_repeat].drop(columns=['basename'])

In [3]:
def process_csv_and_copy_images(csv_file, input_folder, output_folder, num_files, filter_for, random_state=None):
    """Copy a random sample of images with a given label to an output folder.

    The CSV is loaded through ``check_for_duplicates`` (rows with repeated
    file basenames are dropped), filtered to the rows whose 'Response' column
    equals ``filter_for``, and ``num_files`` of those rows are sampled at
    random. For each sampled row, the basename of its 'Filename' entry is
    looked up in ``input_folder`` and, if present, copied to ``output_folder``.

    Args:
        csv_file: Path of the CSV file; must have 'Filename' and 'Response'
            columns.
        input_folder: Folder that is searched for the image files.
        output_folder: Folder the images are copied to; created if missing.
        num_files: Number of rows to sample.
        filter_for: Value the 'Response' column must equal for a row to be
            eligible.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` (default) keeps the
            previous non-deterministic behaviour.

    Raises:
        ValueError: If fewer than ``num_files`` rows match ``filter_for``.

    Side effects:
        Creates ``output_folder``, copies files into it, and prints a warning
        per missing file plus a final summary line.
    """
    df = check_for_duplicates(csv_file)

    # Keep only the rows whose 'Response' label matches the requested one
    filtered_df = df[df['Response'] == filter_for]

    # Check if there are enough rows to sample
    if len(filtered_df) < num_files:
        raise ValueError(f"Not enough {filter_for} entries to sample {num_files} rows. Only {len(filtered_df)} entries found...")

    # Randomly select the specified number of rows (seedable for reproducibility)
    sampled_df = filtered_df.sample(n=num_files, random_state=random_state)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Track what actually happened so the summary does not over-report
    copied = 0
    missing = 0
    for full_path in sampled_df['Filename']:
        # Only the file name is used; the directory recorded in the CSV is ignored
        filename = os.path.basename(full_path)
        input_file_path = os.path.join(input_folder, filename)

        if os.path.exists(input_file_path):
            shutil.copy(input_file_path, output_folder)
            copied += 1
        else:
            print(f"Warning: File {filename} not found in the input folder.")
            missing += 1

    if missing == 0:
        print(f"Successfully copied {copied} files to {output_folder}")
    else:
        # Some sampled files were absent: report the real count instead of num_files
        print(f"Copied {copied} of {num_files} files to {output_folder}; {missing} sampled files were not found in the input folder.")

In [8]:
# Set the file paths and parameters
# NOTE: all three paths are absolute paths under /Volumes/T7_Shield; adjust them before running elsewhere.
csv_file_path = "/Volumes/T7_Shield/Diopsis_Cameras/App/user_checked_predictions_24_01_25.csv"  # Path to the CSV file (read for its 'Filename' and 'Response' columns)
input_images_folder = "/Volumes/T7_Shield/Diopsis_Cameras/RESULTS_2024/all_crops/all_crops"  # Folder searched for the image files named in the CSV
output_images_folder = "/Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/no_insect"  # Folder the sampled images are copied to (created if missing)
number_of_files = 1000  # Number of files to randomly select and copy
filter_for = "Dirt"  # Value the 'Response' column must equal for a row to be sampled

In [9]:
# Sample and copy the images using the paths and parameters configured above
process_csv_and_copy_images(
    csv_file=csv_file_path,
    input_folder=input_images_folder,
    output_folder=output_images_folder,
    num_files=number_of_files,
    filter_for=filter_for,
)

The CSV file contains 394 duplicate filenames. Duplicates were deleted from df!!
Successfully copied 1000 files to /Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/no_insect
