This notebook reads the output CSV of the apollo toilet app. It then randomly selects a user-defined number of entries from that CSV and copies the corresponding images from an input folder into two output folders. Images whose Response is 'Correct', 'Incorrect', or 'Unsure' are copied to one folder; images labeled 'Dirt' are copied to the other.

In [1]:
import os
import pandas as pd
import shutil
import random

In [2]:
def check_for_duplicates(csv_file):
    """Load a CSV and drop rows whose image basename repeats an earlier row.

    The 'Filename' column is reduced to its basename (directory part ignored)
    and every row whose basename already appeared further up in the file is
    removed, so the first occurrence of each basename is the one that is kept.
    A one-line summary of the result is printed.

    Args:
        csv_file: Path to a CSV file that has a 'Filename' column.

    Returns:
        pandas.DataFrame with the duplicate rows removed. The original row
        index is preserved and the temporary 'basename' helper column is
        dropped again before returning.
    """
    table = pd.read_csv(csv_file)

    # Compare on the bare file name so the same image listed under different
    # directories still counts as a duplicate.
    table['basename'] = table['Filename'].apply(os.path.basename)
    is_repeat = table['basename'].duplicated()
    repeat_count = is_repeat.sum()

    if repeat_count == 0:
        print("The CSV file has no duplicate filenames.")
    else:
        print(f"The CSV file contains {repeat_count} duplicate filenames. Duplicates were deleted from df!!")

    # Keep only first occurrences, then discard the helper column.
    return table.loc[~is_repeat].drop(columns=['basename'])

In [13]:
def process_csv_and_copy_images(csv_file, input_folder, insect_folder, no_insect_folder, num_files, random_state=None):
    """Copy a random sample of the images listed in a CSV into two class folders.

    The CSV is loaded through ``check_for_duplicates`` (so each image basename
    appears once), ``num_files`` rows are drawn at random without replacement,
    and for every drawn row the file with the same basename as the row's
    'Filename' is looked up in ``input_folder``. Rows whose 'Response' is
    'Dirt' are copied to ``no_insect_folder``; rows with any other response
    value are copied to ``insect_folder``. Rows whose image is not present in
    ``input_folder`` are skipped with a warning, so fewer than ``num_files``
    files may end up being copied. A summary of both counts is printed.

    Args:
        csv_file: Path to the CSV file; needs 'Filename' and 'Response' columns.
        input_folder: Folder that contains the image files.
        insect_folder: Destination for images whose response is not 'Dirt'.
            Created if it does not exist.
        no_insect_folder: Destination for images whose response is 'Dirt'.
            Created if it does not exist.
        num_files: Number of CSV rows to sample.
        random_state: Optional seed (or generator) forwarded to
            ``DataFrame.sample`` to make the selection reproducible. The
            default ``None`` keeps the selection non-deterministic.

    Raises:
        ValueError: If the de-duplicated CSV has fewer than ``num_files`` rows.
    """
    df = check_for_duplicates(csv_file)

    # Validate BEFORE sampling: DataFrame.sample raises its own generic error
    # when asked for more rows than exist, so a check placed after it can
    # never run. Report how many rows are actually available instead.
    available = len(df)
    if available < num_files:
        raise ValueError(
            f"Not enough entries to sample {num_files} rows. Only {available} entries found..."
        )

    # Randomly select the specified number of rows
    sampled_df = df.sample(n=num_files, random_state=random_state)

    # Ensure both output folders exist
    os.makedirs(insect_folder, exist_ok=True)
    os.makedirs(no_insect_folder, exist_ok=True)

    dirt = 0
    insect = 0
    for full_path, response in zip(sampled_df['Filename'], sampled_df['Response']):
        # Only the basename is used; the directory recorded in the CSV is ignored.
        filename = os.path.basename(full_path)
        input_file_path = os.path.join(input_folder, filename)

        if not os.path.exists(input_file_path):
            print(f"Warning: File {filename} not found in the input folder.")
            continue

        if response == 'Dirt':
            shutil.copy(input_file_path, no_insect_folder)
            dirt += 1
        else:
            # Every response other than 'Dirt' is treated as an insect image.
            shutil.copy(input_file_path, insect_folder)
            insect += 1

    print(f"Copied {dirt} no_insect images to {no_insect_folder}")
    print(f"Copied {insect} insect images to {insect_folder}")

In [14]:
# Set the file paths and parameters
# NOTE: these are absolute paths on an external volume; adjust them to your own machine before running.
csv_file_path = "/Volumes/T7_Shield/Diopsis_Cameras/App/user_checked_predictions_24_01_25.csv"  # Path to the CSV file
input_images_folder = "/Volumes/T7_Shield/Diopsis_Cameras/RESULTS_2024/all_crops/all_crops"  # Path to the folder containing images
insect_folder = "/Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/insect"  # Path to the folder to save the images with response labels 'Correct', 'Incorrect', or 'Unsure'
no_insect_folder = "/Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/no_insect"  # Path to the folder to save the images with response label 'Dirt'
number_of_files = 10619  # Number of CSV rows to randomly select; rows whose image is missing from the input folder are skipped, so fewer files may be copied

In [15]:
# Sample rows from the CSV and copy the matching images into the two class folders.
process_csv_and_copy_images(
    csv_file=csv_file_path,
    input_folder=input_images_folder,
    insect_folder=insect_folder,
    no_insect_folder=no_insect_folder,
    num_files=number_of_files,
)

The CSV file contains 394 duplicate filenames. Duplicates were deleted from df!!
Copied 143 no_insect images to /Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/no_insect
Copied 2843 insect images to /Volumes/T7_Shield/Diopsis_Cameras/Train_Test_datasets/Apollo_dirt_classifier/dirt_test_new/insect
