# We can now merge the GPT labels and filter our image dataset down to the images labeled as high-quality, close-up flower photos.

In [17]:
import pandas as pd
import glob
import os
import re
import shutil
import numpy as np

# First: Merge the GPT responses into one big dataframe

### Access csv files

In [3]:
# use glob to do a pattern matching query: collect every file in ./gpt_raw_labeling/
# whose name matches "*_to_*.csv" (the per-batch GPT response files)
csv_files = glob.glob("./gpt_raw_labeling/*_to_*.csv")

In [4]:
# inspect the matched paths (glob returns them in arbitrary order, so they still need sorting)
csv_files

['./gpt_raw_labeling/31000_to_32000.csv',
 './gpt_raw_labeling/37000_to_38000.csv',
 './gpt_raw_labeling/12000_to_14000.csv',
 './gpt_raw_labeling/34000_to_35000.csv',
 './gpt_raw_labeling/32000_to_33000.csv',
 './gpt_raw_labeling/6000_to_8000.csv',
 './gpt_raw_labeling/0_to_2000.csv',
 './gpt_raw_labeling/29000_to_30000.csv',
 './gpt_raw_labeling/41000_to_41069.csv',
 './gpt_raw_labeling/24000_to_25000.csv',
 './gpt_raw_labeling/22000_to_23000.csv',
 './gpt_raw_labeling/4000_to_6000.csv',
 './gpt_raw_labeling/27000_to_28000.csv',
 './gpt_raw_labeling/21000_to_22000.csv',
 './gpt_raw_labeling/14000_to_16000.csv',
 './gpt_raw_labeling/2000_to_4000.csv',
 './gpt_raw_labeling/33000_to_34000.csv',
 './gpt_raw_labeling/16000_to_18000.csv',
 './gpt_raw_labeling/10000_to_12000.csv',
 './gpt_raw_labeling/25000_to_26000.csv',
 './gpt_raw_labeling/38000_to_39000.csv',
 './gpt_raw_labeling/8000_to_10000.csv',
 './gpt_raw_labeling/39000_to_40000.csv',
 './gpt_raw_labeling/20000_to_21000.csv',
 './

### Sort the files

In [5]:
# sort files by the starting index extracted from the filename.
def get_starting_index(filename):
    """Sort key: the starting index encoded in a '<start>_to_<end>.csv' filename.

    Only the final path component is examined. Returns the leading number as an
    int, or float('inf') when the name does not follow the pattern, so that
    non-matching files sort after every matching one.
    """
    name = os.path.basename(filename)
    # capture the digits before "_to_"
    match = re.match(r"(\d+)_to_\d+\.csv", name)
    return float('inf') if match is None else int(match.group(1))

In [7]:
# sort numerically by starting index so the batches end up in index order
# (a plain string sort would put '10000_to_12000.csv' before '2000_to_4000.csv')
csv_files = sorted(csv_files, key=get_starting_index)
csv_files

['./gpt_raw_labeling/0_to_2000.csv',
 './gpt_raw_labeling/2000_to_4000.csv',
 './gpt_raw_labeling/4000_to_6000.csv',
 './gpt_raw_labeling/6000_to_8000.csv',
 './gpt_raw_labeling/8000_to_10000.csv',
 './gpt_raw_labeling/10000_to_12000.csv',
 './gpt_raw_labeling/12000_to_14000.csv',
 './gpt_raw_labeling/14000_to_16000.csv',
 './gpt_raw_labeling/16000_to_18000.csv',
 './gpt_raw_labeling/18000_to_20000.csv',
 './gpt_raw_labeling/20000_to_21000.csv',
 './gpt_raw_labeling/21000_to_22000.csv',
 './gpt_raw_labeling/22000_to_23000.csv',
 './gpt_raw_labeling/23000_to_24000.csv',
 './gpt_raw_labeling/24000_to_25000.csv',
 './gpt_raw_labeling/25000_to_26000.csv',
 './gpt_raw_labeling/26000_to_27000.csv',
 './gpt_raw_labeling/27000_to_28000.csv',
 './gpt_raw_labeling/28000_to_29000.csv',
 './gpt_raw_labeling/29000_to_30000.csv',
 './gpt_raw_labeling/30000_to_31000.csv',
 './gpt_raw_labeling/31000_to_32000.csv',
 './gpt_raw_labeling/32000_to_33000.csv',
 './gpt_raw_labeling/33000_to_34000.csv',
 './

### Build the dataframe

In [8]:
# read each batch file into its own dataframe, in the order given by csv_files
df_list = list(map(pd.read_csv, csv_files))

# stack the per-batch frames into a single dataframe;
# ignore_index=True renumbers the rows 0..N-1 instead of keeping each file's own index
df_all = pd.concat(df_list, ignore_index=True)

In [9]:
# add an explicit 'global_index' column running from 0 to len(df_all) - 1.
# it duplicates the row index, but stored as a real column it survives filtering and saving.
df_all['global_index'] = range(len(df_all))

# move 'global_index' to the front, keeping the remaining columns in their existing order
other_cols = [name for name in df_all.columns if name != 'global_index']
df_all = df_all[['global_index', *other_cols]]

# preview the merged table and report its size
print(df_all.head())
print("Final DataFrame shape:", df_all.shape)

   global_index flower_present
0             0            YES
1             1             NO
2             2            YES
3             3             NO
4             4            YES
Final DataFrame shape: (41069, 2)


In [10]:
# save the merged labels to a single CSV; index=False leaves out the pandas row index
# (global_index is already stored as a column)
df_all.to_csv('./gpt_image_filtering.csv',index=False)

# Second: Filter the full image dataset using the GPT labels to create a separate dataset of high-quality flower images

### Get the indices of the images that GPT labeled YES (high-quality flower photos).

In [12]:
# boolean mask of the rows whose flower_present label is exactly "YES"
is_yes = df_all['flower_present'].eq("YES")
yes_df = df_all.loc[is_yes]

# the global_index values of those rows (these map to the image file names)
yes_indices = yes_df['global_index'].tolist()

In [14]:
# how many images were labeled YES (the size of the filtered dataset)
len(yes_indices)

20761

### Copy over the corresponding images

In [13]:
# define the source directory where the original dataset of all images is stored
source_dir = '/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/images/'

# define the dest directory where filtered images will end up
target_dir = '/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/gpt_filtered_images/'
os.makedirs(target_dir, exist_ok=True)  # Create the directory if it doesn't exist

# loop through and copy each image; images are stored as "<global_index>.jpg"
missing_paths = []  # sources that could not be found, so the final report is accurate
for idx in yes_indices:
    source_path = os.path.join(source_dir, f"{idx}.jpg")
    target_path = os.path.join(target_dir, f"{idx}.jpg")
    if os.path.exists(source_path):
        shutil.copy(source_path, target_path)
    else:
        missing_paths.append(source_path)
        print(f"Warning: {source_path} does not exist.")

# only report success when every image was actually copied
if missing_paths:
    print(f"Copied {len(yes_indices) - len(missing_paths)} of {len(yes_indices)} images; "
          f"{len(missing_paths)} source files were missing.")
else:
    print("Filtered images copied successfully!")

Filtered images copied successfully!


### Make a directory with segmentation model training images

In [21]:
# rebind source_dir/target_dir for the next copy step:
# the source is now the GPT-filtered image directory, and the target is the
# directory that will hold the semantic segmentation training images
source_dir = '/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/gpt_filtered_images/'
target_dir = '/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/semantic_segmentation_training/'
os.makedirs(target_dir, exist_ok=True)  # make dir if doesn't exist

In [22]:
# draw the training sample from the YES-labeled image indices.
# replace=False matters: sampling with replacement (the default for choice) can pick the
# same image more than once, which would leave fewer than 200 distinct training images.
# a fixed seed makes the selection reproducible after a kernel restart.
rng = np.random.default_rng(42)
n_training_images = min(200, len(yes_indices))  # cannot draw more distinct images than exist
selected_idxs = rng.choice(yes_indices, size=n_training_images, replace=False)
selected_idxs

array([33419, 37864, 11334, 30136, 32800,  8869,   264,  6692, 17831,
       31976,   919, 25643, 11103,  6386,  5732, 27825,   360, 16756,
       25880, 26668,  3246, 14191,  5709, 23297, 32974, 14768,  7193,
       13781,  6476,  3796,  1164,  9148, 25945, 26226, 36251, 14224,
       20917, 26141, 29006,  7636, 36655,    84, 27507, 22132, 32914,
       21043, 37685, 23497,  1416,  8551,   428,  5316,  9242, 15088,
        7212, 37749, 14739, 21768, 33532,  1734,  6921, 17613,  7544,
       38418, 12245, 21661, 12435, 26680, 37485, 16985, 27359,   233,
       19400, 11858, 25440, 10976, 13237,  7491, 40479, 19806, 26306,
        7141, 25386, 17124,  5059, 22187,  3990,  5237, 31554, 28732,
       19193, 31985, 38433, 31039, 12624,  7435, 15422, 13371, 13709,
       34151, 33690, 33246, 11437, 14949, 39978,  1571, 33131, 33354,
       14078,  8268, 23554, 27546, 24899, 20507,  9872, 14400,  3208,
       31496,  5715, 29902, 30360, 23281,  5917, 21682, 38429, 19449,
       19777, 15567,

In [23]:
# loop through the selected indices and copy each image into the training directory
missing_paths = []  # sources that could not be found, so the final report is accurate
for idx in selected_idxs:
    source_path = os.path.join(source_dir, f"{idx}.jpg")
    target_path = os.path.join(target_dir, f"{idx}.jpg")
    if os.path.exists(source_path):
        shutil.copy(source_path, target_path)
    else:
        missing_paths.append(source_path)
        print(f"Warning: {source_path} does not exist.")

# only report success when every selected image was actually copied
if missing_paths:
    print(f"Copied {len(selected_idxs) - len(missing_paths)} of {len(selected_idxs)} images; "
          f"{len(missing_paths)} source files were missing.")
else:
    print("Filtered images copied successfully!")

Filtered images copied successfully!
