# make the complete dataframe

This will be the "one dataframe to rule them all" that has all of the image ids together with their gbif idxs and inferred values for hex, hsl, and 

In [1]:
import numpy as np
import pandas as pd
import os
import json
import base64
from matplotlib import pyplot as plt
import cv2
from PIL import Image
from io import BytesIO
import colorsys
from sklearn.cluster import KMeans

# conversion functions
def rgb_to_hex(rgb):
    # rgb should be a tuple like (R, G, B)
    return '#{:02x}{:02x}{:02x}'.format(*rgb)

def hex_to_rgb(hex_str):
    # hex here is just a string
    hex_str = hex_str.lstrip('#')
    return tuple(int(hex_str[i:i+2], 16) for i in (0, 2, 4))

def rgb_to_lab(rgb):
    """convert (r,g,b) in [0,255] to Lab using OpenCV"""
    arr = np.array([[rgb]], dtype=np.uint8)
    lab = cv2.cvtColor(arr, cv2.COLOR_RGB2LAB)[0][0]
    return lab

def lab_to_rgb(lab):
    """convert (L,a,b) in OpenCV's Lab to (r,g,b) in [0,255]"""
    arr = np.array([[lab]], dtype=np.uint8)
    rgb = cv2.cvtColor(arr, cv2.COLOR_LAB2RGB)[0][0]
    return tuple(int(x) for x in rgb)

def hex_to_hsl(hex_code):
    r, g, b = hex_to_rgb(hex_code)
    # convert rgb values to range [0,1]
    r, g, b = [x/255.0 for x in (r, g, b)]
    # colorsys.rgb_to_hls returns (hue, lightness, saturation)
    h, l, s = colorsys.rgb_to_hls(r, g, b)
    return h, s, l

In [2]:
all_filtered_images = os.listdir("/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/gpt_filtered_images/") # the number segmented
len(all_filtered_images)

20767

In [3]:
# there are sometimes annoying temporary files to filter out
len([i for i in all_filtered_images if "_" not in i])

20761

In [4]:
all_filtered_images = [i for i in all_filtered_images if "_" not in i]

In [5]:
# Lists to store results
results = []

for im in all_filtered_images:
    image_path = os.path.join("/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/gpt_filtered_images/", im)
    base_name_no_ext = os.path.splitext(im)[0]
    json_path = os.path.join('/Volumes/My Passport/monarda_fistulosa_segmentation/segmentation_results',
                             f"{base_name_no_ext}.json")

    try:
        with open(json_path, "r") as json_file:
            data = json.load(json_file)

        if "segmentation_mask" not in data:
            # this is the most common error -- basically corresponds to where
            # roboflow doesn't detect a flower
            raise ValueError("segmentation_mask key missing")

        seg_mask_base64 = data['segmentation_mask']
        mask_data = base64.b64decode(seg_mask_base64)
        mask_image = Image.open(BytesIO(mask_data))
        mask_np = np.array(mask_image)

        orig_img = cv2.imread(image_path)
        if orig_img is None:
            raise ValueError("Image not found or unable to load")

        # get color in rgb
        orig_img_rgb = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
        mask_resized = cv2.resize(mask_np, (orig_img_rgb.shape[1], orig_img_rgb.shape[0]), interpolation=cv2.INTER_NEAREST)
        flower_pixels = orig_img_rgb[mask_resized == 1]

        if flower_pixels.size == 0:
            raise ValueError("No flower pixels found")

        # convert flower pixels from rgb to lab for clustering
        flower_pixels_lab = cv2.cvtColor(flower_pixels.reshape(-1, 1, 3).astype(np.uint8), cv2.COLOR_RGB2LAB).reshape(-1, 3)

        # sample pixels if necessary (if there are a ton)
        if flower_pixels_lab.shape[0] > 10000:
            idx = np.random.choice(flower_pixels_lab.shape[0], 10000, replace=False)
            filtered_pixels = flower_pixels_lab[idx]
        else:
            filtered_pixels = flower_pixels_lab

        ### k-means clustering in Lab space ###
        kmeans = KMeans(n_clusters=3, random_state=42)
        kmeans.fit(filtered_pixels)
        centroids = kmeans.cluster_centers_
        labels = kmeans.labels_
        counts = np.bincount(labels)
        dominant_idx = np.argmax(counts)
        #dominant_lab = centroids[dominant_idx].astype(int)
        dominant_lab = [int(i) for i in centroids[dominant_idx]]

        # convert dominant color Lab -> RGB -> HEX -> HSL
        dominant_rgb = lab_to_rgb(dominant_lab)
        hex_color = rgb_to_hex(dominant_rgb)
        h, s, l = hex_to_hsl(hex_color)

        # append results
        results.append({
            "image_idx": int(base_name_no_ext),
            "hex": hex_color,
            "rgb": dominant_rgb,
            "hsl": (h, s, l),
            "lab": tuple(dominant_lab)
        })

    except Exception as e:
        print(f"{im}: FAILED ({e})")
        results.append({
            "image_idx": int(base_name_no_ext),
            "hex": "FAILED",
            "rgb": None,
            "hsl": None,
            "lab": None
        })

521.jpg: FAILED (segmentation_mask key missing)
644.jpg: FAILED (No flower pixels found)
821.jpg: FAILED (segmentation_mask key missing)
1175.jpg: FAILED (segmentation_mask key missing)
1689.jpg: FAILED (segmentation_mask key missing)
1965.jpg: FAILED (segmentation_mask key missing)
2008.jpg: FAILED (segmentation_mask key missing)
2543.jpg: FAILED (segmentation_mask key missing)
3293.jpg: FAILED (segmentation_mask key missing)
3316.jpg: FAILED (segmentation_mask key missing)
3365.jpg: FAILED (segmentation_mask key missing)
3447.jpg: FAILED (segmentation_mask key missing)
3454.jpg: FAILED (segmentation_mask key missing)
3535.jpg: FAILED (segmentation_mask key missing)
3601.jpg: FAILED (segmentation_mask key missing)
3684.jpg: FAILED (segmentation_mask key missing)
3686.jpg: FAILED (segmentation_mask key missing)
4086.jpg: FAILED (segmentation_mask key missing)
4124.jpg: FAILED (segmentation_mask key missing)
4420.jpg: FAILED (segmentation_mask key missing)
4523.jpg: FAILED (segmentation

In [6]:
# make df with color labels
color_df = pd.DataFrame(results)
# make sure it's sorted
color_df = color_df.sort_values(by="image_idx").reset_index(drop=True)
color_df

Unnamed: 0,image_idx,hex,rgb,hsl,lab
0,0,#eeddf6,"(238, 221, 246)","(0.7799999999999999, 0.5813953488372098, 0.915...","(230, 138, 118)"
1,2,#c7aac4,"(199, 170, 196)","(0.8505747126436781, 0.20567375886524827, 0.72...","(186, 143, 119)"
2,4,#bb9ed5,"(187, 158, 213)","(0.7545454545454545, 0.3956834532374102, 0.727...","(177, 149, 104)"
3,7,#9175a0,"(145, 117, 160)","(0.7751937984496124, 0.18454935622317595, 0.54...","(136, 147, 109)"
4,9,#b796e2,"(183, 150, 226)","(0.7390350877192983, 0.5671641791044776, 0.737...","(172, 155, 94)"
...,...,...,...,...,...
20756,41059,#b3a5d8,"(179, 165, 216)","(0.7124183006535948, 0.39534883720930225, 0.74...","(180, 143, 104)"
20757,41060,#ae66b9,"(174, 102, 185)","(0.8112449799196787, 0.3721973094170403, 0.562...","(138, 170, 96)"
20758,41061,#a88dc4,"(168, 141, 196)","(0.7484848484848484, 0.3179190751445088, 0.660...","(160, 149, 103)"
20759,41063,#baada2,"(186, 173, 162)","(0.076388888888889, 0.14814814814814817, 0.682...","(182, 131, 135)"


### match each idx + color codes with corresponding gbifID from gbif multimedia table

In [7]:
# get the observation id for each image
multimedia_table = pd.read_table('../raw_data/0002206-250218110819086/multimedia.txt',sep='\t')
gbifids = multimedia_table.gbifID.loc[list(color_df.image_idx)]

In [8]:
# add a column for observation id for each image
color_df['gbifID'] = list(gbifids)
color_df

Unnamed: 0,image_idx,hex,rgb,hsl,lab,gbifID
0,0,#eeddf6,"(238, 221, 246)","(0.7799999999999999, 0.5813953488372098, 0.915...","(230, 138, 118)",923911394
1,2,#c7aac4,"(199, 170, 196)","(0.8505747126436781, 0.20567375886524827, 0.72...","(186, 143, 119)",923910407
2,4,#bb9ed5,"(187, 158, 213)","(0.7545454545454545, 0.3956834532374102, 0.727...","(177, 149, 104)",899970365
3,7,#9175a0,"(145, 117, 160)","(0.7751937984496124, 0.18454935622317595, 0.54...","(136, 147, 109)",891778924
4,9,#b796e2,"(183, 150, 226)","(0.7390350877192983, 0.5671641791044776, 0.737...","(172, 155, 94)",891760719
...,...,...,...,...,...,...
20756,41059,#b3a5d8,"(179, 165, 216)","(0.7124183006535948, 0.39534883720930225, 0.74...","(180, 143, 104)",1024218211
20757,41060,#ae66b9,"(174, 102, 185)","(0.8112449799196787, 0.3721973094170403, 0.562...","(138, 170, 96)",1024202810
20758,41061,#a88dc4,"(168, 141, 196)","(0.7484848484848484, 0.3179190751445088, 0.660...","(160, 149, 103)",1024200169
20759,41063,#baada2,"(186, 173, 162)","(0.076388888888889, 0.14814814814814817, 0.682...","(182, 131, 135)",1024198670


### also add url column for easy querying later on

In [9]:
urls = multimedia_table.identifier.loc[list(color_df.image_idx)]

In [10]:
# add a column for observation id for each image
color_df['identifier'] = list(urls)
color_df

Unnamed: 0,image_idx,hex,rgb,hsl,lab,gbifID,identifier
0,0,#eeddf6,"(238, 221, 246)","(0.7799999999999999, 0.5813953488372098, 0.915...","(230, 138, 118)",923911394,https://inaturalist-open-data.s3.amazonaws.com...
1,2,#c7aac4,"(199, 170, 196)","(0.8505747126436781, 0.20567375886524827, 0.72...","(186, 143, 119)",923910407,https://inaturalist-open-data.s3.amazonaws.com...
2,4,#bb9ed5,"(187, 158, 213)","(0.7545454545454545, 0.3956834532374102, 0.727...","(177, 149, 104)",899970365,https://inaturalist-open-data.s3.amazonaws.com...
3,7,#9175a0,"(145, 117, 160)","(0.7751937984496124, 0.18454935622317595, 0.54...","(136, 147, 109)",891778924,https://inaturalist-open-data.s3.amazonaws.com...
4,9,#b796e2,"(183, 150, 226)","(0.7390350877192983, 0.5671641791044776, 0.737...","(172, 155, 94)",891760719,https://inaturalist-open-data.s3.amazonaws.com...
...,...,...,...,...,...,...,...
20756,41059,#b3a5d8,"(179, 165, 216)","(0.7124183006535948, 0.39534883720930225, 0.74...","(180, 143, 104)",1024218211,https://inaturalist-open-data.s3.amazonaws.com...
20757,41060,#ae66b9,"(174, 102, 185)","(0.8112449799196787, 0.3721973094170403, 0.562...","(138, 170, 96)",1024202810,https://inaturalist-open-data.s3.amazonaws.com...
20758,41061,#a88dc4,"(168, 141, 196)","(0.7484848484848484, 0.3179190751445088, 0.660...","(160, 149, 103)",1024200169,https://inaturalist-open-data.s3.amazonaws.com...
20759,41063,#baada2,"(186, 173, 162)","(0.076388888888889, 0.14814814814814817, 0.682...","(182, 131, 135)",1024198670,https://inaturalist-open-data.s3.amazonaws.com...


### get corresponding lat/lon info from gbif occurrence table

In [11]:
occ_table = pd.read_table('../raw_data/0002206-250218110819086/occurrence.txt')

  occ_table = pd.read_table('../raw_data/0002206-250218110819086/occurrence.txt')


In [12]:
lats = []
lons = []
for occid in color_df.gbifID:
    occ_subdf = occ_table[occ_table.gbifID.eq(occid)]
    lats.append(occ_subdf.decimalLatitude.iloc[0])
    lons.append(occ_subdf.decimalLongitude.iloc[0])

In [13]:
color_df['latitude'] = lats
color_df['longitude'] = lons

### remove failed rows

In [14]:
color_df = color_df[color_df['hex'] != "FAILED"]
color_df

Unnamed: 0,image_idx,hex,rgb,hsl,lab,gbifID,identifier,latitude,longitude
0,0,#eeddf6,"(238, 221, 246)","(0.7799999999999999, 0.5813953488372098, 0.915...","(230, 138, 118)",923911394,https://inaturalist-open-data.s3.amazonaws.com...,48.826305,-102.092171
1,2,#c7aac4,"(199, 170, 196)","(0.8505747126436781, 0.20567375886524827, 0.72...","(186, 143, 119)",923910407,https://inaturalist-open-data.s3.amazonaws.com...,43.613086,-73.057076
2,4,#bb9ed5,"(187, 158, 213)","(0.7545454545454545, 0.3956834532374102, 0.727...","(177, 149, 104)",899970365,https://inaturalist-open-data.s3.amazonaws.com...,43.066871,-87.890565
3,7,#9175a0,"(145, 117, 160)","(0.7751937984496124, 0.18454935622317595, 0.54...","(136, 147, 109)",891778924,https://inaturalist-open-data.s3.amazonaws.com...,42.140556,-87.831643
4,9,#b796e2,"(183, 150, 226)","(0.7390350877192983, 0.5671641791044776, 0.737...","(172, 155, 94)",891760719,https://inaturalist-open-data.s3.amazonaws.com...,38.679240,-97.990035
...,...,...,...,...,...,...,...,...,...
20756,41059,#b3a5d8,"(179, 165, 216)","(0.7124183006535948, 0.39534883720930225, 0.74...","(180, 143, 104)",1024218211,https://inaturalist-open-data.s3.amazonaws.com...,41.569419,-88.150552
20757,41060,#ae66b9,"(174, 102, 185)","(0.8112449799196787, 0.3721973094170403, 0.562...","(138, 170, 96)",1024202810,https://inaturalist-open-data.s3.amazonaws.com...,45.057871,-87.168277
20758,41061,#a88dc4,"(168, 141, 196)","(0.7484848484848484, 0.3179190751445088, 0.660...","(160, 149, 103)",1024200169,https://inaturalist-open-data.s3.amazonaws.com...,42.921738,-88.026752
20759,41063,#baada2,"(186, 173, 162)","(0.076388888888889, 0.14814814814814817, 0.682...","(182, 131, 135)",1024198670,https://inaturalist-open-data.s3.amazonaws.com...,40.791723,-80.492498


### this is the main dataframe. Write it to a csv

In [15]:
color_df.to_csv('../filtered_labeled_data.csv',index=False)