Analysis of Consensus of image labels generated by USFWS Biologists from Bosque del Apache and Maxwell NWRs
Rowan Converse
Start Date: 2023 - 03 - 07

In [2]:
#Imports
import pandas as pd
import numpy as np
import ast

import sklearn.metrics

from shapely.geometry import Polygon,Point
import matplotlib.pyplot as plt
import shapely
import cv2 as cv
import os
import gc

In [3]:
#Data Loading

#Originals 
orig_path = "C:/Users/rowanconverse/OneDrive - University of New Mexico/CV4Ecology/Prototyping/Data/Labels/coco/labelbox.csv"
with open(orig_path) as f:
  originals = pd.read_csv(f)
  originals = originals[originals["filename"] != "BDA_24C_20181107_1.JPG"]
  originals['bbox'] = originals['bbox'].apply(ast.literal_eval)

#Analysis annotations
path = "C:/Users/rowanconverse/OneDrive - University of New Mexico/Dissertation/1_Chapter/consensus/data/expert/20230307_expertanalysislabels_spponly.csv"
with open(path) as f:
  df = pd.read_csv(f)
  df = df[df["filename"] != "BDA_24C_20181107_1.JPG"]
  #df['bbox_orig'] = df['bbox_orig'].apply(ast.literal_eval)
  #df['bbox_refined'] = df['bbox_refined'].apply(ast.literal_eval)

#Consensus annotations
refinedpath = "C:/Users/rowanconverse/OneDrive - University of New Mexico/Dissertation/1_Chapter/consensus/data/expert/expertconsensus_superclass.csv"
with open(refinedpath) as f:
  ref = pd.read_csv(f)
  ref['bbox'] = ref['bbox'].apply(ast.literal_eval)


In [3]:
#Calculate area of bounding boxes
def calc_area(row):
    bbox = row['bbox']
    xmin, ymin, w, h = bbox
    return w * h

ref['area'] = ref.apply(calc_area, axis=1)

In [4]:
#Determine average area of bounding box per class
ref.groupby("class_id")["area"].mean()

class_id
1    16238.500000
2     3719.692857
3     2598.333601
Name: area, dtype: float64

In [3]:
#Calculating IOU for each individual box in the dataset to make a per-label score
from shapely.geometry import box

def eval_bbox(row, col_name):
    bbox_str = row[col_name]
    if pd.notnull(bbox_str):
        bbox = np.array(ast.literal_eval(bbox_str))
        bbox = bbox.astype(float)
    else:
        bbox = np.array([np.nan, np.nan, np.nan, np.nan])
    return bbox


# Define a function to calculate the IOU only if both bounding boxes are non-null
def calculate_iou(row):
    bbox_orig = eval_bbox(row, 'bbox_orig')
    bbox_ref = eval_bbox(row, 'bbox_refined')
    if np.isnan(bbox_orig[0]) or np.isnan(bbox_orig[1]) or np.isnan(bbox_orig[2]) or np.isnan(bbox_orig[3]) or \
        np.isnan(bbox_ref[0]) or np.isnan(bbox_ref[1]) or np.isnan(bbox_ref[2]) or np.isnan(bbox_ref[3]):
        iou = None
    else:
        bbox_orig = box(bbox_orig[0], bbox_orig[1], bbox_orig[0] + bbox_orig[2], bbox_orig[1] + bbox_orig[3])
        bbox_ref = box(bbox_ref[0], bbox_ref[1], bbox_ref[0] + bbox_ref[2], bbox_ref[1] + bbox_ref[3])
        iou = bbox_orig.intersection(bbox_ref).area / bbox_orig.union(bbox_ref).area
    return iou

# Apply the function to each row of the DataFrame and save the results in a new column
df['IOU'] = df.apply(calculate_iou, axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,annotation_ID,bbox_orig,filename,labeler,cat_orig,cluster_id,cat_refined,bbox_refined,IOU
0,0,1,"[4428, 2707, 125, 103]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,0,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",0.593981
1,1,2,"[4308, 2731, 105, 67]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,1,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",0.612935
2,2,3,"[3707, 1761, 110, 101]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,2,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",0.466404
3,3,4,"[3628, 1882, 90, 38]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,3,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",0.978261
4,4,5,"[3669, 1927, 69, 82]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,4,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",0.753031


In [8]:
#Overall average IOU of individual bboxes compared with corresponding consensus box
df["IOU"].mean()

0.6380874298901835

In [3]:
#map superclasses
mapping = {'Canadian Goose': 'Goose',
           'Sandhill Crane': 'Crane',
           'Mallard': 'Duck',
           'Northern Pintail': 'Duck',
           'American Wigeon': 'Duck',
           'Ringneck': 'Duck',
           "Ruddy": 'Duck',
           "Readhead": "Duck",
           "Snow Goose": "Goose",
           'Other': 'Other',
           'Teal': 'Duck',
           'Gadwall': 'Duck',
           'Northern Shoveler': 'Duck'}
df["orig_superclass"] = df["cat_orig"].map(mapping)
df.head()

#map superclasses
mapping = {'Canadian Goose': 'Goose',
           'Sandhill Crane': 'Crane',
           'Mallard': 'Duck',
           'Northern Pintail': 'Duck',
           'American Wigeon': 'Duck',
           'Ringneck': 'Duck',
           "Ruddy": 'Duck',
           "Readhead": "Duck",
           "Snow Goose": "Goose",
           'Other': 'Other',
           'Teal': 'Duck',
           'Gadwall': 'Duck',
           'Northern Shoveler': 'Duck'}
df["ref_superclass"] = df["cat_refined"].map(mapping)
df.head()

Unnamed: 0.1,Unnamed: 0,annotation_ID,bbox_orig,filename,labeler,cat_orig,cluster_id,cat_refined,bbox_refined,orig_superclass,ref_superclass
0,0,1,"[4428, 2707, 125, 103]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,0,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",Goose,Goose
1,1,2,"[4308, 2731, 105, 67]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,1,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",Goose,Goose
2,2,3,"[3707, 1761, 110, 101]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,2,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",Goose,Goose
3,3,4,"[3628, 1882, 90, 38]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,3,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",Goose,Goose
4,4,5,"[3669, 1927, 69, 82]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,4,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",Goose,Goose


In [7]:
#IOU per class
df.groupby("cat_refined")["IOU"].mean()

cat_refined
American Wigeon      0.667285
Canadian Goose       0.616649
Gadwall              0.665799
Mallard              0.630587
Northern Pintail     0.685820
Northern Shoveler    0.641286
Other                0.621356
Sandhill Crane       0.683264
Teal                 0.693175
Name: IOU, dtype: float64

In [4]:
#CALCULATING PIELOU'S INDEX-- 

# Group the dataframe by image, then by cluster
grouped = df.groupby(['filename', 'cluster_id'])

# Create empty lists to store the results
cluster_id_list = []
filename_list = []
consensus_class_id_list = []
class_count_list = []
pielou_index_list = []

# Loop through each group and calculate Pielou's evenness index
for name, group in grouped:
    # Get the cluster ID, filename, and consensus class ID for this group
    cluster_id = name[1]
    filename = name[0]
    #ADJUST LINE BELOW FOR SPP VS SUPERCLASS
    consensus_class_id = group['ref_superclass'].iloc[0]  # Assumes all consensus IDs in the group are the same
    
    # Count the number of annotations in the group
    num_annotations = len(group)
    
    # Count the number of annotations for each original class ID (ADJUST HERE FOR SPP VS SUPERCLASS)
    class_counts = group.groupby('orig_superclass').size().values

    #NEW CODE TO RE-SCORE AGREEMENT AS ZERO BELOW:
    if len(class_counts) == 1:
        evenness_index = 0
    else:
        # Calculate the relative abundance of each original class ID
        relative_abundance = class_counts / num_annotations
    
        # Calculate the evenness index using Pielou's formula
        evenness_index = -np.sum(relative_abundance * np.log(relative_abundance)) / np.log(len(relative_abundance))
    
    # Calculate the relative abundance of each original class ID
    #relative_abundance = class_counts / num_annotations
    
    # Calculate the evenness index using Pielou's formula
    #evenness_index = -np.sum(relative_abundance * np.log(relative_abundance)) / np.log(len(relative_abundance))
    
    # Append the results to the lists
    cluster_id_list.append(cluster_id)
    filename_list.append(filename)
    consensus_class_id_list.append(consensus_class_id)
    agreement = class_count_list.append(class_counts)
    pielou_index_list.append(evenness_index)

# Create a new dataframe with the results
pielou = pd.DataFrame({
    'cluster_id': cluster_id_list,
    'filename': filename_list,
    'consensus_class_ID': consensus_class_id_list,
    'agreement': class_count_list,
    'pielou_index': pielou_index_list
})

In [6]:
#Average Pielou Index per superclass
print(pielou.groupby("consensus_class_ID")["pielou_index"].mean())
print(pielou.groupby("consensus_class_ID")["pielou_index"].std())

consensus_class_ID
Crane    0.009678
Duck     0.252757
Goose    0.240262
Other    0.968260
Name: pielou_index, dtype: float64
consensus_class_ID
Crane    0.069789
Duck     0.321407
Goose    0.263414
Other    0.118706
Name: pielou_index, dtype: float64


In [8]:
print(pielou["pielou_index"].mean())
print(pielou["pielou_index"].std())

0.29532459343672596
0.35145770480458194


In [19]:
#Average Pielou index per species
pielou.groupby("consensus_class_ID")["pielou_index"].mean()

consensus_class_ID
American Wigeon      0.834838
Canadian Goose       0.248050
Gadwall              0.913858
Mallard              0.432168
Northern Pintail     0.480891
Northern Shoveler    0.923912
Other                0.903201
Sandhill Crane       0.009678
Teal                 0.890206
Name: pielou_index, dtype: float64

In [9]:
#Count of overall labels per labeler
df.groupby(["labeler"])["bbox_orig"].count()

labeler
andrew_stetter@fws.gov             1145
barry_wilson@fws.gov               2315
bill_johnson@fws.gov               1983
dan_collins@fws.gov                1395
david.butler@tpwd.texas.gov        2747
jeff_sanchez@fws.gov               1791
jena_moon@fws.gov                  1076
john_vradenburg@fws.gov            1426
josh_vest@fws.gov                   508
jude_smith@fws.gov                 1844
kammie_kruse@fws.gov                960
mbrasher@ducks.org                  485
ronald_deroche@fws.gov              485
stephen.mcdowell@tpwd.texas.gov    1096
steven_sesnie@fws.gov                68
Name: bbox_orig, dtype: int64

In [10]:
#Count of consensus labels per class
ref.groupby(["category"])["bbox"].count()

category
American Wigeon                                                         22
Canadian Goose                                                         140
Gadwall                                                                  5
Mallard                                                               1688
Northern Pintail                                                       262
Northern Shoveler                                                        2
Other                                                                   70
Sandhill Crane                                                          52
Teal                                                                     2
['American Wigeon' 'Gadwall' 'Mallard' 'Northern Pintail' 'Other']       1
['American Wigeon' 'Gadwall' 'Mallard' 'Other' 'Teal']                   1
['American Wigeon' 'Gadwall' 'Teal']                                     2
['American Wigeon' 'Mallard' 'Northern Pintail' 'Other' 'Teal']          9
['American Wigeo

In [20]:
#Previous work not saved in the above dataframe-- mapping superclasses for calculating individual agreement with consensus per class
class_dict = {'Canadian Goose': "Goose",
 'Sandhill Crane': "Crane",
 'Mallard':"Duck",
 'Northern Pintail': "Duck",
 'Northern Shoveler': "Duck",
 'Teal': "Duck",
 'American Wigeon': "Duck",
 'Gadwall': "Duck",
 'Ringneck': "Duck",
 'Ruddy': "Duck",
 'Readhead': "Duck",
 'Other': "Other",
 'Snow Goose': "Goose"}

 df["class_refined"] = df["cat_refined"].map(class_dict)

In [11]:
#Creating new column, calculating agreement per spp class
df['agree'] = 0
df.loc[df['cat_orig'] == df["cat_refined"], 'agree'] = 1
df.head()

Unnamed: 0.1,Unnamed: 0,annotation_ID,bbox_orig,filename,labeler,cat_orig,cluster_id,cat_refined,bbox_refined,agree
0,0,1,"[4428, 2707, 125, 103]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,0,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",1
1,1,2,"[4308, 2731, 105, 67]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,1,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",1
2,2,3,"[3707, 1761, 110, 101]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,2,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",1
3,3,4,"[3628, 1882, 90, 38]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,3,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",1
4,4,5,"[3669, 1927, 69, 82]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,4,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",1


In [24]:
#Creating new column, calculating agreement per morphological class
df['super_agree'] = 'No'
df.loc[df['orig_superclass'] == df["ref_superclass"], 'super_agree'] = 'Yes'
df.head()

Unnamed: 0.1,Unnamed: 0,annotation_ID,bbox_orig,filename,labeler,cat_orig,cluster_id,cat_refined,bbox_refined,orig_superclass,ref_superclass,agree,super_agree
0,0,1,"[4428, 2707, 125, 103]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,0,Canadian Goose,"[4445.5, 2719.5, 95.0, 80.5]",Goose,Goose,Yes,Yes
1,1,2,"[4308, 2731, 105, 67]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,1,Canadian Goose,"[4312.5, 2739.5, 98.0, 44.0]",Goose,Goose,Yes,Yes
2,2,3,"[3707, 1761, 110, 101]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,2,Canadian Goose,"[3725.5, 1779.0, 73.5, 70.5]",Goose,Goose,Yes,Yes
3,3,4,"[3628, 1882, 90, 38]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,3,Canadian Goose,"[3628.0, 1882.0, 92.0, 38.0]",Goose,Goose,Yes,Yes
4,4,5,"[3669, 1927, 69, 82]",BDA_12C_20181127_1.JPG,steven_sesnie@fws.gov,Canadian Goose,4,Canadian Goose,"[3679.0, 1929.0, 65.0, 82.0]",Goose,Goose,Yes,Yes


In [13]:
# Group by species class and calculate the average and standard deviation of "agree" column
#df["agree"] = df["agree"].map({"Yes": 1, "No": 0})

# Group by species class and calculate the average and standard deviation of "agree" column
grouped_data = df.groupby("cat_refined")["agree"].agg(["mean", "std"]).reset_index()
print(df['agree'].mean())
print(df['agree'].std())

# Rename the columns for clarity
#grouped_data.rename(columns={"mean": "average_agree", "std": "std_deviation"}, inplace=True)


0.7393396812254192
0.43900625327809106


In [14]:
print(grouped_data)

         cat_refined      mean       std
0    American Wigeon  0.564885  0.497675
1     Canadian Goose  0.937972  0.241308
2            Gadwall  0.435897  0.502356
3            Mallard  0.831399  0.374414
4   Northern Pintail  0.817820  0.386089
5  Northern Shoveler  0.461538  0.518875
6              Other  0.471888  0.499711
7     Sandhill Crane  0.997738  0.047565
8               Teal  0.538462  0.518875


In [27]:
# Group by species class and calculate the average and standard deviation of "agree" column
df["super_agree"] = df["super_agree"].map({"Yes": 1, "No": 0})

# Group by species class and calculate the average and standard deviation of "agree" column
grouped_data = df.groupby("ref_superclass")["super_agree"].agg(["mean", "std"]).reset_index()
grouped_data

Unnamed: 0,ref_superclass,mean,std
0,Crane,0.997738,0.047565
1,Duck,0.929738,0.255597
2,Goose,0.939648,0.238238
3,Other,0.471888,0.499711


In [5]:
# Group by species class and calculate the average and standard deviation of "agree" column
grouped_data = df.groupby("cat_refined")["agree"].agg(["mean", "std"]).reset_index()
print(df['agree'].mean())
print(df['agree'].std())

# Group by species class and calculate the average and standard deviation of "agree" column
grouped_data = df.groupby("ref_superclass")["agree"].agg(["mean", "std"]).reset_index()
print(grouped_data)

TypeError: Could not convert YesYesNoNoNoYesYesYesYesNoYesNoYesYesYesYesYesNoNoNoNoNoNoNoYesYesYesYesYesYesYesNoNoNoYesNoNoNoNoNoYesYesYesYesYesNoNoNoNoNoNoNoNoNoNoNoYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesYesNoNoYesYesYesYesNoNoNoNoNoNoNoNoNoNoYesYesYesYesYesYesNoNoNoNoNoNoNoNoNoNoNoYesYesNoYesNoYesYesYesYesYesYesYesYesNoYes to numeric

In [43]:
#Saving as a dataframe
agreement = df.groupby(["labeler","class_refined"])["agree"].value_counts()

In [50]:
#Saving as a CSV
savepath = "/Users/rowanconverse/Library/CloudStorage/OneDrive-UniversityofNewMexico/Dissertation/1_Chapter/consensus/data/expert/"
agreement.to_csv(savepath+"agreementsuperclass.csv")

In [8]:
#Counting number of dropped annotations that were not matched with a cluster
missing = df[df["cluster_id"] == -1]
len(missing)

1238

In [37]:
savepath = "/Users/rowanconverse/Library/CloudStorage/OneDrive-UniversityofNewMexico/Dissertation/1_Chapter/consensus/data/expert/"
#exportanalysis = datetime.now().strftime('%Y%m%d_zooniverseanalysislabels_seagull_nodrops.csv')
clusters.to_csv(path+"clustercheck.csv")

In [12]:
#Labeler reliability: overall agreement with consensus
df['agree'] = 'No'
df.loc[df['cat_orig'] == df["cat_refined"], 'agree'] = 'Yes'
df.groupby("labeler")["agree"].value_counts()

labeler                          agree
andrew_stetter@fws.gov           Yes       949
                                 No        196
barry_wilson@fws.gov             Yes      1669
                                 No        646
bill_johnson@fws.gov             Yes      1497
                                 No        486
dan_collins@fws.gov              Yes      1069
                                 No        326
david.butler@tpwd.texas.gov      Yes      1873
                                 No        874
jeff_sanchez@fws.gov             Yes      1460
                                 No        331
jena_moon@fws.gov                No        590
                                 Yes       486
john_vradenburg@fws.gov          Yes      1205
                                 No        221
josh_vest@fws.gov                Yes       497
                                 No         11
jude_smith@fws.gov               No       1072
                                 Yes       772
kammie_kruse@fws.gov 

In [40]:
#Range of expert counts per image
range = df.groupby(['filename', 'labeler']).size()
r = range.reset_index()

r.rename( columns={0 :'count'}, inplace=True )
r.groupby("filename")["count"].apply(minmax)

filename
BDA_12C_20181127_1.JPG        (68, 93)
BDA_12C_20181127_2.JPG      (288, 570)
BDA_12C_20181127_3.JPG     (220, 1043)
BDA_18A4_20181106_1.JPG     (159, 165)
BDA_18A4_20181106_2.JPG     (300, 367)
BDA_18A4_20181106_3.JPG       (62, 66)
BDA_18A4_20181106_4.JPG     (113, 117)
BDA_18A4_20181107_1.JPG       (75, 83)
BDA_18A4_20181107_2.JPG       (26, 36)
BDA_18A4_20181107_3.JPG       (88, 91)
BDA_18A4_20181107_4.JPG       (83, 86)
mxw_L13_20181215_1.JPG        (38, 39)
Name: count, dtype: object