# Batch 3 - Iteration 2: Determine the annotators for the second iteration inter-annotator reliability measure

In [1]:
import json
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

tqdm.pandas()

In [54]:
# Paths
# OAK_FP is the project root, CF_FP the CloudFactory folder below it and
# OUTPUT_FP the folder that receives this notebook's output.
OAK_FP = '/oak/stanford/groups/deho/building_compliance/'
CF_FP = os.path.join(OAK_FP, 'CloudFactory')
OUTPUT_FP = os.path.join(os.path.join(CF_FP, 'images'), 'batch3_revise')

In [3]:
# Annotators: first name -> CloudFactory e-mail address
# NOTE: Nyambe was the reviewer for Batch 3 - part 3 parcels
annotator_dict = dict(
    Grace='grace.odemba@es.cloudfactory.com',
    Jackline='jackline.ogutu@es.cloudfactory.com',
    Joy='joy.mwende@es.cloudfactory.com',
    Nyambe='nyambe.mwangelwa@es.cloudfactory.com',
)

In [4]:
def get_parcel_annotators(json_file):
    """Return the non-Stanford annotators recorded in one parcel export.

    Args:
        json_file: Path to a parcel's exported annotation JSON. The file must
            contain ``metadata.system.taskStatusLog`` (entries with a
            ``status.creator``) and ``annotations`` (entries with ``creator``
            and ``updatedBy``).

    Returns:
        A tuple ``(initial, anyone)`` of sets of e-mail addresses:
        ``initial`` holds the creators found in the task status log, and
        ``anyone`` holds those plus every annotation creator / updater.
        Addresses containing ``@law.stanford`` are excluded from both.
    """
    with open(json_file, 'r') as f:
        export = json.load(f)

    # Creators recorded in the task status log
    status_log = export['metadata']['system']['taskStatusLog']
    log_creators = [entry['status']['creator'] for entry in status_log]

    # Everyone who created or last updated an annotation
    annotation_users = []
    for annotation in export['annotations']:
        annotation_users += [annotation['creator'], annotation['updatedBy']]

    # Drop Stanford accounts, then fold the log creators into the "any" set
    initial = {user for user in log_creators if '@law.stanford' not in user}
    anyone = initial | {user for user in annotation_users if '@law.stanford' not in user}

    return initial, anyone

In [5]:
def determine_iter2_annotator(iter1_annotators, batch, annotator_dict):
    """Return the annotators who may label a parcel in Iteration 2.

    Args:
        iter1_annotators: E-mail addresses of everyone who worked on the
            parcel in Iteration 1.
        batch: Name of the export batch the parcel came from.
        annotator_dict: Mapping of annotator name -> e-mail address.

    Returns:
        Set of e-mail addresses from ``annotator_dict`` that are not in
        ``iter1_annotators``. For ``'Batch3_part3'`` parcels Nyambe is also
        excluded, as they were the reviewer for most of those parcels.
    """
    if batch == 'Batch3_part3':
        reviewer_exclusions = {'nyambe.mwangelwa@es.cloudfactory.com'}
    else:
        reviewer_exclusions = set()

    everyone = set(annotator_dict.values())
    return everyone.difference(iter1_annotators, reviewer_exclusions)

In [60]:
def main(OAK_FP, CF_FP, OUTPUT_FP, annotator_dict):
    """Assign one Iteration 2 annotator to every sampled parcel and save it.

    Steps:
      1. Load the Iteration 2 sample (stratum -> APNs) from
         ``images/batch3_revise/sample.p`` under ``CF_FP``.
      2. For each APN, look for ``<apn>.json`` in every export batch folder
         under ``CF_FP/exports`` and record its Iteration 1 annotators.
      3. Derive the annotators still eligible for each parcel.
      4. Pick one annotator per APN while capping every annotator's load at
         ``even_split + 2`` parcels, print the resulting counts and write
         ``annotators.csv`` to ``OUTPUT_FP``.

    Args:
        OAK_FP: Project root path. Not used inside this function; kept so the
            call signature stays unchanged.
        CF_FP: CloudFactory folder holding ``exports/`` and the sample pickle.
        OUTPUT_FP: Folder that receives ``annotators.csv``.
        annotator_dict: Mapping of annotator name -> e-mail address.

    Returns:
        DataFrame with one row per APN and the columns ``APN`` and
        ``Annotator``.

    Raises:
        AssertionError: If the number of export records found differs from
            the number of sampled APNs, or if any parcel is left without an
            annotator.
    """
    # Iteration 2 APNs
    # NOTE(review): unpickling can execute arbitrary code; only load sample
    # files that come from a trusted location.
    iter2_apns = pd.read_pickle(os.path.join(CF_FP, 'images/batch3_revise/sample.p'))

    batches = ['Batch3_part1', 'Batch3_part2', 'Batch3_part3', 'Batch2', 'Batch1_100']

    # Collect Iteration 1 annotators. Single-row frames are gathered in a list
    # and concatenated once, instead of re-copying a growing frame per parcel.
    parcel_frames = []
    for _stratum, apns in tqdm(iter2_apns.items()):
        for apn in apns:
            for batch in batches:
                json_file = os.path.join(CF_FP, 'exports', batch, f"{apn}.json")
                try:
                    initial_an, any_an = get_parcel_annotators(json_file=json_file)
                except FileNotFoundError:
                    # This parcel has no export in this batch folder
                    continue
                parcel_frames.append(pd.DataFrame.from_dict(
                    {'APN': [apn], 'Batch': [batch], 'Initial': [initial_an], 'Any': [any_an]}))
    annotator_df = pd.concat(parcel_frames) if parcel_frames else pd.DataFrame()

    total_iter2 = (
        len(iter2_apns['positives'])
        + len(iter2_apns['negatives_sample'])
        + len(iter2_apns['true_negatives_sample'])
    )
    assert len(annotator_df) == total_iter2, (
        f'Found {len(annotator_df)} export records for {total_iter2} sampled APNs')

    # Define Iteration 2 possible annotators
    annotator_df['possible_annotators'] = annotator_df.apply(
        lambda row: determine_iter2_annotator(
            iter1_annotators=row['Any'], batch=row['Batch'], annotator_dict=annotator_dict),
        axis=1)

    # Final annotator
    np.random.seed(1)
    nyambe = 'nyambe.mwangelwa@es.cloudfactory.com'
    iter2_counts = {v: 0 for v in annotator_dict.values()}
    even_split = len(annotator_df) // len(annotator_dict)
    max_load = even_split + 2

    assignment_frames = []
    for apn in annotator_df['APN'].unique():
        apn_row = annotator_df.loc[annotator_df['APN'] == apn].iloc[0]

        # Even out annotator assignments: drop anyone already at the cap
        possible_annotators = {
            annotator for annotator in apn_row['possible_annotators']
            if iter2_counts[annotator] < max_load}

        if not possible_annotators:
            selected_annotator = None
        else:
            if nyambe in possible_annotators:
                # Nyambe is chosen whenever they are eligible
                selected_annotator = nyambe
            else:
                # Sort before sampling: the iteration order of a set of
                # strings changes between interpreter runs (hash
                # randomisation), so sampling from list(set) would not be
                # reproducible even with the fixed seed above.
                selected_annotator = np.random.choice(sorted(possible_annotators), 1)[0]

            # Update dict
            iter2_counts[selected_annotator] += 1

        assignment_frames.append(
            pd.DataFrame.from_dict({'APN': [apn], 'Annotator': [selected_annotator]}))

    iter2_annotator_df = pd.concat(assignment_frames) if assignment_frames else pd.DataFrame()

    # Check that all parcels have an annotator
    n_unassigned = iter2_annotator_df['Annotator'].isna().sum()
    assert n_unassigned == 0, f'{n_unassigned} parcels could not be assigned an annotator'

    # Annotator distribution
    print(iter2_counts)

    # Save
    iter2_annotator_df.to_csv(os.path.join(OUTPUT_FP, 'annotators.csv'))
    return iter2_annotator_df

In [61]:
if __name__ == '__main__':
    # Run the assignment with the paths and annotator mapping defined above
    iter2_annotator_df = main(
        OAK_FP=OAK_FP,
        CF_FP=CF_FP,
        OUTPUT_FP=OUTPUT_FP,
        annotator_dict=annotator_dict,
    )

100%|██████████| 3/3 [00:02<00:00,  1.22it/s]


{'grace.odemba@es.cloudfactory.com': 325, 'jackline.ogutu@es.cloudfactory.com': 326, 'joy.mwende@es.cloudfactory.com': 329, 'nyambe.mwangelwa@es.cloudfactory.com': 329}


In [63]:
# Number of parcels (non-null APNs) assigned to each Iteration 2 annotator
iter2_annotator_df.groupby('Annotator')['APN'].count()

Annotator
grace.odemba@es.cloudfactory.com        325
jackline.ogutu@es.cloudfactory.com      326
joy.mwende@es.cloudfactory.com          329
nyambe.mwangelwa@es.cloudfactory.com    329
Name: APN, dtype: int64