In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date

In [2]:
# Enable fiona driver
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
df_annotations = pd.DataFrame()

for path, subdirs, files in os.walk('Completed'):
    for name in files:
        if name.endswith(".kml"): 
            full_path = os.path.join(path, name)
            name = full_path.split("/")[-2]
            plant = full_path.split("/")[-1]
            
            # Read file
            tmp = gpd.read_file(full_path, driver='KML')               
            tmp['Processed by'] = name.strip()
            tmp['tile_id'] = plant.split(".")[0].strip()

            df_annotations = df_annotations.append(tmp, ignore_index=True)

In [4]:
# read in statuses
df_status = pd.DataFrame()

sets=['Annotator 1', 'Annotator 2', 'Annotator 3']
for s in sets:
    tmp = pd.read_excel("Cement Plant Annotation - Group 3.xlsx", sheet_name=s)
    tmp = tmp[['tile_id','Date Processed','Processed by','Status','Notes']]
    
    df_status = df_status.append(tmp, ignore_index=True)
    
# convert to string
df_status['Processed by'] = df_status['Processed by'].astype(str)
df_status['tile_id'] = df_status['tile_id'].astype(str)

# to lower case
df_status['Processed by'] = df_status['Processed by'].str.lower()

# remove any leading/trailing spaces
df_status['tile_id'] = df_status['tile_id'].str.strip()
df_status['Processed by'] = df_status['Processed by'].str.strip()

# fix status
df_status.Status = df_status.Status.fillna('')
df_status.Status = df_status.Status.str.lower()
df_status.Status = df_status.Status.str.strip()

In [5]:
# join annotations and statuses
df = pd.merge(df_status, df_annotations, how='inner', left_on=['tile_id', 'Processed by'], right_on=['tile_id', 'Processed by'])

# write raw annotations to geojson
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.to_file("output/raw_annotations_set3.geojson", driver='GeoJSON')

### Add Predictions

In [6]:
# add geometry type (point, linestring, polygon)
df['geom_type'] = df.geometry.apply(lambda x: x.type)

# drop duplicates in case annotator accidentally exported the same annotation twice
df = df.drop_duplicates()

df['predicted_plant_type'] = ''
df['predicted_production_type'] = ''
df['predicted_plant_area_sq_m'] = 0.0
df['predicted_kiln_area_sq_m'] = 0.0

grouped = df.groupby(['tile_id', 'Processed by'])

for name, group in grouped:
    
    # add production type label
    if len(group[group.geom_type == 'Point']) > 0:
        df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_production_type'] = 'Dry' 
    else:
        df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_production_type'] = 'Wet'
        
    # add plant type label
    if len(group[group.geom_type == 'LineString']) > 1:
        df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_plant_type'] = 'Integrated' 
    else:
        df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_plant_type'] = 'Grinding' 
        
    # calculate plant area
    if len(group[group.geom_type == 'Polygon']) == 1:
        g_utm, crs = reproject_on_the_fly(group[group.geom_type == 'Polygon'].iloc[0].geometry)
        df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_plant_area_sq_m'] = g_utm.area 
        
    # calculate kiln area
    grouped2 = group[group.geom_type == 'LineString'].groupby(['tile_id', 'Processed by', 'Name'])
    total_area = 0.0
    for name2, group2 in grouped2:
        if len(group2) == 2:
            g_utm1, crs1 = reproject_on_the_fly(group2.iloc[0].geometry)
            g_utm2, crs2 = reproject_on_the_fly(group2.iloc[1].geometry)
            total_area += g_utm1.length * g_utm2.length 
    df.loc[(df['tile_id'] == name[0]) & (df['Processed by'] == name[1]), 'predicted_kiln_area_sq_m'] = total_area 

### Consensus

In [7]:
df = df[['tile_id', 'Processed by', 'predicted_plant_type', 'predicted_production_type', 'predicted_plant_area_sq_m', 'predicted_kiln_area_sq_m']].drop_duplicates()

grouped = df.groupby(['tile_id'])

df_output = pd.DataFrame(columns=['tile_id', 'num_labelers', 'plant_type', 'production_type',
                                  'plant_area_mean', 'plant_area_median', 'plant_area_max', 
                                  'kiln_area_mean', 'kiln_area_median', 'kiln_area_max'])

for name, group in grouped:  
    try:
        plant_type = statistics.mode(list(group.predicted_plant_type.values))
    except:        
        plant_type = ''
        print("Can't find mode of plant type", "({})".format(list(group.predicted_plant_type.values)), "for plant", name[0])

    try:
        production_type = statistics.mode(list(group.predicted_production_type.values))
    except:        
        production_type = ''
        print("Can't find mode of production type", "({})".format(list(group.predicted_production_type.values)), "for plant", name[0])


    plant_area_mean = statistics.mean(list(group.predicted_plant_area_sq_m.values))
    plant_area_median = statistics.median(list(group.predicted_plant_area_sq_m.values))
    plant_area_max = max(list(group.predicted_plant_area_sq_m.values))

    kiln_area_mean = statistics.mean(list(group.predicted_kiln_area_sq_m.values))
    kiln_area_median = statistics.median(list(group.predicted_kiln_area_sq_m.values))
    kiln_area_max = max(list(group.predicted_kiln_area_sq_m.values))

    df_output = df_output.append({'tile_id': name,
                                  'num_labelers': len(group),
                                  'plant_type': plant_type, 
                                  'production_type': production_type,
                                  'plant_area_mean': plant_area_mean, 
                                  'plant_area_median': plant_area_median, 
                                  'plant_area_max': plant_area_max, 
                                  'kiln_area_mean': kiln_area_mean, 
                                  'kiln_area_median': kiln_area_median, 
                                  'kiln_area_max': kiln_area_max}, ignore_index=True)
    

Can't find mode of plant type (['Integrated', 'Grinding']) for plant M
Can't find mode of production type (['Dry', 'Wet']) for plant M
Can't find mode of plant type (['Integrated', 'Grinding']) for plant M
Can't find mode of production type (['Dry', 'Wet']) for plant M
Can't find mode of plant type (['Integrated', 'Grinding']) for plant M
Can't find mode of production type (['Dry', 'Wet']) for plant M
Can't find mode of production type (['Wet', 'Dry']) for plant M
Can't find mode of production type (['Wet', 'Dry']) for plant M
Can't find mode of plant type (['Grinding', 'Integrated']) for plant M
Can't find mode of production type (['Wet', 'Dry']) for plant M
Can't find mode of plant type (['Grinding', 'Integrated']) for plant M
Can't find mode of production type (['Wet', 'Dry']) for plant M
Can't find mode of plant type (['Integrated', 'Grinding']) for plant M
Can't find mode of production type (['Dry', 'Wet']) for plant M
Can't find mode of production type (['Dry', 'Wet']) for plant 

In [8]:
df_output.head()

Unnamed: 0,tile_id,num_labelers,plant_type,production_type,plant_area_mean,plant_area_median,plant_area_max,kiln_area_mean,kiln_area_median,kiln_area_max
0,MGRS-43SED-0067-2020-01,3,Integrated,Dry,357751.987128,402881.453013,412888.171408,552.943222,507.159947,680.312082
1,MGRS-43SED-0138-2020-01,3,Integrated,Dry,222490.738315,304748.717347,321521.149827,195.362496,216.462809,219.853482
2,MGRS-43SED-0191-2020-03,3,Integrated,Dry,449422.371871,433954.619988,499705.930508,238.487064,231.620766,304.293475
3,MGRS-43SFC-0029-2020-01,3,Integrated,Dry,380902.406522,380033.864829,396630.182362,375.903044,369.81207,581.661427
4,MGRS-43SGB-0043-2020-01,3,Integrated,Dry,385362.92829,385862.791525,461855.038773,264.028944,262.407658,346.844135


In [9]:
df_output.to_csv('output/aggregated_annotations_set3.csv', index=False)

In [10]:
df_output.num_labelers.value_counts()

3    710
1    311
2    132
Name: num_labelers, dtype: int64