In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date

In [2]:
# Enable fiona's KML driver for reading and writing ('rw').
# The driver table is reached through the `fiona` module imported above rather
# than through `gpd.io.file.fiona`, which is a private geopandas attribute and
# is not guaranteed to hold the module before geopandas has used fiona itself.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
# Walk the 'Completed' tree and load every .kml file found. The directory that
# directly contains a file is recorded as the annotator ('Processed By') and the
# file name up to its first '.' as the plant uid.
# NOTE(review): the status sheet lower-cases 'Processed By' before the join,
# while directory names are used verbatim here - they presumably are already
# lower-case; confirm.
annotation_frames = []

for path, _subdirs, files in os.walk('Completed'):
    for filename in files:
        if not filename.endswith(".kml"):
            continue

        full_path = os.path.join(path, filename)
        # os.path.basename instead of splitting on "/" so this also works
        # where the path separator is not a forward slash
        annotator = os.path.basename(path)

        # Read file
        tmp = gpd.read_file(full_path, driver='KML')
        tmp['Processed By'] = annotator.strip()
        tmp['uid'] = filename.split(".")[0].strip()

        annotation_frames.append(tmp)

# DataFrame.append was removed in pandas 2.0; collect the per-file frames and
# concatenate once (this also avoids re-copying the accumulated frame per file).
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    # no .kml files found: keep the empty frame the loop used to start from
    df_annotations = pd.DataFrame()

In [4]:
# function to remove middle name from excel spreadsheet
def remove_middle_name(name):
    """Reduce a whitespace-separated name to two tokens.

    * three or more tokens -> first token + ' ' + third token
    * exactly two tokens   -> the two tokens joined by a single space
    * fewer than two       -> `name` returned unchanged (whitespace included)

    NOTE(review): with four or more tokens the *third* token is kept, not the
    last one - confirm this is what is wanted for such names.
    """
    tokens = name.split()
    if len(tokens) < 2:
        return name
    # index 2 when a middle token exists, otherwise the second (last) token
    surname = tokens[2] if len(tokens) > 2 else tokens[1]
    return ' '.join((tokens[0], surname))

In [5]:
# read in statuses
# One worksheet per annotator set; only the columns needed downstream are kept.
status_columns = ['uid','latitude','longitude','Date Processed','Processed By','status','notes']

sets=['Annotator 1', 'Annotator 2', 'Annotator 3']
status_frames = []
for s in sets:
    sheet = pd.read_excel("Cement Plant Annotation Worksheet.xlsx", sheet_name=s)
    status_frames.append(sheet[status_columns])

# DataFrame.append was removed in pandas 2.0; concatenate the sheets once instead
df_status = pd.concat(status_frames, ignore_index=True)

# convert to string (missing values become the literal string 'nan')
df_status['Processed By'] = df_status['Processed By'].astype(str)
df_status['uid'] = df_status['uid'].astype(str)

# remove middle name from the annotator names and lower-case them
df_status['Processed By'] = df_status['Processed By'].apply(remove_middle_name).str.lower()

# remove any leading/trailing spaces
df_status['uid'] = df_status['uid'].str.strip()
df_status['Processed By'] = df_status['Processed By'].str.strip()

# fix status: missing -> empty string, then lower-case
df_status['status'] = df_status['status'].fillna('').str.lower()

In [6]:
# join annotations and statuses
# Inner join: only (uid, annotator) pairs present in both tables are kept.
join_keys = ['uid', 'Processed By']
df = df_status.merge(df_annotations, how='inner', on=join_keys)

# write raw annotations to geojson
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.to_file("output/raw_annotations_set1.geojson", driver='GeoJSON')

### Add Predictions

In [7]:
# add geometry type (point, linestring, polygon)
# `geom_type` is used instead of the `.type` alias, which Shapely has deprecated
df['geom_type'] = df.geometry.apply(lambda geom: geom.geom_type)

# drop duplicates in case annotator accidentally exported the same annotation twice
df = df.drop_duplicates()

# defaults; the two labels and the kiln area are overwritten for every group
# below, the plant area only for groups with exactly one polygon
df['predicted_plant_type'] = ''
df['predicted_production_type'] = ''
df['predicted_plant_area_sq_m'] = 0.0
df['predicted_kiln_area_sq_m'] = 0.0

grouped = df.groupby(['uid', 'Processed By'])

for name, group in grouped:

    # rows of df that belong to this (uid, annotator) pair - built once and
    # reused for every assignment below
    rows = (df['uid'] == name[0]) & (df['Processed By'] == name[1])

    points = group[group['geom_type'] == 'Point']
    lines = group[group['geom_type'] == 'LineString']
    polygons = group[group['geom_type'] == 'Polygon']

    # add production type label: at least one point annotation -> 'Dry'
    df.loc[rows, 'predicted_production_type'] = 'Dry' if len(points) > 0 else 'Wet'

    # add plant type label: more than one line annotation -> 'Integrated'
    df.loc[rows, 'predicted_plant_type'] = 'Integrated' if len(lines) > 1 else 'Grinding'

    # calculate plant area - only when exactly one polygon was drawn;
    # otherwise the 0.0 default is left in place
    if len(polygons) == 1:
        # reproject_on_the_fly returns (geometry, crs); presumably the geometry
        # is in a metric projection so .area is in square metres - confirm
        g_utm, _ = reproject_on_the_fly(polygons.iloc[0].geometry)
        df.loc[rows, 'predicted_plant_area_sq_m'] = g_utm.area

    # calculate kiln area: lines are paired by their 'Name'; a name with exactly
    # two lines contributes the product of their projected lengths (presumably
    # one line per kiln dimension - confirm). Other names are ignored.
    total_area = 0.0
    for _, pair in lines.groupby(['uid', 'Processed By', 'Name']):
        if len(pair) == 2:
            g_utm1, _ = reproject_on_the_fly(pair.iloc[0].geometry)
            g_utm2, _ = reproject_on_the_fly(pair.iloc[1].geometry)
            total_area += g_utm1.length * g_utm2.length
    df.loc[rows, 'predicted_kiln_area_sq_m'] = total_area

### Consensus

In [8]:
# Collapse to one row per (plant, annotator) carrying that annotator's predictions
df = df[['uid', 'latitude', 'longitude', 'Processed By', 'predicted_plant_type', 'predicted_production_type', 'predicted_plant_area_sq_m', 'predicted_kiln_area_sq_m']].drop_duplicates()

grouped = df.groupby(['uid', 'latitude', 'longitude'])

output_columns = ['uid', 'latitude', 'longitude', 'num_labelers', 'plant_type', 'production_type',
                  'plant_area_mean', 'plant_area_median', 'plant_area_max',
                  'kiln_area_mean', 'kiln_area_median', 'kiln_area_max']


def _consensus_label(labels, description, plant_uid):
    """Return the unique most common label, or '' when there is none.

    `statistics.mode` only raises on a tie before Python 3.8; from 3.8 on it
    silently returns the first of the tied values. Ties are detected explicitly
    here so a split vote is always reported and left blank.

    labels      -- list of label strings, one per annotator
    description -- text used in the printed message (e.g. "plant type")
    plant_uid   -- plant identifier used in the printed message
    """
    modes = pd.Series(labels, dtype=object).mode()
    if len(modes) == 1:
        return modes.iloc[0]
    print("Can't find mode of {}".format(description), "({})".format(labels), "for plant", plant_uid)
    return ''


# Build the output from a list of records: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
records = []
for (plant_uid, latitude, longitude), group in grouped:
    plant_type = _consensus_label(group['predicted_plant_type'].tolist(), "plant type", plant_uid)
    production_type = _consensus_label(group['predicted_production_type'].tolist(), "production type", plant_uid)

    # NOTE(review): areas default to 0.0 when an annotator's shapes did not
    # qualify, so such zeros are included in mean/median - confirm intended.
    plant_areas = group['predicted_plant_area_sq_m'].tolist()
    kiln_areas = group['predicted_kiln_area_sq_m'].tolist()

    records.append({'uid': plant_uid,
                    'latitude': latitude,
                    'longitude': longitude,
                    'num_labelers': len(group),
                    'plant_type': plant_type,
                    'production_type': production_type,
                    'plant_area_mean': statistics.mean(plant_areas),
                    'plant_area_median': statistics.median(plant_areas),
                    'plant_area_max': max(plant_areas),
                    'kiln_area_mean': statistics.mean(kiln_areas),
                    'kiln_area_median': statistics.median(kiln_areas),
                    'kiln_area_max': max(kiln_areas)})

# `columns=` fixes the column order and keeps the columns when there are no groups
df_output = pd.DataFrame(records, columns=output_columns)
    

Can't find mode of plant type (['Integrated', 'Grinding']) for plant ZMB0002


In [9]:
# rename "uid" to "uid_v3"
df_output = df_output.rename(columns={'uid': 'uid_v3'})

# read in v4.1 dataset to add correct UIDs and reported capacity to dataframe
v4_columns = ['uid', 'latitude', 'longitude', 'reported_capacity']
new_uid_df = pd.read_csv("../../asset-datasets-v4p1/cement_dataset_v4.1.csv")[v4_columns]

# rename "uid" to "uid_v4.1"
new_uid_df = new_uid_df.rename(columns={'uid': 'uid_v4.1'})

# set number of decimals to 6 on both sides so the coordinates can act as join keys
for coord in ('latitude', 'longitude'):
    new_uid_df[coord] = new_uid_df[coord].round(6)
    df_output[coord] = df_output[coord].round(6)

# join datasets on lat/long; the outer join plus `indicator=True` adds a
# `_merge` column recording which side(s) each row came from
joined_df = df_output.merge(new_uid_df, how='outer', on=['latitude', 'longitude'], indicator=True)

In [10]:
# keep only rows matched on both sides of the lat/long join, restrict to the
# export columns, and write the aggregated annotations to CSV
matched = joined_df['_merge'] == 'both'
export_columns = ['uid_v3', 'uid_v4.1', 'latitude', 'longitude', 'reported_capacity',
                  'num_labelers', 'plant_type', 'production_type',
                  'plant_area_mean', 'plant_area_median', 'plant_area_max',
                  'kiln_area_mean', 'kiln_area_median', 'kiln_area_max']
joined_df = joined_df.loc[matched, export_columns]
joined_df.to_csv('output/aggregated_annotations_set1.csv', index=False)

In [11]:
# preview the first rows of the exported table; the row limit is raised only
# for the duration of this display call
preview = joined_df.head()
with pd.option_context('display.max_rows', 300):
    display(preview)

Unnamed: 0,uid_v3,uid_v4.1,latitude,longitude,reported_capacity,num_labelers,plant_type,production_type,plant_area_mean,plant_area_median,plant_area_max,kiln_area_mean,kiln_area_median,kiln_area_max
0,AGO0001,AGO0002,-12.342644,13.581766,0.35,3,Grinding,Wet,36269.85,35561.16,38132.39,92.443571,0.0,277.330714
1,ALB0003,ALB0001,41.549091,19.725338,1.4,3,Integrated,Dry,424918.9,405673.4,491371.8,313.077171,298.996568,348.919489
2,ARE0003,ARE0008,25.552944,56.226827,3.2,3,Integrated,Dry,1083715.0,1127529.0,1131933.0,650.92639,600.554569,756.156393
3,ARG0001,ARG0003,-36.982432,-60.243942,2.4,3,Integrated,Dry,699048.3,512173.4,1302721.0,373.217198,385.777426,394.531326
4,ARG0002,ARG0020,-32.862443,-66.850896,0.35,3,Integrated,Dry,603686.4,558810.0,724138.7,353.003363,346.905465,443.664794


In [12]:
# how many plants were labelled by each number of annotators
joined_df['num_labelers'].value_counts()

3    281
2      7
Name: num_labelers, dtype: int64