In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date

In [2]:
# Register fiona's KML driver for reading and writing.
# Use the `fiona` module imported at the top of the notebook rather than the
# private `gpd.io.file.fiona` attribute: newer geopandas releases import fiona
# lazily, so that attribute may not be populated on a fresh kernel.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# Read every KML export found under Completed/.
# The parent directory of each file names the annotator ('Processed By') and
# the file name up to its first "." is the plant uid.
annotation_frames = []

for path, subdirs, files in os.walk('Completed'):
    for filename in files:
        if not filename.endswith(".kml"):
            continue

        kml = gpd.read_file(os.path.join(path, filename), driver='KML')
        # os.path.basename avoids hard-coding "/" as the path separator.
        kml['Processed By'] = os.path.basename(path).strip()
        kml['uid'] = filename.split(".")[0].strip()
        annotation_frames.append(kml)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    # No KML files found: keep the original behaviour of an empty frame.
    df_annotations = pd.DataFrame()

In [4]:
# Read the per-annotator status sheets and stack them into one table.
STATUS_COLUMNS = ['uid','latitude','longitude','Date Processed','Processed By','status','notes']
sets=['Annotator 1', 'Annotator 2', 'Annotator 3']

# Build the frames first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
status_frames = [
    pd.read_excel("cement_dataset_v4.1.xlsx", sheet_name=s)[STATUS_COLUMNS]
    for s in sets
]
df_status = pd.concat(status_frames, ignore_index=True)

# Normalise the join keys: string dtype, no surrounding whitespace, and a
# lower-cased annotator name.
# NOTE(review): astype(str) turns missing values into the literal 'nan'.
df_status['Processed By'] = df_status['Processed By'].astype(str).str.lower().str.strip()
df_status['uid'] = df_status['uid'].astype(str).str.strip()

# Normalise status: missing -> '', lower case, no surrounding whitespace.
df_status['status'] = df_status['status'].fillna('').str.lower().str.strip()

In [5]:
# Pair every status row with the annotations exported by the same annotator
# for the same plant; only (uid, annotator) pairs found in both tables survive.
join_keys = ['uid', 'Processed By']
df = df_status.merge(df_annotations, how='inner', on=join_keys)

# GeoDataFrame of the raw joined annotations (GeoJSON export left disabled)
gdf = gpd.GeoDataFrame(df, geometry='geometry')
# gdf.to_file("output/raw_annotations_set2.geojson", driver='GeoJSON')

### Add Predictions

In [6]:
# Geometry type of every annotation (Point, LineString, Polygon).
# `geom_type` is used because shapely 2.0 deprecated the `.type` alias.
df['geom_type'] = df['geometry'].apply(lambda geom: geom.geom_type)

# drop duplicates in case annotator accidentally exported the same annotation twice
df = df.drop_duplicates()

# Defaults; every (uid, annotator) group overwrites them below. The plant area
# stays 0.0 when the group does not contain exactly one polygon.
df['predicted_plant_type'] = ''
df['predicted_production_type'] = ''
df['predicted_plant_area_sq_m'] = 0.0
df['predicted_kiln_area_sq_m'] = 0.0

grouped = df.groupby(['uid', 'Processed By'])

for name, group in grouped:
    # `group.index` labels exactly the rows of df with this uid and annotator,
    # so the results can be written back without rescanning the whole frame.
    rows = group.index

    geom_types = group['geom_type']
    lines = group[geom_types == 'LineString']
    polygons = group[geom_types == 'Polygon']

    # Production type: any Point annotation -> 'Dry', otherwise 'Wet'.
    has_point = (geom_types == 'Point').any()
    df.loc[rows, 'predicted_production_type'] = 'Dry' if has_point else 'Wet'

    # Plant type: more than one LineString -> 'Integrated', otherwise 'Grinding'.
    df.loc[rows, 'predicted_plant_type'] = 'Integrated' if len(lines) > 1 else 'Grinding'

    # Plant area: only computed when exactly one polygon was drawn.
    # NOTE(review): presumably square metres after reprojection to UTM -- confirm
    # against earthai's reproject_on_the_fly.
    if len(polygons) == 1:
        g_utm, _ = reproject_on_the_fly(polygons.iloc[0]['geometry'])
        df.loc[rows, 'predicted_plant_area_sq_m'] = g_utm.area

    # Kiln area: LineStrings sharing a Name are treated as one kiln; a kiln with
    # exactly two lines contributes the product of their reprojected lengths.
    # Lines with a missing Name are skipped by groupby's default NaN handling.
    total_area = 0.0
    for _, kiln_lines in lines.groupby(['uid', 'Processed By', 'Name']):
        if len(kiln_lines) == 2:
            side_a, _ = reproject_on_the_fly(kiln_lines.iloc[0]['geometry'])
            side_b, _ = reproject_on_the_fly(kiln_lines.iloc[1]['geometry'])
            total_area += side_a.length * side_b.length
    df.loc[rows, 'predicted_kiln_area_sq_m'] = total_area

### Consensus

In [7]:
# One row per (plant, annotator): the annotation-level columns are dropped, so
# duplicates collapse to a single prediction per annotator.
df = df[['uid', 'latitude', 'longitude', 'Processed By', 'predicted_plant_type', 'predicted_production_type', 'predicted_plant_area_sq_m', 'predicted_kiln_area_sq_m']].drop_duplicates()

grouped = df.groupby(['uid', 'latitude', 'longitude'])

OUTPUT_COLUMNS = ['uid', 'latitude', 'longitude', 'num_labelers', 'plant_type', 'production_type',
                  'plant_area_mean', 'plant_area_median', 'plant_area_max',
                  'kiln_area_mean', 'kiln_area_median', 'kiln_area_max']


def _consensus_label(votes, description, uid):
    """Return the single most frequent label in `votes`, or '' on a tie.

    Ties are detected explicitly instead of relying on statistics.mode raising:
    since Python 3.8 it returns the first mode, which would resolve
    disagreements silently and hide them from the review step. A tie is
    reported on stdout.
    """
    counts = {}
    for vote in votes:
        counts[vote] = counts.get(vote, 0) + 1
    top = max(counts.values())
    winners = [label for label, count in counts.items() if count == top]
    if len(winners) == 1:
        return winners[0]
    print("Can't find mode of {}".format(description), "({})".format(votes), "for plant", uid)
    return ''


# Collect one record per plant and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the frame on every iteration.
records = []
for (uid, latitude, longitude), group in grouped:
    plant_type = _consensus_label(group['predicted_plant_type'].tolist(), "plant type", uid)
    production_type = _consensus_label(group['predicted_production_type'].tolist(), "production type", uid)

    # Areas that were never computed are 0.0 and are included in these statistics.
    plant_areas = group['predicted_plant_area_sq_m'].tolist()
    kiln_areas = group['predicted_kiln_area_sq_m'].tolist()

    records.append({'uid': uid,
                    'latitude': latitude,
                    'longitude': longitude,
                    'num_labelers': len(group),
                    'plant_type': plant_type,
                    'production_type': production_type,
                    'plant_area_mean': statistics.mean(plant_areas),
                    'plant_area_median': statistics.median(plant_areas),
                    'plant_area_max': max(plant_areas),
                    'kiln_area_mean': statistics.mean(kiln_areas),
                    'kiln_area_median': statistics.median(kiln_areas),
                    'kiln_area_max': max(kiln_areas)})

df_output = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
    

Can't find mode of production type (['Wet', 'Dry']) for plant BGD0017
Can't find mode of plant type (['Integrated', 'Grinding']) for plant BRA0023
Can't find mode of plant type (['Integrated', 'Grinding']) for plant BRA0026
Can't find mode of plant type (['Integrated', 'Grinding']) for plant BRA0027
Can't find mode of plant type (['Integrated', 'Grinding']) for plant BRA0048
Can't find mode of production type (['Wet', 'Dry']) for plant BRA0049
Can't find mode of production type (['Wet', 'Dry']) for plant BRA0087
Can't find mode of plant type (['Integrated', 'Grinding']) for plant CAN0003
Can't find mode of plant type (['Integrated', 'Grinding']) for plant CAN0014
Can't find mode of plant type (['Grinding', 'Integrated']) for plant CHN0073
Can't find mode of production type (['Wet', 'Dry']) for plant CHN0073
Can't find mode of plant type (['Grinding', 'Integrated']) for plant CHN0074
Can't find mode of plant type (['Grinding', 'Integrated']) for plant CHN0075
Can't find mode of plant ty

In [8]:
# Reported capacity per plant, taken from the v4.1 asset dataset
capacity_columns = ['uid', 'reported_capacity']
new_uid_df = pd.read_csv("../../asset-datasets-v4p1/cement_dataset_v4.1.csv")[capacity_columns]

# Outer join on uid; the `_merge` indicator column records which side(s)
# each row came from
joined_df = df_output.merge(new_uid_df, how='outer', on='uid', indicator=True)

In [9]:
# Keep only plants present in both the consensus table and the v4.1 dataset.
# The guard makes this cell safe to re-run: the column selection below drops
# `_merge`, so an unguarded second run would fail when looking it up.
if '_merge' in joined_df.columns:
    joined_df = joined_df[joined_df['_merge'] == 'both']

joined_df = joined_df[['uid', 'latitude', 'longitude', 'reported_capacity', 'num_labelers', 'plant_type', 'production_type', 'plant_area_mean', 'plant_area_median', 'plant_area_max', 'kiln_area_mean', 'kiln_area_median', 'kiln_area_max']]
# joined_df.to_csv('output/aggregated_annotations_set2.csv', index=False)

In [10]:
# Preview the first rows of the aggregated table
preview = joined_df.head()
with pd.option_context('display.max_rows', 300):
    display(preview)

Unnamed: 0,uid,latitude,longitude,reported_capacity,num_labelers,plant_type,production_type,plant_area_mean,plant_area_median,plant_area_max,kiln_area_mean,kiln_area_median,kiln_area_max
0,AFG0001,35.9658,68.686338,,3,Integrated,Wet,59694.296096,60032.03,60465.4,1097.331025,1182.355153,1423.136914
1,AGO0001,-12.537825,13.496729,,3,Grinding,Wet,255977.671935,285921.2,312462.4,0.0,0.0,0.0
2,AGO0002,-12.342644,13.581766,0.35,3,Grinding,Wet,34878.829874,34656.2,36134.05,76.007485,0.0,228.022454
3,AGO0003,-11.185243,14.030804,,3,Integrated,Dry,533124.293332,432354.5,752180.2,304.226839,321.271889,344.329822
4,AGO0004,-9.101295,13.567408,,3,Integrated,Dry,930376.764663,1017445.0,1105695.0,694.202968,681.840779,821.392787


In [11]:
# How many plants were labelled by 1, 2, 3, ... annotators
joined_df['num_labelers'].value_counts()

3    2553
2     231
1      18
4       4
Name: num_labelers, dtype: int64

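### Review plants where:
- the number of labelers is 0 or 1
- the labelers did not reach consensus on plant type or production type (e.g. 2 labelers who disagree)
- fewer than 2 labelers reported "no issues" (e.g. the kiln is blocked), unless all 3 labelers agree that the plant is a duplicate or that the kiln is under a cover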

In [12]:
REVIEW_COLUMNS = ['uid', 'latitude', 'longitude']

# 0 labelers: plants in the status sheets that never reached the aggregated
# table, excluding rows marked as a duplicate plant
never_labeled = ~df_status.uid.isin(joined_df.uid.values)
not_duplicate = df_status.status != 'duplicate plant'
review_df0 = df_status[never_labeled & not_duplicate][REVIEW_COLUMNS].drop_duplicates()

# 1 labeler or no consensus ('' marks a tie on plant or production type)
needs_second_look = (joined_df.num_labelers < 2) | (joined_df.plant_type == '') | (joined_df.production_type == '')
review_df1 = joined_df[needs_second_look][REVIEW_COLUMNS]

# Per plant, count the status-sheet rows that carry each status value
status_counts = pd.pivot_table(df_status.reset_index(), index=REVIEW_COLUMNS, values='index', columns=['status'], aggfunc=lambda x: len(x.unique())).reset_index().fillna(0.0)

# A status that no annotator used has no pivot column; add it as all-zero so
# the filter below cannot raise KeyError.
for status in ('no issues', 'duplicate plant', 'kiln under a cover'):
    if status not in status_counts.columns:
        status_counts[status] = 0.0

# plants where fewer than 2 labelers reported 'no issues';
# remove plants where all labelers agree that it is a duplicate plant or the kiln is under a cover
flagged = (status_counts['no issues'] < 2.0) & (status_counts['duplicate plant'] < 3.0) & (status_counts['kiln under a cover'] < 3.0)
review_df2 = status_counts[flagged][REVIEW_COLUMNS]

review_df = pd.concat([review_df0, review_df1, review_df2]).drop_duplicates()

review_gdf = gpd.GeoDataFrame(review_df, geometry=gpd.points_from_xy(review_df.longitude, review_df.latitude), crs="EPSG:4326")

In [13]:
# Create the export directory if needed; nothing earlier in the notebook
# writes to output/, so it may not exist on a fresh checkout.
os.makedirs("output", exist_ok=True)
review_gdf.to_file("output/sw_cement_annotations.geojson", driver='GeoJSON')

In [14]:
# Number of plants queued for review
review_gdf.shape[0]

203

In [15]:
# First few plants queued for review
review_gdf.head(n=5)

status,uid,latitude,longitude,geometry
1165,COL0019,6.549877,-74.798042,POINT (-74.79804 6.54988)
2004,MLI0003,15.088743,-9.504169,POINT (-9.50417 15.08874)
116,BGD0017,23.934081,90.613085,POINT (90.61308 23.93408)
178,BRA0023,-19.607266,-44.058174,POINT (-44.05817 -19.60727)
181,BRA0026,-24.518867,-48.856014,POINT (-48.85601 -24.51887)
