In [2]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date
import re

In [3]:
# Enable fiona driver
# Register KML as a read/write ('rw') driver in fiona's driver table.
# The table is reached through the `fiona` module imported at the top of the
# notebook rather than through `gpd.io.file.fiona`: newer geopandas releases
# import fiona lazily, so that attribute can still be None at this point.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [4]:
# read in annotations
# One frame is collected per KML file and everything is concatenated once at
# the end, instead of growing a DataFrame with .append() inside the loop
# (quadratic copying, and DataFrame.append no longer exists in pandas 2.x).
annotation_frames = []

for path, subdirs, files in os.walk('Completed'):
    # The folder that directly contains a KML file is recorded as 'Processed By'.
    annotator = os.path.basename(path)
    for file_name in files:
        if not file_name.endswith(".kml"):
            continue
        full_path = os.path.join(path, file_name)

        # Read file
        tmp = gpd.read_file(full_path, driver='KML')
        tmp['Processed By'] = annotator.strip()
        # uid is the file name up to its first dot
        tmp['uid'] = file_name.split(".")[0].strip()

        annotation_frames.append(tmp)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# KML file was found.
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    df_annotations = pd.DataFrame()
            
# for path, subdirs, files in os.walk('Astraea to Review'):
#     for name in files:
#         if name.endswith(".kml"): 
#             full_path = os.path.join(path, name)
#             name = full_path.split("/")[-2]
#             plant = full_path.split("/")[-1]
            
#             # Read file
#             tmp = gpd.read_file(full_path, driver='KML')               
#             tmp['Processed By'] = name.strip()
#             tmp['uid'] = plant.split(".")[0].strip()

#             df_annotations = df_annotations.append(tmp, ignore_index=True)

In [5]:
# read in statuses
sets = ['Annotator 1', 'Annotator 2', 'Annotator 3']
status_columns = ['uid', 'latitude', 'longitude', 'Date Processed', 'Processed By', 'status', 'notes']

# Passing the list of sheet names parses the workbook once and returns a
# {sheet name: DataFrame} dict, instead of re-opening the file for every sheet.
sheets = pd.read_excel("cement_dataset_v4.1.xlsx", sheet_name=sets)

# Stack the sheets in the order given by `sets`, keeping only the shared columns.
# A single concat replaces the per-sheet DataFrame.append (removed in pandas 2.x).
df_status = pd.concat([sheets[s][status_columns] for s in sets], ignore_index=True)

# convert to string (missing values become the literal 'nan'),
# then remove any leading/trailing spaces
df_status['uid'] = df_status['uid'].astype(str).str.strip()

# annotator names are additionally lower-cased
df_status['Processed By'] = df_status['Processed By'].astype(str).str.lower().str.strip()

# fix status: blank out missing values, lower-case, trim
df_status['status'] = df_status['status'].fillna('').str.lower().str.strip()

In [6]:
# Diagnostic merge: keep every row from both tables and let the `_merge`
# indicator column record where each row came from
# ('both', 'left_only' = only in df_status, 'right_only' = only in df_annotations).
merge_keys = ['uid', 'Processed By']
df = df_status.merge(
    df_annotations,
    how='outer',
    left_on=merge_keys,
    right_on=merge_keys,
    indicator=True,
)

# Quick checks that can be run against the result:
# df._merge.value_counts()
# df[df._merge == 'both']['Processed By'].value_counts()
# df[df._merge == 'right_only']['Processed By'].value_counts()
# df[df._merge == 'left_only']['Processed By'].value_counts()

In [7]:
# Rows per annotator that appear in both the status sheet and the KML annotations
df.loc[df._merge == 'both', 'Processed By'].value_counts()

benice wairimu ruga          2972
marvin mwangi                2910
john oduor otieno            2695
maina lawrence irungu        2644
isaack odhiambo otieno       2431
caroline kioko               2286
vincent kipngetich           2134
fiona atieno                 2050
rasoa simiyu                 1915
njoki muriithi               1609
regina wanjala nyambu        1545
emily nyawira waithera       1381
clement omunga               1354
ephantus maina               1011
erick karanja                 963
damaris kwamboka okenyuri     962
moses njau                    955
joshua gichuki mwangi         657
solomon gitahi gachure        293
Name: Processed By, dtype: int64

In [8]:
# Rows per annotator that exist only in the KML annotations (no status row)
df.loc[df._merge == 'right_only', 'Processed By'].value_counts()

Series([], Name: Processed By, dtype: int64)

In [9]:
# Rows per annotator that exist only in the status sheet (no matching KML annotation)
df.loc[df._merge == 'left_only', 'Processed By'].value_counts()

isaack odhiambo otieno       116
joshua gichuki mwangi         31
rasoa simiyu                  22
maina lawrence irungu         22
njoki muriithi                11
caroline kioko                11
solomon gitahi gachure        10
john oduor otieno              9
fiona atieno                   9
erick karanja                  6
marvin mwangi                  6
moses njau                     5
clement omunga                 5
benice wairimu ruga            5
regina wanjala nyambu          4
nan                            3
vincent kipngetich             3
damaris kwamboka okenyuri      2
ephantus maina                 1
emily nyawira waithera         1
Name: Processed By, dtype: int64

In [10]:
# Tally each status value and express every tally as a percentage of the total count
df_status_count = (
    df_status['status']
    .value_counts()
    .rename_axis('status')
    .reset_index(name='count')
)
status_total = df_status_count['count'].sum()
df_status_count['percent'] = df_status_count['count'].div(status_total).mul(100)
df_status_count

Unnamed: 0,status,count,percent
0,no issues,7408,87.803722
1,kiln under a cover,561,6.649283
2,plant under construction,148,1.754178
3,kiln is blocked,136,1.611947
4,unclear imagery,95,1.125993
5,plant not found,50,0.592628
6,duplicate plant,28,0.331872
7,cloudy imagery,7,0.082968
8,no imagery,4,0.04741


In [10]:
# join annotations and statuses
# Left join: every status row is kept; annotation columns are filled in where
# a KML row with the same uid and annotator exists.
status_keys = ['uid', 'Processed By']
df = (
    df_status
    .merge(df_annotations, how='left', left_on=status_keys, right_on=status_keys)
    # drop duplicates in case annotator accidentally exported the same annotation twice
    .drop_duplicates()
)

In [11]:
# Parse the KML Description field as a date; anything unparseable becomes NaT
acquisition_dates = pd.to_datetime(df['Description'], errors='coerce')
df['Acquisition_Date'] = acquisition_dates
df['Acquisition_Year'] = acquisition_dates.dt.year

In [12]:
# Number of annotation rows per acquisition year, with each year's share of the total
df_year_count = (
    df['Acquisition_Year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
year_total = df_year_count['count'].sum()
df_year_count['percent'] = df_year_count['count'].div(year_total).mul(100)
df_year_count['year'] = df_year_count['year'].astype(int)

# 2026 and 2107 are dropped from the table; the percentages above were
# computed before this filter, so they still include those rows in the total.
df_year_count = df_year_count[~df_year_count['year'].isin([2026, 2107])]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
20,2000,6,0.018426
21,2001,2,0.006142
16,2002,24,0.073706
19,2003,11,0.033782
13,2004,131,0.402309
18,2005,11,0.033782
15,2006,32,0.098274
17,2007,14,0.042995
14,2008,35,0.107487
12,2009,164,0.503655


### Quality Assurance

In [13]:
def check_annotation_quality(group, min_year=1900, max_year=2021):
    """Describe the annotation problems found in one group of annotation rows.

    Parameters
    ----------
    group : DataFrame
        Rows to check. The ``geometry``, ``uid``, ``Processed By``, ``Name``
        and ``Acquisition_Date`` columns are read. The frame is not modified.
    min_year, max_year : int, optional
        Inclusive range of acquisition years accepted as valid. The defaults
        (1900 and 2021) are the bounds this check has always used.

    Returns
    -------
    str
        The concatenation of one ``"<problem>; "`` message per finding, or an
        empty string when nothing was flagged.
    """
    # Geometry type (Point, LineString, Polygon) per row. It is kept in a local
    # Series rather than written back as a column, so the caller's frame (a
    # groupby slice) is left untouched. `geom_type` is used instead of `.type`,
    # which Shapely 2.0 deprecates; rows without a geometry object get None
    # instead of raising.
    geom_types = group['geometry'].apply(lambda geom: getattr(geom, 'geom_type', None))

    s = ""

    # exactly one polygon is expected
    n_polygons = int((geom_types == 'Polygon').sum())
    if n_polygons == 0:
        s += "Polygon is missing; "
    elif n_polygons > 1:
        s += "Multiple polygons annotated; "

    # check number of linestrings for each kiln: every Name must have exactly two
    line_rows = group[geom_types == 'LineString']
    for keys, kiln_rows in line_rows.groupby(['uid', 'Processed By', 'Name']):
        kiln_name = keys[2]
        # check if missing measurement
        if len(kiln_rows) < 2:
            s += "Length or width of kiln {} is missing; ".format(kiln_name)
        # check if too many measurements
        elif len(kiln_rows) > 2:
            s += "More than 2 line measurements for kiln {}; ".format(kiln_name)

    # check acquisition dates: one message per row whose year is out of range
    # (rows without a date are skipped)
    for acquired in group['Acquisition_Date']:
        if pd.notnull(acquired) and not (min_year <= acquired.year <= max_year):
            s += "Date format is incorrect on one of annotations; "

    return s

In [14]:
# Remove previous feedback
# WARNING: the first command deletes everything under Completed/, which is the
# folder the annotation-loading cell reads its KML files from. Those files are
# only written back (from the in-memory `df_annotations`) by the feedback loop
# in the next code cell, so run this cell only after the annotations have been
# loaded, and keep a backup if that loop might fail.
# The directories themselves are kept; only their contents are removed.
!rm -r Completed/*
!rm -r CloudFactory\ to\ Review/*
# Drop the archives produced by an earlier run (rm is called without -f, so it
# reports an error when an archive does not exist yet).
!rm Completed.tar.gz
!rm CloudFactory\ to\ Review.tar.gz

In [None]:
# check if we have all the annotations
# For every plant (uid) build one feedback row listing each annotator and the
# issues found in their work, and re-export each annotator's KML either to
# 'Completed' (no issues) or to 'CloudFactory to Review' (issues found).
result_columns = ['uid', 'annotator1', 'issues1', 'annotator2', 'issues2', 'annotator3', 'issues3']
feedback_rows = []

# Group by the column name itself rather than a one-element list, so that
# `name` is the plain uid value on every pandas version (pandas 2.x yields a
# 1-tuple for list keys, which would break the uid comparison and file names).
grouped = df.groupby('uid')

for name, group in grouped:

    res_dict = {'uid': name}

    grouped2 = group.groupby(['uid', 'Processed By', 'status'])
    for idx, (name2, group2) in enumerate(grouped2, start=1):
        annotator, status = name2[1], name2[2]
        # rows that actually carry an annotation (Name comes from the KML side of the join)
        n_annotated = len(group2[group2.Name.notnull()])

        if n_annotated == 2811:
            # NOTE(review): 2811 is a hard-coded row count; presumably the size of a
            # spreadsheet exported by mistake as an annotation - confirm.
            s = "Excel file needs to be removed from output"
        elif status.strip() == '':
            s = "Status is missing"
        elif status == 'no issues':
            if n_annotated == 0:
                s = "Annotations are missing"
            else:
                s = check_annotation_quality(group2)
        else:
            # any other status is accepted without inspecting the annotations
            s = ''
        res_dict['annotator{}'.format(idx)] = annotator
        res_dict['issues{}'.format(idx)] = s

        # 'nan' is what a missing annotator name becomes after astype(str)
        if annotator != 'nan':
            completed_dir = os.path.join('Completed', annotator)
            review_dir = os.path.join('CloudFactory to Review', annotator)
            # makedirs(exist_ok=True) creates missing parent folders and is safe to
            # repeat; the previous mkdir pair was guarded by a single isdir check.
            os.makedirs(completed_dir, exist_ok=True)
            os.makedirs(review_dir, exist_ok=True)

            tmp = df_annotations[(df_annotations["Processed By"] == annotator) & (df_annotations.uid == name)]
            if len(tmp) > 0:
                # clean annotations are done; anything with feedback goes back for review
                target_dir = completed_dir if s == '' else review_dir
                # NOTE(review): fiona.drivers() is reported as deprecated in favour of
                # fiona.Env() - confirm against the installed fiona before switching.
                with fiona.drivers():
                    tmp.to_file(os.path.join(target_dir, '{}.kml'.format(name)), driver='KML')

    feedback_rows.append(res_dict)

# Build the frame once instead of appending row by row (DataFrame.append was
# removed in pandas 2.x). The standard columns come first and always exist;
# any additional annotatorN/issuesN columns follow, as they did with append.
results_df = pd.DataFrame(feedback_rows)
extra_columns = [c for c in results_df.columns if c not in result_columns]
results_df = results_df.reindex(columns=result_columns + extra_columns)

In [None]:
def check_overall(row):
    """Return how many of the three annotator slots of `row` have no issues.

    A slot counts as having an issue whenever its ``issuesN`` value differs
    from the empty string, so the result is ``3 - <number of non-empty slots>``.
    """
    issue_slots = (row.issues1, row.issues2, row.issues3)
    flagged = sum(1 for issue in issue_slots if issue != '')
    return 3 - flagged

# Per plant: number of annotator slots (out of three) without any reported issue
results_df = results_df.assign(num_completed=results_df.apply(check_overall, axis=1))

In [None]:
!pip install openpyxl

In [None]:
import openpyxl as pxl  # Excel engine used by pd.ExcelWriter below; fails fast here if it is missing

# Open the existing workbook in append mode so the other sheets are kept, and
# replace the 'Feedback' sheet if an earlier run already wrote one.
# This replaces assigning `writer.book` / `writer.sheets` by hand, which newer
# pandas no longer allows, and the explicit `writer.save()`: the `with` block
# already saves the workbook on exit.
# Requires pandas >= 1.3 for `if_sheet_exists`.
with pd.ExcelWriter(
    "cement_dataset_v4.1.xlsx",
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    results_df.to_excel(writer, sheet_name='Feedback', index=False)

### TODO: Go back and fix the labels above.

In [None]:
# Package both output folders as gzip-compressed tar archives
# (-c create, -z gzip, -v list the files while archiving, -f archive file name).
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/