In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date
import re

In [2]:
# Enable fiona's KML driver for both reading and writing ('rw').
# The driver table is reached through the `fiona` module imported at the top of
# the notebook rather than through geopandas' private `gpd.io.file.fiona`
# attribute, which is not populated on geopandas versions that import fiona lazily.
fiona.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
def load_kml_annotations(root, reader=None):
    """Collect one frame per ``.kml`` file found anywhere under ``root``.

    Each frame is tagged with two extra columns:

    * ``Processed By`` - name of the directory that directly contains the file
    * ``uid``          - file name up to its first ``.``

    Parameters
    ----------
    root : str
        Directory to walk recursively. A missing directory yields no frames.
    reader : callable, optional
        ``reader(path) -> DataFrame``. Defaults to reading the file with
        geopandas using the KML driver.

    Returns
    -------
    list
        The tagged frames, in sorted directory / file-name order.
    """
    if reader is None:
        reader = lambda kml_path: gpd.read_file(kml_path, driver='KML')

    frames = []
    for dirpath, subdirs, filenames in os.walk(root):
        subdirs.sort()  # in-place sort makes os.walk descend in a stable order
        for filename in sorted(filenames):
            if not filename.endswith(".kml"):
                continue
            layer = reader(os.path.join(dirpath, filename))
            # os.path helpers instead of splitting on "/" keep this portable
            layer['Processed By'] = os.path.basename(dirpath).strip()
            layer['uid'] = filename.split(".")[0].strip()
            frames.append(layer)
    return frames


annotation_frames = []
for annotation_root in ('Completed', 'Astraea to Review'):
    annotation_frames.extend(load_kml_annotations(annotation_root))

# Concatenate once instead of calling DataFrame.append per file (append was
# removed in pandas 2.0 and is quadratic when used in a loop).
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    df_annotations = pd.DataFrame()

In [4]:
# function to remove middle name from excel spreadsheet
def remove_middle_name(name):
    """Reduce a whitespace-separated name to two tokens.

    * three or more tokens -> first token + third token
    * exactly two tokens   -> both tokens, joined by a single space
    * fewer than two       -> ``name`` returned unchanged
    """
    tokens = name.split()
    if len(tokens) < 2:
        return name
    second_idx = 2 if len(tokens) > 2 else 1
    return ' '.join((tokens[0], tokens[second_idx]))

In [5]:
# read in statuses
status_columns = ['uid', 'latitude', 'longitude', 'Date Processed', 'Processed By', 'status', 'notes']
sets = ['Annotator 1', 'Annotator 2', 'Annotator 3']

# Passing a list of sheet names opens the workbook once and returns a dict of
# frames keyed by sheet name.
status_sheets = pd.read_excel("Cement Plant Annotation Worksheet.xlsx", sheet_name=sets)

# Concatenate once instead of DataFrame.append per sheet (removed in pandas 2.0).
df_status = pd.concat(
    [status_sheets[sheet][status_columns] for sheet in sets],
    ignore_index=True,
)

# convert to string (missing values become the literal string 'nan')
df_status['Processed By'] = df_status['Processed By'].astype(str)
df_status['uid'] = df_status['uid'].astype(str)

# remove middle name from the annotator name and lower-case it
df_status['Processed By'] = df_status['Processed By'].apply(remove_middle_name).str.lower()

# remove any leading/trailing spaces
df_status['uid'] = df_status['uid'].str.strip()
df_status['Processed By'] = df_status['Processed By'].str.strip()

# fix status: empty string for missing values, lower-case otherwise
df_status['status'] = df_status['status'].fillna('').str.lower()

In [6]:
# Sanity-check the join keys: an outer merge with indicator=True adds a `_merge`
# column telling whether each row matched on both sides ('both'), only in the
# status worksheet ('left_only') or only in the KML annotations ('right_only').
df = df_status.merge(
    df_annotations,
    how='outer',
    on=['uid', 'Processed By'],
    indicator=True,
)

In [7]:
# Rows per annotator that matched in both the worksheet and the KML files.
df.loc[df['_merge'] == 'both', 'Processed By'].value_counts()

rasoa simiyu          385
john otieno           313
benice ruga           285
isaack otieno         266
moses njau            243
vincent kipngetich    240
marvin mwangi         216
joshua mwangi         213
damaris okenyuri      211
maina irungu          174
fiona atieno          173
erick karanja         171
caroline kioko        169
regina nyambu         169
solomon gachure       127
emily waithera        119
njoki muriithi        112
clement omunga        104
ephantus maina         89
Name: Processed By, dtype: int64

In [8]:
# Rows per annotator present only in the KML annotations (no worksheet row).
df.loc[df['_merge'] == 'right_only', 'Processed By'].value_counts()

isaack otieno         4
vincent kipngetich    4
marvin mwangi         4
regina nyambu         1
Name: Processed By, dtype: int64

In [9]:
# Rows per annotator present only in the worksheet (no KML annotation).
df.loc[df['_merge'] == 'left_only', 'Processed By'].value_counts()

benice ruga           4
marvin mwangi         4
njoki muriithi        3
rasoa simiyu          2
fiona atieno          2
ephantus maina        2
damaris okenyuri      1
vincent kipngetich    1
john otieno           1
regina nyambu         1
Name: Processed By, dtype: int64

In [10]:
# Frequency table of worksheet statuses with each status' share of all rows.
df_status_count = (
    df_status['status']
    .value_counts()
    .rename_axis('status')
    .reset_index(name='count')
)
status_total = df_status_count['count'].sum()
df_status_count['percent'] = df_status_count['count'].div(status_total).mul(100)
df_status_count

Unnamed: 0,status,count,percent
0,no issues,777,88.395904
1,kiln under a cover,40,4.550626
2,unclear imagery,21,2.389078
3,kiln is blocked,20,2.275313
4,plant under construction,15,1.706485
5,plant not found,3,0.341297
6,cloudy imagery,2,0.227531
7,no imagery,1,0.113766


In [11]:
# Attach annotations to every worksheet row (left join keeps rows that have no
# KML), then drop exact duplicate rows in case an annotator exported the same
# annotation twice.
df = (
    df_status
    .merge(df_annotations, how='left', on=['uid', 'Processed By'])
    .drop_duplicates()
)

In [12]:
# Parse the KML Description field as a date; values that cannot be parsed
# become NaT (errors='coerce'). The year is then taken from the parsed date.
df = df.assign(
    Acquisition_Date=lambda frame: pd.to_datetime(frame['Description'], errors='coerce'),
    Acquisition_Year=lambda frame: frame['Acquisition_Date'].dt.year,
)

In [13]:
# Count annotations per acquisition year.
df_year_count = (
    df['Acquisition_Year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
# Percentages are taken over every parsed year, i.e. before the two years
# below are excluded from the table.
year_total = df_year_count['count'].sum()
df_year_count['percent'] = df_year_count['count'].div(year_total).mul(100)
df_year_count['year'] = df_year_count['year'].astype(int)
# Exclude the years 2026 and 2107 from the displayed table.
df_year_count = df_year_count[~df_year_count['year'].isin([2026, 2107])]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
16,2002,6,0.164204
17,2003,3,0.082102
12,2004,41,1.122058
19,2005,2,0.054735
15,2006,11,0.30104
18,2007,2,0.054735
14,2008,11,0.30104
13,2009,27,0.738916
11,2010,58,1.587302
8,2011,75,2.052545


### Quality Assurance

In [14]:
def check_annotation_quality(group):
    """Describe structural problems in one annotator's annotations for one plant.

    Parameters
    ----------
    group : DataFrame
        Rows with at least the columns ``uid``, ``Processed By``, ``Name`` and
        ``geometry``. The frame is not modified.

    Returns
    -------
    str
        Concatenation of ``"<issue>; "`` messages, or ``""`` if nothing was found.
        Checks performed:

        * exactly one Polygon is expected (missing / multiple are reported)
        * every ``Name`` among the LineStrings is expected to have exactly two
          lines (fewer / more are reported)
    """
    # Geometry type per row, kept in a local Series so the caller's frame (often
    # a groupby slice) is not mutated. `geom_type` replaces the deprecated
    # shapely `.type`; rows without a geometry object map to None.
    geom_types = group.geometry.apply(lambda geom: getattr(geom, 'geom_type', None))

    issues = []

    # check polygon if missing / if multiple polygons
    polygon_count = int((geom_types == 'Polygon').sum())
    if polygon_count == 0:
        issues.append("Polygon is missing; ")
    elif polygon_count > 1:
        issues.append("Multiple polygons annotated; ")

    # check number of linestrings for each kiln
    line_rows = group[geom_types == 'LineString']
    for (_uid, _annotator, kiln), kiln_lines in line_rows.groupby(['uid', 'Processed By', 'Name']):
        if len(kiln_lines) < 2:
            # missing measurement
            issues.append("Length or width of kiln {} is missing; ".format(kiln))
        elif len(kiln_lines) > 2:
            # too many measurements
            issues.append("More than 2 line measurements for kiln {}; ".format(kiln))

    return "".join(issues)

In [15]:
# Remove previous feedback
# NOTE: destructive. `Completed/` is also walked by the annotation-loading cell
# above, so after this runs those KML files exist only in `df_annotations`
# until the QA cell below writes them out again.
# Contents of both output folders are deleted; the folders themselves are kept.
!rm -r Completed/*
!rm -r CloudFactory\ to\ Review/*
# Archives from an earlier run; `rm` reports an error if they do not exist.
!rm Completed.tar.gz
!rm CloudFactory\ to\ Review.tar.gz

rm: cannot remove 'Completed.tar.gz': No such file or directory
rm: cannot remove 'CloudFactory to Review.tar.gz': No such file or directory


In [16]:
# check if we have all the annotations
# For every plant (uid) and every (annotator, status) pair on it:
#   * derive an issue string (empty string == nothing to fix)
#   * re-export the annotator's KML to 'Completed/<annotator>/' when there is
#     no issue, or to 'CloudFactory to Review/<annotator>/' otherwise
# and collect one summary row per plant in `results_df`.
completed_root = 'Completed'
review_root = 'CloudFactory to Review'
result_columns = ['uid', 'annotator1', 'issues1', 'annotator2', 'issues2', 'annotator3', 'issues3']

qa_records = []

# Group by the bare column name: grouping by the one-element list ['uid']
# yields 1-tuples as keys on pandas >= 2, which would leak into the uid column
# and into the exported file names.
grouped = df.groupby('uid')

for name, group in grouped:

    res_dict = {'uid': name}

    grouped2 = group.groupby(['uid', 'Processed By', 'status'])
    for idx, (name2, group2) in enumerate(grouped2, start=1):
        _, annotator, status = name2

        if status.strip() == '':
            s = "Status is missing"
        elif status == 'no issues':
            if len(group2[group2.Name.notnull()]) == 0:
                s = "Annotations are missing"
            else:
                s = check_annotation_quality(group2)
        else:
            # any other status explains why no annotation is expected
            s = ''
        res_dict['annotator{}'.format(idx)] = annotator
        res_dict['issues{}'.format(idx)] = s

        tmp = df_annotations[(df_annotations["Processed By"] == annotator) & (df_annotations.uid == name)]

        # 'nan' is what astype(str) produced for a missing annotator name
        if annotator != 'nan':
            completed_dir = os.path.join(completed_root, annotator)
            review_dir = os.path.join(review_root, annotator)
            # Create each folder independently; exist_ok avoids failing when
            # only one of the two already exists.
            os.makedirs(completed_dir, exist_ok=True)
            os.makedirs(review_dir, exist_ok=True)

            if len(tmp) > 0:
                target_dir = completed_dir if s == '' else review_dir
                # fiona.Env() is the replacement for the deprecated fiona.drivers()
                with fiona.Env():
                    tmp.to_file(os.path.join(target_dir, '{}.kml'.format(name)), driver='KML')

    qa_records.append(res_dict)

# Build the frame once instead of DataFrame.append per plant (removed in
# pandas 2.0). Expected columns come first; any extra annotatorN/issuesN
# columns (more than three groups on a plant) are kept after them.
results_df = pd.DataFrame(qa_records)
extra_columns = [col for col in results_df.columns if col not in result_columns]
results_df = results_df.reindex(columns=result_columns + extra_columns)

In [17]:
def check_overall(row):
    """Return how many of the three issue fields on ``row`` are empty strings.

    Reads ``row.issues1``, ``row.issues2`` and ``row.issues3``; any value other
    than ``''`` counts as an open issue. The result is ``3 - open issues``.
    """
    issue_fields = (row.issues1, row.issues2, row.issues3)
    open_issues = sum(1 for issue in issue_fields if issue != '')
    return 3 - open_issues

# Per plant: number of annotator slots (out of three) whose issue string is empty.
results_df['num_completed'] = results_df.apply(check_overall,axis=1)

In [18]:
# Uncomment to inspect the full QA table inline:
# with pd.option_context('display.max_rows', 300):
#     display(results_df)

# Write the QA summary to feedback/quality_assurance_<today's date>.csv
today = date.today()
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('feedback', exist_ok=True)
results_df.to_csv('feedback/quality_assurance_{}.csv'.format(today), index=False)

### TODO: Go back and fix above labels.

In [20]:
# Package both output folders as gzip-compressed tarballs
# (-c create, -z gzip, -v list every entry, -f archive file name).
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/

CloudFactory to Review/
CloudFactory to Review/maina irungu/
CloudFactory to Review/marvin mwangi/
CloudFactory to Review/regina nyambu/
CloudFactory to Review/isaack otieno/
CloudFactory to Review/caroline kioko/
CloudFactory to Review/rasoa simiyu/
CloudFactory to Review/solomon gachure/
CloudFactory to Review/njoki muriithi/
CloudFactory to Review/erick karanja/
CloudFactory to Review/ephantus maina/
CloudFactory to Review/emily waithera/
CloudFactory to Review/benice ruga/
CloudFactory to Review/moses njau/
CloudFactory to Review/john otieno/
CloudFactory to Review/fiona atieno/
CloudFactory to Review/vincent kipngetich/
CloudFactory to Review/joshua mwangi/
CloudFactory to Review/damaris okenyuri/
CloudFactory to Review/clement omunga/
Completed/
Completed/maina irungu/
Completed/maina irungu/EGY0009.kml
Completed/maina irungu/RUS0013.kml
Completed/maina irungu/BRA0036.kml
Completed/maina irungu/CHN0175.kml
Completed/maina irungu/Mar2004.kml
Completed/maina irungu/BOL0005.kml
Comp