In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date
import re

In [2]:
# Enable the KML driver for both reading and writing ('rw') so that the
# gpd.read_file(..., driver='KML') and to_file(..., driver='KML') calls further
# down are accepted.
# The flag is set on the directly imported `fiona` module instead of reaching
# through `gpd.io.file.fiona`: some newer geopandas releases import fiona
# lazily, so that attribute can still be None at this point.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
def read_annotation_dir(root):
    """Collect one frame per ``.kml`` file found anywhere under ``root``.

    Every frame gets two extra columns derived from the file location:
    ``Processed by`` (name of the folder that directly contains the file) and
    ``tile_id`` (file name up to its first ``.``), both stripped of
    leading/trailing whitespace.

    Parameters
    ----------
    root : str
        Directory to walk recursively. A missing directory yields no files.

    Returns
    -------
    list
        The frames returned by ``gpd.read_file``, in ``os.walk`` order.
    """
    frames = []
    for folder, _subdirs, files in os.walk(root):
        # os.path handles any path separator, unlike splitting on "/"
        annotator = os.path.basename(os.path.normpath(folder)).strip()
        for file_name in files:
            if not file_name.endswith(".kml"):
                continue
            frame = gpd.read_file(os.path.join(folder, file_name), driver='KML')
            frame['Processed by'] = annotator
            frame['tile_id'] = file_name.split(".")[0].strip()
            frames.append(frame)
    return frames


# NOTE(review): this reads 'Astraea to Review', while the feedback cell writes
# to 'CloudFactory to Review' -- confirm that both folder names are intended.
annotation_frames = read_annotation_dir('Completed') + read_annotation_dir('Astraea to Review')

# A single concat replaces the per-file DataFrame.append, which copied the
# whole accumulated frame on every file and no longer exists in pandas 2.
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    df_annotations = pd.DataFrame()

In [4]:
# read in statuses
# One worksheet per annotator; only the tracking columns are kept.
sets = ['Annotator 1', 'Annotator 2', 'Annotator 3']
status_columns = ['tile_id', 'Date Processed', 'Processed by', 'Status', 'Notes']

# A single concat over all sheets replaces the per-sheet DataFrame.append,
# which no longer exists in pandas 2.
df_status = pd.concat(
    [
        pd.read_excel("Cement Plant Annotation - Group 3.xlsx", sheet_name=sheet)[status_columns]
        for sheet in sets
    ],
    ignore_index=True,
)

# convert to string (missing values become the literal string 'nan', which the
# feedback loop further down checks for)
df_status['Processed by'] = df_status['Processed by'].astype(str)
df_status['tile_id'] = df_status['tile_id'].astype(str)

# to lower case, then remove any leading/trailing spaces
df_status['Processed by'] = df_status['Processed by'].str.lower().str.strip()
df_status['tile_id'] = df_status['tile_id'].str.strip()

# fix status: blank instead of NaN, lower-cased, trimmed
df_status['Status'] = df_status['Status'].fillna('').str.lower().str.strip()

In [5]:
# Tally how often each status occurs and what share of all rows it makes up.
df_status_count = (
    df_status['Status']
    .value_counts()
    .reset_index()
    .set_axis(['status', 'count'], axis=1)
    .assign(percent=lambda tally: tally['count'] / tally['count'].sum() * 100)
)
df_status_count

Unnamed: 0,status,count,percent
0,no issues,2307,46.130774
1,steel plant,1498,29.954009
2,plant not found,901,18.016397
3,,230,4.59908
4,kiln under a cover,26,0.519896
5,unclear imagery,16,0.319936
6,kiln is blocked,15,0.29994
7,plant under construction,7,0.139972
8,cloudy imagery,1,0.019996


In [6]:
# Attach annotations to every status row (status rows without annotations are
# kept by the left join), then discard exact duplicate rows in case an
# annotator accidentally exported the same annotation twice.
df = (
    df_status
    .merge(df_annotations, how='left', on=['tile_id', 'Processed by'])
    .drop_duplicates()
)

In [7]:
# Exclude rows processed by 'courtney'. The explicit copy makes `df` an
# independent frame, so the column assignments in the following cell do not
# raise SettingWithCopyWarning on a filtered slice.
df = df[df['Processed by'] != 'courtney'].copy()

In [8]:
# Parse the Description field as a date; anything unparseable becomes NaT.
acquisition_dates = pd.to_datetime(df['Description'], errors='coerce')
df['Acquisition_Date'] = acquisition_dates
df['Acquisition_Year'] = acquisition_dates.dt.year

In [9]:
# Count annotations per acquisition year. The percentage is taken over all
# counted years, before the two excluded years are dropped from the table.
df_year_count = (
    df['Acquisition_Year']
    .value_counts()
    .reset_index()
    .set_axis(['year', 'count'], axis=1)
    .assign(percent=lambda tally: tally['count'] / tally['count'].sum() * 100)
    .astype({'year': int})
)
# Drop the years 2026 and 2107 from the table.
df_year_count = df_year_count[~df_year_count['year'].isin([2026, 2107])]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
14,2002,1,0.00821
13,2008,7,0.057467
12,2009,22,0.180609
11,2010,32,0.262704
10,2011,61,0.50078
9,2012,69,0.566456
8,2013,256,2.101634
7,2014,479,3.932354
6,2015,535,4.392086
5,2016,546,4.482391


### Quality Assurance

In [10]:
def check_annotation_quality(group, min_year=1900, max_year=2021):
    """Describe the problems found in one annotator's annotations for a tile.

    Checks performed, in order:

    * exactly one ``Polygon`` geometry is present;
    * every kiln (``LineString`` rows grouped by ``tile_id``, ``Processed by``
      and ``Name``) has exactly two line measurements;
    * every non-null ``Acquisition_Date`` has a year within
      ``[min_year, max_year]``.

    The input frame is not modified.

    Parameters
    ----------
    group : DataFrame
        Rows with ``geometry``, ``tile_id``, ``Processed by``, ``Name`` and
        ``Acquisition_Date`` columns.
    min_year, max_year : int
        Inclusive range of acceptable acquisition years.

    Returns
    -------
    str
        Concatenation of ``"<message>; "`` fragments, or ``''`` when no
        problem was found.
    """
    # Geometry kinds are kept in a separate Series so that no column is written
    # into the caller's frame; a null geometry yields None and matches no check.
    geom_kinds = group.geometry.apply(lambda geom: getattr(geom, 'geom_type', None))

    issues = []

    # check polygon if missing / if multiple polygons
    polygon_count = int((geom_kinds == 'Polygon').sum())
    if polygon_count == 0:
        issues.append("Polygon is missing; ")
    elif polygon_count > 1:
        issues.append("Multiple polygons annotated; ")

    # check number of linestrings for each kiln
    line_rows = group[geom_kinds == 'LineString']
    for key, kiln_rows in line_rows.groupby(['tile_id', 'Processed by', 'Name']):
        kiln_name = key[2]
        if len(kiln_rows) < 2:
            issues.append("Length or width of kiln {} is missing; ".format(kiln_name))
        elif len(kiln_rows) > 2:
            issues.append("More than 2 line measurements for kiln {}; ".format(kiln_name))

    # check acquisition dates; the message is reported once no matter how many
    # rows carry an out-of-range year
    has_bad_date = any(
        pd.notnull(acquired) and (acquired.year < min_year or acquired.year > max_year)
        for acquired in group.Acquisition_Date
    )
    if has_bad_date:
        issues.append("Date format is incorrect on one of annotations; ")

    return "".join(issues)

In [11]:
# Remove previous feedback
!rm -r Completed/*
!rm -r CloudFactory\ to\ Review/*
!rm Completed.tar.gz
!rm CloudFactory\ to\ Review.tar.gz

rm: cannot remove 'CloudFactory to Review/*': No such file or directory
rm: cannot remove 'CloudFactory to Review.tar.gz': No such file or directory


In [13]:
# check if we have all the annotations
grouped = df.groupby(['tile_id'])
results_df = pd.DataFrame(columns=['tile_id', 'annotator1', 'issues1', 'annotator2', 'issues2', 'annotator3', 'issues3'])

for name, group in grouped:
    
    res_dict = {}
    res_dict['tile_id'] = name    
    
    grouped2 = group.groupby(['tile_id', 'Processed by', 'Status'])
    idx = 1
    for name2, group2 in grouped2:
        if len(group2[group2.Name.notnull()]) == 1667:
            s = "Excel file needs to be removed from output"
        elif name2[2].strip() == '':
            s = "Status is missing"
        elif name2[2] == 'no issues':
            if len(group2[group2.Name.notnull()]) == 0:
                s = "Annotations are missing"
            else:
                s = check_annotation_quality(group2)
        else:
            s = ''
        res_dict['annotator{}'.format(idx)] = name2[1] 
        res_dict['issues{}'.format(idx)] = s 
        idx += 1
        
        tmp = df_annotations[(df_annotations["Processed by"] == name2[1]) & (df_annotations.tile_id == name)]
        
        if name2[1] != 'nan':
            if not os.path.isdir('Completed/{}'.format(name2[1])):
                os.mkdir('Completed/{}'.format(name2[1])) 
                os.mkdir('CloudFactory to Review/{}'.format(name2[1])) 

            if len(tmp) > 0:
                if s == '':
                    with fiona.drivers():
                        tmp.to_file('Completed/{}/{}.kml'.format(name2[1], name), driver='KML')
                else:
                    with fiona.drivers():
                        tmp.to_file('CloudFactory to Review/{}/{}.kml'.format(name2[1], name), driver='KML')
                        
    results_df = results_df.append(res_dict, ignore_index=True)

In [14]:
def check_overall(row):
    """Return how many of the three issue fields of ``row`` are empty strings.

    ``row`` must expose ``issues1``, ``issues2`` and ``issues3``. Any value
    other than ``''`` (including a missing value) counts as an open issue, so
    the result is ``3`` minus the number of such fields.
    """
    issue_texts = (row.issues1, row.issues2, row.issues3)
    flagged = [text for text in issue_texts if text != '']
    return 3 - len(flagged)

# Score every tile row by row with check_overall.
results_df['num_completed'] = results_df.apply(check_overall, axis='columns')

### Write Output

In [15]:
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
Processing /home/jovyan/.cache/pip/wheels/e2/bd/55/048b4fd505716c4c298f42ee02dffd9496bb6d212b266c7f31/et_xmlfile-1.0.1-py3-none-any.whl
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 openpyxl-3.0.7


In [16]:
import openpyxl as pxl  # imported after the install cell; keeps the `pxl` alias defined

# Write results_df to the 'Feedback' sheet of the tracking workbook while
# keeping all other sheets. Append mode lets pandas load the existing workbook
# itself, so there is no need to assign writer.book / writer.sheets or to call
# writer.save() -- all three are rejected by newer pandas releases; the `with`
# block saves on exit.
with pd.ExcelWriter("Cement Plant Annotation - Group 3.xlsx", engine='openpyxl', mode='a') as writer:
    # Drop a 'Feedback' sheet left by an earlier run so that no stale rows
    # survive and pandas does not refuse to write to an existing sheet.
    if 'Feedback' in writer.book.sheetnames:
        writer.book.remove(writer.book['Feedback'])
        # older pandas caches the sheets in a dict; forget the removed one
        writer.sheets.pop('Feedback', None)
    results_df.to_excel(writer, sheet_name='Feedback', index=False)

In [17]:
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/

CloudFactory to Review/
CloudFactory to Review/marvin mwangi/
CloudFactory to Review/emily nyawira waithera/
CloudFactory to Review/damaris kwamboka okenyuri/
CloudFactory to Review/caroline kioko/
CloudFactory to Review/rasoa simiyu/
CloudFactory to Review/eva irungu/
CloudFactory to Review/njoki muriithi/
CloudFactory to Review/erick karanja/
CloudFactory to Review/john oduor otieno/
CloudFactory to Review/ephantus maina/
CloudFactory to Review/joshua gichuki mwangi/
CloudFactory to Review/zindzi damianna/
CloudFactory to Review/maina lawrence irungu/
CloudFactory to Review/moses njau/
CloudFactory to Review/vincent omondi/
CloudFactory to Review/fiona atieno/
CloudFactory to Review/vincent kipngetich/
CloudFactory to Review/isaack odhiambo otieno/
CloudFactory to Review/mwangi emmanuel/
CloudFactory to Review/clement omunga/
Completed/
Completed/marvin mwangi/
Completed/marvin mwangi/MGRS-48RTU-0041-2020-01.kml
Completed/marvin mwangi/MGRS-45TWJ-0536-2020-01.kml
Completed/marvin mwa