In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date
import re

In [2]:
# Enable fiona's KML driver for both reading and writing ('rw').
# The registry is addressed through the directly imported `fiona` module rather
# than through geopandas' internal `gpd.io.file.fiona` attribute; it is the same
# dict, but this no longer depends on geopandas exposing fiona at that path.
# NOTE(review): newer geopandas releases appear to import fiona lazily, in which
# case `gpd.io.file.fiona` may not be populated yet — confirm if upgrading.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
def _read_kml_annotations(root):
    """Read every ``.kml`` file below ``root`` into a list of frames.

    Each frame is tagged with two extra columns:

    * ``Processed By`` -- name of the directory that directly contains the file
    * ``tile_id``      -- file name up to its first ``.``

    Parameters
    ----------
    root : str
        Directory that is walked recursively. A missing directory simply
        yields no files.

    Returns
    -------
    list
        One frame per KML file, in the order ``os.walk`` reports them.
    """
    frames = []
    for path, _subdirs, files in os.walk(root):
        for filename in files:
            if not filename.endswith(".kml"):
                continue

            # Read file
            tmp = gpd.read_file(os.path.join(path, filename), driver='KML')
            # NOTE(review): the status sheet lower-cases 'Processed By' before the
            # merge, this side does not — the join relies on the directory names
            # already being lower case.
            tmp['Processed By'] = os.path.basename(path).strip()
            tmp['tile_id'] = filename.split(".")[0].strip()
            frames.append(tmp)
    return frames


_annotation_frames = []
for _annotation_root in ('Completed', 'Astraea to Review'):
    _annotation_frames.extend(_read_kml_annotations(_annotation_root))

# Concatenate once instead of growing the frame file by file; pd.concat raises
# on an empty list, so fall back to an empty frame when nothing was found.
if _annotation_frames:
    df_annotations = pd.concat(_annotation_frames, ignore_index=True)
else:
    df_annotations = pd.DataFrame()

In [4]:
# read in statuses
sets = ['Annotator 1']
status_columns = ['tile_id', 'Plant_found', 'Class', 'Level', 'imagery_date', 'Processed By', 'Status']

# Read each sheet once and concatenate in a single step.
status_frames = [
    pd.read_excel("Cement Plant Localization - Group 2 (Refresh) - 5th March 2021.xlsx", sheet_name=s)[status_columns]
    for s in sets
]
df_status = pd.concat(status_frames, ignore_index=True)

# Join keys: convert to string, trim, and lower-case the annotator name.
# astype(str) turns a missing value into the literal string 'nan'.
df_status['Processed By'] = df_status['Processed By'].astype(str).str.lower().str.strip()
df_status['tile_id'] = df_status['tile_id'].astype(str).str.strip()

# fix Status and Plant_found: missing -> '', lower case, no surrounding spaces
for column in ('Status', 'Plant_found'):
    df_status[column] = df_status[column].fillna('').str.lower().str.strip()

# Class and Level are validated later by comparing against ''; fill missing
# cells so they are reported as missing instead of being silently dropped when
# these columns are used as groupby keys (groupby excludes NaN keys).
for column in ('Class', 'Level'):
    df_status[column] = df_status[column].fillna('')

In [5]:
# Check the join: outer-merge statuses with annotations on (tile_id, Processed By).
# The added `_merge` column is 'both', 'left_only' or 'right_only' per row.
df = df_status.merge(
    df_annotations,
    how='outer',
    on=['tile_id', 'Processed By'],
    indicator=True,
)

In [6]:
# Rows per annotator that matched on both sides of the merge.
df.loc[df['_merge'] == 'both', 'Processed By'].value_counts()

benice wairimu ruga          375
caroline kioko               222
fiona atieno                 219
maina lawrence irungu        161
john oduor otieno            159
isaack odhiambo otieno       101
erick karanja                 95
marvin mwangi                 92
njoki muriithi                60
moses njau                    51
clement omunga                33
rasoa simiyu                  27
damaris kwamboka okenyuri     22
joshua gichuki mwangi         21
ephantus maina                20
emily nyawira waithera        19
vincent kipngetich            11
Name: Processed By, dtype: int64

In [7]:
# Annotation rows per annotator that have no matching status row.
df.loc[df['_merge'] == 'right_only', 'Processed By'].value_counts()

Series([], Name: Processed By, dtype: int64)

In [8]:
# Status rows per annotator that have no matching annotation.
df.loc[df['_merge'] == 'left_only', 'Processed By'].value_counts()

marvin mwangi                690
john oduor otieno            639
benice wairimu ruga          441
caroline kioko               428
erick karanja                365
maina lawrence irungu        274
isaack odhiambo otieno       259
clement omunga               256
fiona atieno                 230
njoki muriithi               220
ephantus maina               209
vincent kipngetich           197
damaris kwamboka okenyuri    160
rasoa simiyu                 123
joshua gichuki mwangi         75
moses njau                    46
emily nyawira waithera        30
nan                            2
Name: Processed By, dtype: int64

In [9]:
# Frequency of each status value, with its share of all status rows in percent.
df_status_count = (
    df_status['Status']
    .value_counts()
    .rename_axis('status')
    .reset_index(name='count')
)
status_total = df_status_count['count'].sum()
df_status_count['percent'] = (df_status_count['count'] / status_total) * 100
df_status_count

Unnamed: 0,status,count,percent
0,no issues,5932,97.710427
1,no imagery,91,1.498929
2,unclear imagery,22,0.362379
3,cloudy imagery,15,0.247076
4,plant already found,9,0.148246
5,,2,0.032944


In [10]:
# Attach annotations to every status row (status rows without annotations are kept),
# then drop exact duplicate rows in case an annotator exported the same annotation twice.
df = (
    df_status
    .merge(df_annotations, how='left', on=['tile_id', 'Processed By'])
    .drop_duplicates()
)

In [11]:
# Parse both date sources; values that cannot be parsed become NaT (errors='coerce').
# (source column, parsed-date column, year column)
date_columns = (
    ('Description', 'Acquisition_Date', 'Acquisition_Year'),
    ('imagery_date', 'Acquisition_Date2', 'Acquisition_Year2'),
)
for source_col, date_col, year_col in date_columns:
    df[date_col] = pd.to_datetime(df[source_col], errors='coerce')
    df[year_col] = df[date_col].dt.year

In [12]:
# Annotation counts per acquisition year; the percentage is computed over all
# parsed years before the two out-of-range years are removed from the table.
df_year_count = (
    df['Acquisition_Year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
year_total = df_year_count['count'].sum()
df_year_count['percent'] = (df_year_count['count'] / year_total) * 100
df_year_count['year'] = df_year_count['year'].astype(int)
df_year_count = df_year_count[~df_year_count['year'].isin([2026, 2107])]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
10,2010,3,0.178571
11,2011,1,0.059524
7,2012,6,0.357143
9,2013,5,0.297619
8,2014,5,0.297619
6,2015,6,0.357143
4,2016,35,2.083333
3,2017,47,2.797619
2,2018,115,6.845238
1,2019,398,23.690476


### Quality Assurance

In [13]:
def check_annotation_quality(group, min_year=1900, max_year=2021):
    """Describe the quality problems found in one set of annotations.

    Parameters
    ----------
    group : pandas.DataFrame
        Annotation rows with the columns ``geometry``, ``Acquisition_Date``
        and ``Name``. The frame is not modified.
    min_year, max_year : int, optional
        Inclusive range of acceptable acquisition years.

    Returns
    -------
    str
        Concatenation of one ``"<message>; "`` entry per problem, or ``''``
        when nothing is wrong. Date and name messages are emitted once per
        offending row.
    """
    issues = []

    # Geometry type per row (Point, LineString, Polygon, ...). Computed into a
    # local Series so the caller's frame is left untouched; a missing geometry
    # simply counts as "not a point".
    geom_types = group.geometry.apply(lambda geom: getattr(geom, 'geom_type', None))

    # check if point is missing
    if not (geom_types == 'Point').any():
        issues.append("Point is missing; ")

    for _, row in group.iterrows():
        # check acquisition dates
        acquired = row['Acquisition_Date']
        if pd.notnull(acquired):
            if acquired.year < min_year or acquired.year > max_year:
                issues.append("Date format is incorrect on one of annotations; ")
        else:
            issues.append("Date is missing for annotated point; ")

        # A null name is treated like a blank one instead of raising.
        label = row['Name']
        if pd.isnull(label) or str(label).strip() == '':
            issues.append("Name is missing for annotated point; ")

    return "".join(issues)

In [14]:
# Remove previous feedback
!rm -r Completed/*
!rm -r CloudFactory\ to\ Review/*
!rm Completed.tar.gz
!rm CloudFactory\ to\ Review.tar.gz

In [15]:
# check if we have all the annotations
#
# For every tile, each distinct status row (annotator / status / class / ...)
# is validated. The annotator's KML for that tile is exported to 'Completed'
# when no issue was found and to 'CloudFactory to Review' otherwise, and one
# feedback record per tile is collected into `results_df`.
#
# NOTE(review): groupby excludes rows whose key is NaN/NaT, so a status row in
# which any of `status_keys` is still missing at this point (e.g. an imagery
# date that failed to parse) never reaches the checks below; such a tile gets
# a record with only `tile_id` filled in. Confirm whether that is intended.
status_keys = ['tile_id', 'Processed By', 'Status', 'Plant_found', 'Class', 'Level',
               'Acquisition_Date2', 'Acquisition_Year2']
result_columns = ['tile_id', 'annotator1', 'issues1']
records = []

# Group on the bare column name (not a one-element list) so that `name` is the
# scalar tile id regardless of the pandas version.
grouped = df.groupby('tile_id')

for name, group in grouped:

    res_dict = {'tile_id': name}

    for idx, (name2, group2) in enumerate(group.groupby(status_keys), start=1):
        _, annotator, status, plant_found, plant_class, level, imagery_date, imagery_year = name2

        s = ""
        if status.strip() == '':
            s = "Status is missing; "
        elif plant_found.strip() == '':
            s = "Plant_found is missing; "
        elif status == 'no issues' and plant_found == 'yes':
            # A plant was reported, so annotations must exist and be well formed.
            if not group2['Name'].notnull().any():
                s = "Annotations are missing; "
            else:
                s = check_annotation_quality(group2)

        if plant_class.strip() == '':
            s += "Class is missing; "
        if level.strip() == '':
            s += "Level is missing; "
        if pd.isnull(imagery_date):
            s += "imagery_date is missing or format is incorrect; "
        elif imagery_year < 1900 or imagery_year > 2021:
            s += "imagery_date format is incorrect; "

        res_dict['annotator{}'.format(idx)] = annotator
        res_dict['issues{}'.format(idx)] = s

        # 'nan' is what a missing annotator name becomes after astype(str).
        if annotator != 'nan':
            completed_dir = os.path.join('Completed', annotator)
            review_dir = os.path.join('CloudFactory to Review', annotator)
            # Create each directory independently (and any missing parent), so a
            # pre-existing folder on one side cannot prevent or break the other.
            os.makedirs(completed_dir, exist_ok=True)
            os.makedirs(review_dir, exist_ok=True)

            tmp = df_annotations[(df_annotations["Processed By"] == annotator) & (df_annotations.tile_id == name)]

            if len(tmp) > 0:
                target_dir = completed_dir if s == '' else review_dir
                # NOTE(review): fiona.drivers() is the legacy environment context
                # manager; confirm it is still available before upgrading Fiona.
                with fiona.drivers():
                    tmp.to_file(os.path.join(target_dir, '{}.kml'.format(name)), driver='KML')

    records.append(res_dict)

# Build the table once: base columns first, then any additional
# annotatorN / issuesN columns in the order they were first seen.
for record in records:
    for key in record:
        if key not in result_columns:
            result_columns.append(key)
results_df = pd.DataFrame(records, columns=result_columns)

In [16]:
def check_overall(row):
    """Return 1 when ``row.issues1`` is the empty string, otherwise 0."""
    return 0 if row.issues1 != '' else 1

# Per tile: 1 when no issue was recorded in `issues1`, 0 otherwise.
results_df = results_df.assign(
    num_completed=results_df.apply(check_overall, axis=1)
)

In [17]:
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
Processing /home/jovyan/.cache/pip/wheels/e2/bd/55/048b4fd505716c4c298f42ee02dffd9496bb6d212b266c7f31/et_xmlfile-1.0.1-py3-none-any.whl
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 openpyxl-3.0.7


In [18]:
# Persist the per-tile feedback table; the positional index is not written.
results_df.to_csv(path_or_buf='feedback.csv', index=False)

In [19]:
# Disabled: would write `results_df` as a 'Feedback' sheet into the status workbook.
# import openpyxl as pxl

# results_df = results_df.drop(['annotator2', 'issues2'], axis=1)

# excel_book = pxl.load_workbook("Cement Plant Localization - Group 2 (Refresh) - 5th March 2021.xlsx")
# with pd.ExcelWriter("Cement Plant Localization - Group 2 (Refresh) - 5th March 2021.xlsx", engine='openpyxl') as writer:
#     writer.book = excel_book
#     writer.sheets = {
#         worksheet.title: worksheet
#         for worksheet in excel_book.worksheets
#     }
#     results_df.to_excel(writer, 'Feedback', index=False)
#     writer.save()

### TODO: Go back and fix above labels.

In [20]:
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/

CloudFactory to Review/
CloudFactory to Review/marvin mwangi/
CloudFactory to Review/emily nyawira waithera/
CloudFactory to Review/damaris kwamboka okenyuri/
CloudFactory to Review/caroline kioko/
CloudFactory to Review/rasoa simiyu/
CloudFactory to Review/njoki muriithi/
CloudFactory to Review/erick karanja/
CloudFactory to Review/john oduor otieno/
CloudFactory to Review/ephantus maina/
CloudFactory to Review/joshua gichuki mwangi/
CloudFactory to Review/maina lawrence irungu/
CloudFactory to Review/moses njau/
CloudFactory to Review/fiona atieno/
CloudFactory to Review/vincent kipngetich/
CloudFactory to Review/isaack odhiambo otieno/
CloudFactory to Review/benice wairimu ruga/
CloudFactory to Review/clement omunga/
Completed/
Completed/marvin mwangi/
Completed/marvin mwangi/MGRS-49QDF-0488-2020-01.kml
Completed/marvin mwangi/MGRS-49QHG-0075-2020-04.kml
Completed/marvin mwangi/MGRS-50RNN-0143-2020-02.kml
Completed/marvin mwangi/MGRS-49QEF-0289-2020-01.kml
Completed/marvin mwangi/MG