In [1]:
import os
import geopandas as gpd
import pandas as pd
import fiona
from earthai.geo import reproject_on_the_fly
import statistics
import matplotlib.pyplot as plt
from datetime import date
import re

In [2]:
# Enable KML read/write support in Fiona so the .kml files can be loaded and saved below.
# The flag is set on the `fiona` module imported at the top of the notebook instead of
# reaching through geopandas' internal `gpd.io.file.fiona` attribute, which is an
# implementation detail of geopandas and is not guaranteed to be populated.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# read in annotations
# Folders that hold the annotators' exported KML files.
ANNOTATION_DIRS = ['Completed', 'Astraea to Review']


def _load_annotation_frames(root):
    """Read every ``.kml`` file found below ``root``.

    For each file, the name of the folder that directly contains it is stored
    in the ``Processed By`` column and the file name up to its first ``.`` is
    stored in ``tile_id`` (both stripped of surrounding whitespace).

    Parameters
    ----------
    root : str
        Directory to walk recursively. A missing directory yields no files.

    Returns
    -------
    list
        One frame per KML file, in ``os.walk`` order.
    """
    frames = []
    for path, subdirs, files in os.walk(root):
        for file_name in files:
            if not file_name.endswith(".kml"):
                continue

            # Read file
            frame = gpd.read_file(os.path.join(path, file_name), driver='KML')
            # os.path keeps this independent of the platform's path separator.
            frame['Processed By'] = os.path.basename(path).strip()
            frame['tile_id'] = file_name.split(".")[0].strip()
            frames.append(frame)
    return frames


annotation_frames = [
    frame
    for root in ANNOTATION_DIRS
    for frame in _load_annotation_frames(root)
]

# Concatenate once instead of appending inside the loop: DataFrame.append copies
# the accumulated frame on every call and no longer exists in pandas 2.x.
if annotation_frames:
    df_annotations = pd.concat(annotation_frames, ignore_index=True)
else:
    # No KML files found: keep the empty-frame result the cell produced before.
    df_annotations = pd.DataFrame()

In [4]:
# read in statuses
STATUS_WORKBOOK = "Steel Plant Localization - Group One Worksheet.xlsx"
STATUS_COLUMNS = ['tile_id', 'plant_found', 'class', 'level', 'imagery_date', 'Processed By', 'status']

sets = ['Annotation Tab']

# Read each requested sheet and combine them in a single concat
# (DataFrame.append is gone in pandas 2.x and copies the frame on every call).
status_frames = [
    pd.read_excel(STATUS_WORKBOOK, sheet_name=sheet)[STATUS_COLUMNS]
    for sheet in sets
]
df_status = pd.concat(status_frames, ignore_index=True)

# Merge keys: cast to string (a missing value becomes the literal 'nan'),
# and remove any leading/trailing spaces; annotator names are also lower-cased.
df_status['tile_id'] = df_status['tile_id'].astype(str).str.strip()
df_status['Processed By'] = df_status['Processed By'].astype(str).str.lower().str.strip()

# Free-text answers: missing -> '', lower-cased, trimmed.
for column in ['status', 'plant_found']:
    df_status[column] = df_status[column].fillna('').str.lower().str.strip()

In [5]:
# Diagnostic outer join of statuses (left) and annotations (right).
# `indicator=True` adds a `_merge` column whose value is 'both', 'left_only'
# or 'right_only' for every row; the following cells break those values down
# per annotator.
merge_keys = ['tile_id', 'Processed By']
df = df_status.merge(df_annotations, how='outer', on=merge_keys, indicator=True)

In [6]:
# Rows present in both the status sheet and the annotation files, per annotator.
df.loc[df['_merge'] == 'both', 'Processed By'].value_counts()

maina lawrence irungu        105
fiona atieno                  81
john oduor otieno             63
joshua gichuki mwangi         50
rasoa simiyu                  29
moses njau                    29
isaack odhiambo otieno        23
emily nyawira waithera        22
marvin mwangi                 18
benice wairimu ruga           17
caroline kioko                16
vincent kipngetich            14
clement omunga                 9
mwangi emmanuel                8
eva irungu                     6
damaris kwamboka okenyuri      4
ephantus maina                 4
Name: Processed By, dtype: int64

In [7]:
# Annotation files that have no matching row in the status sheet, per annotator.
df.loc[df['_merge'] == 'right_only', 'Processed By'].value_counts()

eva irungu    9
Name: Processed By, dtype: int64

In [8]:
# Status rows that have no matching annotation file, per annotator.
df.loc[df['_merge'] == 'left_only', 'Processed By'].value_counts()

benice wairimu ruga          655
marvin mwangi                578
rasoa simiyu                 558
caroline kioko               471
isaack odhiambo otieno       456
john oduor otieno            450
joshua gichuki mwangi        442
maina lawrence irungu        404
vincent kipngetich           403
clement omunga               379
fiona atieno                 344
ephantus maina               269
eva irungu                   239
damaris kwamboka okenyuri    224
erick karanja                181
moses njau                   172
njoki muriithi               154
emily nyawira waithera       153
mwangi emmanuel               89
Name: Processed By, dtype: int64

In [9]:
# Frequency table of the normalised status values, with each value's share in percent.
status_tally = df_status['status'].value_counts()
df_status_count = status_tally.rename_axis('status').reset_index(name='count')
total_statuses = df_status_count['count'].sum()
df_status_count['percent'] = (df_status_count['count'] / total_statuses)*100
df_status_count

Unnamed: 0,status,count,percent
0,no issues,6489,92.35696
1,no imagery,410,5.835468
2,unclear imagery,72,1.024765
3,plant already found,39,0.555081
4,cloudy imagery,9,0.128096
5,duplicate plant,7,0.09963


In [10]:
# Attach the annotations to every status row (status rows without annotations are kept).
merge_keys = ['tile_id', 'Processed By']
df = df_status.merge(df_annotations, how='left', on=merge_keys)

# An annotator may have exported the same annotation twice: keep a single copy.
df = df.drop_duplicates()

In [11]:
# Date taken from the annotation's `Description` field; values that cannot be
# parsed become NaT because of errors='coerce'.
description_dates = pd.to_datetime(df['Description'], errors='coerce')
df['Acquisition_Date'] = description_dates
df['Acquisition_Year'] = description_dates.dt.year

# Date taken from the worksheet's `imagery_date` column, parsed the same way.
worksheet_dates = pd.to_datetime(df['imagery_date'], errors='coerce')
df['Acquisition_Date2'] = worksheet_dates
df['Acquisition_Year2'] = worksheet_dates.dt.year

In [12]:
# Frequency table of annotation acquisition years (missing years are not counted).
year_tally = df['Acquisition_Year'].value_counts()
df_year_count = year_tally.rename_axis('year').reset_index(name='count')
# Percentages are computed before the two years below are filtered out.
df_year_count['percent'] = (df_year_count['count'] / df_year_count['count'].sum())*100
df_year_count['year'] = df_year_count['year'].astype(int)
# Leave the years 2026 and 2107 out of the displayed table.
df_year_count = df_year_count[~df_year_count['year'].isin([2026, 2107])]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
5,2004,7,1.405622
11,2010,1,0.200803
10,2011,2,0.401606
9,2012,3,0.60241
8,2013,3,0.60241
4,2014,11,2.208835
7,2015,3,0.60241
6,2016,5,1.004016
3,2017,19,3.815261
2,2018,43,8.634538


### Quality Assurance

In [13]:
def check_annotation_quality(group, min_year=1900, max_year=2021):
    """Describe the quality problems found in one set of annotation rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to inspect. Must provide the columns ``geometry``,
        ``Acquisition_Date`` and ``Name``. The frame is not modified.
    min_year, max_year : int, optional
        Inclusive range of acquisition years accepted as valid. The defaults
        are the bounds this check has always used.

    Returns
    -------
    str
        Concatenation of one ``"<message>; "`` entry per problem, or ``""``
        when nothing is wrong. A missing point is reported once per group;
        date and name problems are reported once per offending row.
    """
    problems = []

    # Geometry type (Point, LineString, Polygon, ...) of every row. getattr keeps a
    # null geometry from raising, and `geom_type` avoids the deprecated `.type` alias.
    geom_types = [getattr(geom, 'geom_type', None) for geom in group.geometry]

    # check if point is missing
    if 'Point' not in geom_types:
        problems.append("Point is missing; ")

    for _, row in group.iterrows():
        # check acquisition dates
        acquired = row['Acquisition_Date']
        if pd.notnull(acquired):
            if acquired.year < min_year or acquired.year > max_year:
                problems.append("Date format is incorrect on one of annotations; ")
        else:
            problems.append("Date is missing for annotated point; ")

        # A null (non-string) name counts as missing instead of raising on .strip().
        label = row['Name']
        if not isinstance(label, str) or label.strip() == '':
            problems.append("Name is missing for annotated point; ")

    return "".join(problems)

In [14]:
# Remove previous feedback
# WARNING: `Completed/` is also one of the folders the annotation-loading cell walks
# for input .kml files, so the first command deletes those files from disk. The
# sorting cell further down writes .kml files again from the in-memory
# `df_annotations`, so only run this after the annotations were loaded in this kernel.
!rm -r Completed/*
!rm -r CloudFactory\ to\ Review/*
# Remove the archives created by the `tar` cell at the end of the notebook.
!rm Completed.tar.gz
!rm CloudFactory\ to\ Review.tar.gz

In [15]:
# check if we have all the annotations
# Inclusive range of imagery years accepted as valid.
MIN_VALID_YEAR = 1900
MAX_VALID_YEAR = 2021

# One "submission" is one distinct combination of these columns within a tile.
SUBMISSION_KEYS = ['tile_id', 'Processed By', 'status', 'plant_found', 'class', 'level',
                   'Acquisition_Date2', 'Acquisition_Year2']


def _is_blank(value):
    """Return True for a missing value or a value that is empty once stripped."""
    return pd.isnull(value) or str(value).strip() == ''


records = []

# Group on the bare column name so every key is a plain tile id
# (a one-element list makes newer pandas yield 1-tuples).
for tile_id, tile_rows in df.groupby('tile_id'):

    record = {'tile_id': tile_id}

    # dropna=False is essential: by default groupby discards rows whose key contains
    # NaN/NaT, so submissions with a missing class, level or imagery_date would get
    # no feedback and their files would never be written back.
    submissions = tile_rows.groupby(SUBMISSION_KEYS, dropna=False)
    for idx, (key, rows) in enumerate(submissions, start=1):
        _, annotator, status, plant_found, plant_class, level, imagery_date, imagery_year = key

        issues = ""
        if _is_blank(status):
            issues = "Status is missing; "
        elif _is_blank(plant_found):
            issues = "Plant_found is missing; "
        elif status == 'no issues' and plant_found == 'yes':
            # Annotations are only expected when a plant was found without issues.
            if rows['Name'].notnull().any():
                issues = check_annotation_quality(rows)
            else:
                issues = "Annotations are missing; "

        if _is_blank(plant_class):
            issues += "Class is missing; "
        if _is_blank(level):
            issues += "Level is missing; "
        if pd.isnull(imagery_date):
            issues += "imagery_date is missing or format is incorrect; "
        elif imagery_year < MIN_VALID_YEAR or imagery_year > MAX_VALID_YEAR:
            issues += "imagery_date format is incorrect; "

        record['annotator{}'.format(idx)] = annotator
        record['issues{}'.format(idx)] = issues

        # 'nan' is what a missing annotator name looks like after the astype(str) cast.
        if annotator == 'nan':
            continue

        # Create both output folders independently so neither write below can fail
        # because only one of them already exists.
        completed_dir = os.path.join('Completed', annotator)
        review_dir = os.path.join('CloudFactory to Review', annotator)
        os.makedirs(completed_dir, exist_ok=True)
        os.makedirs(review_dir, exist_ok=True)

        submitted = df_annotations[(df_annotations['Processed By'] == annotator)
                                   & (df_annotations['tile_id'] == tile_id)]
        if len(submitted) > 0:
            # Clean submissions go to Completed, anything with an issue goes back for review.
            target_dir = completed_dir if issues == '' else review_dir
            # NOTE(review): fiona.drivers() is reported as deprecated in favour of
            # fiona.Env() by newer Fiona releases - confirm against the installed version.
            with fiona.drivers():
                submitted.to_file(os.path.join(target_dir, '{}.kml'.format(tile_id)), driver='KML')

    records.append(record)

# Build the frame once (DataFrame.append is gone in pandas 2.x and copies on every call).
# The three base columns always come first; annotatorN/issuesN follow in first-seen order.
base_columns = ['tile_id', 'annotator1', 'issues1']
extra_columns = []
for record in records:
    for column in record:
        if column not in base_columns and column not in extra_columns:
            extra_columns.append(column)

results_df = pd.DataFrame(records, columns=base_columns + extra_columns)

In [16]:
def check_overall(row):
    """Return 1 when the row's ``issues1`` is the empty string, otherwise 0.

    Only ``issues1`` is examined; any value other than ``''`` (including a
    missing value) counts as an issue.
    """
    has_issue = row.issues1 != ''
    return 0 if has_issue else 1

# Score every feedback row with check_overall and store the result as `num_completed`.
results_df = results_df.assign(num_completed=results_df.apply(check_overall, axis=1))

In [17]:
!pip install openpyxl



In [18]:
# Save the per-tile feedback table; the row index is not written.
feedback_csv = 'feedback.csv'
results_df.to_csv(feedback_csv, index=False)

In [19]:
# Disabled cell, kept for reference: it would open the worksheet with openpyxl and
# write `results_df` into a 'Feedback' sheet of that same workbook.
# NOTE(review): the pattern below assigns `writer.book` / `writer.sheets` and calls
# `writer.save()`; confirm the installed pandas still supports these before re-enabling.
# import openpyxl as pxl

# excel_book = pxl.load_workbook("Steel Plant Localization - Group One Worksheet.xlsx")
# with pd.ExcelWriter("Steel Plant Localization - Group One Worksheet.xlsx", engine='openpyxl') as writer:
#     writer.book = excel_book
#     writer.sheets = {
#         worksheet.title: worksheet
#         for worksheet in excel_book.worksheets
#     }
#     results_df.to_excel(writer, 'Feedback', index=False)
#     writer.save()

### TODO: Go back and fix above labels.

In [20]:
# Package the two output folders as gzip-compressed tar archives
# (-c create, -z gzip, -v list each archived path, -f archive file name).
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/

CloudFactory to Review/
CloudFactory to Review/marvin mwangi/
CloudFactory to Review/emily nyawira waithera/
CloudFactory to Review/damaris kwamboka okenyuri/
CloudFactory to Review/caroline kioko/
CloudFactory to Review/rasoa simiyu/
CloudFactory to Review/eva irungu/
CloudFactory to Review/njoki muriithi/
CloudFactory to Review/erick karanja/
CloudFactory to Review/john oduor otieno/
CloudFactory to Review/ephantus maina/
CloudFactory to Review/joshua gichuki mwangi/
CloudFactory to Review/maina lawrence irungu/
CloudFactory to Review/moses njau/
CloudFactory to Review/fiona atieno/
CloudFactory to Review/vincent kipngetich/
CloudFactory to Review/isaack odhiambo otieno/
CloudFactory to Review/benice wairimu ruga/
CloudFactory to Review/mwangi emmanuel/
CloudFactory to Review/clement omunga/
Completed/
Completed/marvin mwangi/
Completed/marvin mwangi/MGRS-50TQL-0266-2020-01.kml
Completed/marvin mwangi/MGRS-48RUS-0059-2020-01.kml
Completed/marvin mwangi/MGRS-49TCF-0402-2020-01.kml
Com