In [1]:
import os
import re
import shutil
import statistics
from datetime import date

import fiona
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from earthai.geo import reproject_on_the_fly

In [2]:
# Enable fiona's KML driver for both reading and writing ('rw').
# The registry is reached through the `fiona` module imported by this notebook
# rather than through `gpd.io.file.fiona`, which is a private geopandas handle
# and can be None on recent geopandas releases until the first file is read.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [3]:
# Read every KML annotation file found in the "annotations*" subdirectories.
# The annotator name is derived from the file name: either the text before the
# first "-", or the leading run of letters/spaces that precedes a number.
annotation_frames = []

for subdirectory in os.listdir("."):
    # Skip plain files whose name merely starts with "annotations"
    # (os.listdir on them would raise NotADirectoryError).
    if not (subdirectory.startswith("annotations") and os.path.isdir(subdirectory)):
        continue

    for filename in os.listdir(subdirectory):
        if not filename.endswith(".kml"):
            continue

        # Read file
        tmp = gpd.read_file(os.path.join(subdirectory, filename), driver='KML')

        # Always create the column so later steps never hit a missing key,
        # even when the file name matches neither naming pattern.
        annotator = None
        if "-" in filename:
            annotator = filename.split("-")[0].strip().lower()
        else:
            match = re.match(r"([a-zA-Z\s]+)([0-9]+)", filename, re.I)
            if match:
                annotator = match.groups()[0].strip().lower()
        tmp['Processed By'] = annotator
        tmp['file_path'] = filename

        annotation_frames.append(tmp)

if not annotation_frames:
    raise FileNotFoundError("No .kml files found in any 'annotations*' subdirectory")

# Concatenate once instead of appending inside the loop: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every call.
df_annotations = pd.concat(annotation_frames, ignore_index=True)

# The uid is the part of the feature name before the first "-",
# with any leading/trailing spaces removed.
df_annotations['uid'] = df_annotations['Name'].str.split("-", n=1).str[0].str.strip()
df_annotations['Processed By'] = df_annotations['Processed By'].str.strip()

In [4]:
# Read the per-annotator status sheets and stack them into one frame.
STATUS_COLUMNS = ['uid', 'latitude', 'longitude', 'Date Processed', 'Processed By', 'status', 'notes']
sets = ['Annotator 1', 'Annotator 2', 'Annotator 3']

# Concatenate once: DataFrame.append was removed in pandas 2.0 and copies the
# accumulated frame on every call.
df_status = pd.concat(
    [
        pd.read_excel("cement_dataset_v4.1.xlsx", sheet_name=sheet)[STATUS_COLUMNS]
        for sheet in sets
    ],
    ignore_index=True,
)

# Normalise the join keys: string type, lower-cased annotator, no surrounding
# spaces. astype(str) turns a missing annotator into the literal string 'nan',
# which the export loop later compares against.
df_status['Processed By'] = df_status['Processed By'].astype(str).str.lower().str.strip()
df_status['uid'] = df_status['uid'].astype(str).str.strip()

# Fix status: a missing status becomes '' and everything is lower-cased.
df_status['status'] = df_status['status'].fillna('').str.lower()

In [5]:
# Check whether the merge worked: outer-join statuses and annotations on
# (uid, annotator). The `_merge` indicator column records whether each row was
# found on both sides ('both') or only on one ('left_only' / 'right_only').
df = df_status.merge(
    df_annotations,
    how='outer',
    on=['uid', 'Processed By'],
    indicator=True,
)

In [6]:
# Rows matched on both sides of the merge, counted per annotator
df.loc[df['_merge'] == 'both', 'Processed By'].value_counts()

marvin mwangi                2144
maina lawrence irungu        1781
benice wairimu ruga          1654
fiona atieno                 1650
vincent kipngetich           1357
regina wanjala nyambu        1333
caroline kioko               1332
john oduor otieno            1207
njoki muriithi               1167
emily nyawira waithera        836
rasoa simiyu                  774
clement omunga                763
damaris kwamboka okenyuri     688
moses njau                    673
ephantus maina                589
erick karanja                 567
solomon gitahi gachure        309
Name: Processed By, dtype: int64

In [7]:
# Annotations with no matching status row, counted per annotator
df.loc[df['_merge'] == 'right_only', 'Processed By'].value_counts()

joshua mwangi                1550
isaack odhiambo               936
isaack odhiambo feb           930
maina lawrence iungu          350
caroline kioko                180
solomon gachure               142
damaris kwamboka okenyuri      50
benice wairimu ruga            28
maina lawrence irungu          20
fiona atieno                   15
ephantus maina                 12
emily nyawira waithera          9
marvin mwangi                   9
john oduor otieno               9
solomon gitahi gachure          8
vincent kipngetich              3
clement omunga                  2
rasoa simiyu                    2
erick karanja                   1
regina wanjala nyambu           1
Name: Processed By, dtype: int64

In [8]:
# Status rows with no matching annotation, counted per annotator
df.loc[df['_merge'] == 'left_only', 'Processed By'].value_counts()

nan                          2252
isaack odhiambo otieno        586
joshua gichuki mwangi         343
maina lawrence irungu         103
john oduor otieno             100
solomon gitahi gachure         61
rasoa simiyu                   56
marvin mwangi                  44
vincent kipngetich             40
ephantus maina                 19
njoki muriithi                 18
clement omunga                 18
caroline kioko                 14
fiona atieno                   12
damaris kwamboka okenyuri       8
moses njau                      6
regina wanjala nyambu           3
benice wairimu ruga             3
erick karanja                   1
emily nyawira waithera          1
Name: Processed By, dtype: int64

In [9]:
# Number of status rows per status value, plus each value's share of all rows
df_status_count = (
    df_status['status']
    .value_counts()
    .rename_axis('status')
    .reset_index(name='count')
)
status_total = df_status_count['count'].sum()
df_status_count['percent'] = (df_status_count['count'] / status_total) * 100
df_status_count

Unnamed: 0,status,count,percent
0,no issues,5292,62.753469
1,,2447,29.016957
2,kiln under a cover,370,4.387525
3,plant under construction,114,1.351832
4,kiln is blocked,88,1.04352
5,unclear imagery,55,0.6522
6,plant not found,33,0.39132
7,duplicate plant,23,0.272738
8,cloudy imagery,5,0.059291
9,no imagery,4,0.047433


In [10]:
# Attach annotations to every status row (status rows without annotations are
# kept), then drop exact duplicate rows in case an annotator accidentally
# exported the same annotation twice.
df = (
    df_status
    .merge(df_annotations, how='left', on=['uid', 'Processed By'])
    .drop_duplicates()
)

In [11]:
# Parse the KML Description field as a date; values that cannot be parsed
# become NaT (errors='coerce'). The year is then taken from the parsed date.
df = df.assign(Acquisition_Date=pd.to_datetime(df['Description'], errors='coerce'))
df = df.assign(Acquisition_Year=df['Acquisition_Date'].dt.year)

In [12]:
# Years excluded from the table below.
# NOTE(review): presumably typos in the source dates - confirm.
EXCLUDED_YEARS = [2026, 2107]

# Number of rows per acquisition year; the percentage is computed over all
# parsed years, before the excluded years are filtered out.
df_year_count = (
    df['Acquisition_Year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
year_total = df_year_count['count'].sum()
df_year_count['percent'] = (df_year_count['count'] / year_total) * 100
df_year_count['year'] = df_year_count['year'].astype(int)
df_year_count = df_year_count[~df_year_count['year'].isin(EXCLUDED_YEARS)]
df_year_count.sort_values('year')

Unnamed: 0,year,count,percent
20,2000,6,0.032282
21,2001,2,0.010761
16,2002,19,0.102227
18,2003,9,0.048424
13,2004,66,0.355106
19,2005,7,0.037663
15,2006,23,0.123749
17,2007,17,0.091467
14,2008,27,0.145271
12,2009,119,0.640267


### Quality Assurance

In [13]:
def check_annotation_quality(group):
    """Describe the annotation problems found in one group of features.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows exposing 'uid', 'Processed By' and 'Name' columns and a
        ``geometry`` column whose values carry a ``geom_type`` attribute
        ('Polygon', 'LineString', ...). The frame is not modified.

    Returns
    -------
    str
        The concatenated issue messages, each terminated by "; ", or an empty
        string when no issue was found. The checks are:
        * exactly one polygon must be present;
        * every kiln (LineStrings sharing a Name) must have exactly two line
          measurements.
    """
    # Keep the geometry types in a separate Series instead of adding a column
    # to the caller's frame. `geom_type` is used because the `.type` alias is
    # deprecated in Shapely 2.0; getattr tolerates null geometries.
    geom_types = group.geometry.apply(lambda geom: getattr(geom, 'geom_type', None))

    issues = []

    # check polygon if missing, or if multiple polygons were drawn
    polygon_count = int((geom_types == 'Polygon').sum())
    if polygon_count == 0:
        issues.append("Polygon is missing; ")
    elif polygon_count > 1:
        issues.append("Multiple polygons annotated; ")

    # check number of linestrings for each kiln
    line_rows = group[geom_types == 'LineString']
    for keys, kiln_lines in line_rows.groupby(['uid', 'Processed By', 'Name']):
        kiln_name = keys[2]
        if len(kiln_lines) < 2:
            # missing measurement
            issues.append("Length or width of kiln {} is missing; ".format(kiln_name))
        elif len(kiln_lines) > 2:
            # too many measurements
            issues.append("More than 2 line measurements for kiln {}; ".format(kiln_name))

    return "".join(issues)

In [None]:
# Remove previous feedback and start from empty output directories.
# Done in Python rather than with `!rm` so the cell also succeeds on a fresh
# checkout (nothing to delete yet) and leaves both directories in place for
# the export loop that writes into them.
for feedback_dir in ('Completed', 'CloudFactory to Review'):
    shutil.rmtree(feedback_dir, ignore_errors=True)
    os.makedirs(feedback_dir, exist_ok=True)

    archive_path = feedback_dir + '.tar.gz'
    if os.path.exists(archive_path):
        os.remove(archive_path)

In [15]:
# Check if we have all the annotations.
# For every plant (uid) and every (annotator, status) pair recorded for it:
#   * derive an issue string ('' means nothing to fix),
#   * export the annotator's KML features for that plant to
#     'Completed/<annotator>/' when there is no issue, otherwise to
#     'CloudFactory to Review/<annotator>/',
#   * collect one feedback row per uid with annotatorN / issuesN columns.
RESULT_COLUMNS = ['uid', 'annotator1', 'issues1', 'annotator2', 'issues2', 'annotator3', 'issues3']
FEEDBACK_DIRS = ('Completed', 'CloudFactory to Review')

feedback_rows = []

# Group by the bare column name: grouping by the one-element list ['uid']
# yields 1-tuples as keys on pandas 2.x, which would break the uid comparison
# and the output file name below.
for uid, uid_rows in df.groupby('uid'):
    row = {'uid': uid}

    per_annotator = uid_rows.groupby(['uid', 'Processed By', 'status'])
    for idx, ((_, annotator, status), annotator_rows) in enumerate(per_annotator, start=1):
        if status.strip() == '':
            issues = "Status is missing"
        elif status == 'no issues':
            if not annotator_rows['Name'].notnull().any():
                issues = "Annotations are missing"
            else:
                issues = check_annotation_quality(annotator_rows)
        elif status == 'duplicate plant':
            issues = "ignore duplicate"
        else:
            issues = ''

        row['annotator{}'.format(idx)] = annotator
        row['issues{}'.format(idx)] = issues

        # 'nan' is the string form of a missing annotator; nothing to export.
        if annotator == 'nan':
            continue

        # makedirs(exist_ok=True) creates each directory independently and
        # also creates the parent when it does not exist yet.
        for feedback_dir in FEEDBACK_DIRS:
            os.makedirs(os.path.join(feedback_dir, annotator), exist_ok=True)

        annotator_features = df_annotations[
            (df_annotations['Processed By'] == annotator) & (df_annotations['uid'] == uid)
        ]
        if len(annotator_features) > 0:
            target_dir = 'Completed' if issues == '' else 'CloudFactory to Review'
            # fiona.Env() replaces the deprecated fiona.drivers() context.
            with fiona.Env():
                annotator_features.to_file(
                    '{}/{}/{}.kml'.format(target_dir, annotator, uid), driver='KML'
                )

    feedback_rows.append(row)

# Build the frame once (DataFrame.append was removed in pandas 2.0), keeping
# the standard columns first and any additional annotatorN/issuesN after them.
results_df = pd.DataFrame(feedback_rows)
extra_columns = [col for col in results_df.columns if col not in RESULT_COLUMNS]
results_df = results_df.reindex(columns=RESULT_COLUMNS + extra_columns)

In [16]:
def check_overall(row):
    """Return how many of the three annotators have no recorded issue.

    `row` exposes `issues1`, `issues2` and `issues3`; any value other than the
    empty string counts as an issue, so the result is 3 minus the number of
    non-empty issue fields.
    """
    issue_fields = (row.issues1, row.issues2, row.issues3)
    flagged = sum(1 for issue in issue_fields if issue != '')
    return 3 - flagged

# Row-wise count of annotators without issues for each uid
results_df['num_completed'] = results_df.apply(check_overall, axis='columns')

In [19]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.6-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 17.4 MB/s eta 0:00:01
[?25hCollecting jdcal
  Downloading jdcal-1.4.1-py2.py3-none-any.whl (9.5 kB)
Collecting et-xmlfile
  Downloading et_xmlfile-1.0.1.tar.gz (8.4 kB)
Building wheels for collected packages: et-xmlfile
  Building wheel for et-xmlfile (setup.py) ... [?25ldone
[?25h  Created wheel for et-xmlfile: filename=et_xmlfile-1.0.1-py3-none-any.whl size=8917 sha256=ab546a6ba97600379f723f539d49b45f649e97a9a21a23ed2b615d4b8f54a60f
  Stored in directory: /home/jovyan/.cache/pip/wheels/e2/bd/55/048b4fd505716c4c298f42ee02dffd9496bb6d212b266c7f31
Successfully built et-xmlfile
Installing collected packages: jdcal, et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 jdcal-1.4.1 openpyxl-3.0.6


In [21]:
import openpyxl as pxl  # no longer used in this cell; kept in case other cells reference `pxl`

# Write results_df to the 'Feedback' sheet of the existing workbook.
# Append mode lets pandas load the workbook itself, so the file is never
# truncated before its other sheets are written back, and an existing
# 'Feedback' sheet is replaced rather than partially overwritten.
# The context manager saves on exit, so no explicit writer.save() is needed
# (that method and assignment to writer.book were removed in pandas 2.0).
# Requires pandas >= 1.3 for `if_sheet_exists`.
with pd.ExcelWriter(
    "cement_dataset_v4.1.xlsx",
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    results_df.to_excel(writer, sheet_name='Feedback', index=False)

### TODO: Go back and fix above labels.

In [22]:
!tar -czvf "CloudFactory to Review.tar.gz" "CloudFactory to Review/"
!tar -czvf Completed.tar.gz Completed/

CloudFactory to Review/
CloudFactory to Review/marvin mwangi/
CloudFactory to Review/marvin mwangi/CHN0023.kml
CloudFactory to Review/marvin mwangi/JPN0006.kml
CloudFactory to Review/marvin mwangi/CZE0004.kml
CloudFactory to Review/marvin mwangi/USA0067.kml
CloudFactory to Review/marvin mwangi/BGD0018.kml
CloudFactory to Review/marvin mwangi/CHN0076.kml
CloudFactory to Review/marvin mwangi/VEN0009.kml
CloudFactory to Review/marvin mwangi/GBR0007.kml
CloudFactory to Review/marvin mwangi/CHN0898.kml
CloudFactory to Review/marvin mwangi/TKM0006.kml
CloudFactory to Review/marvin mwangi/CHN0027.kml
CloudFactory to Review/marvin mwangi/USA0072.kml
CloudFactory to Review/marvin mwangi/VNM0090.kml
CloudFactory to Review/marvin mwangi/USA0100.kml
CloudFactory to Review/marvin mwangi/USA0001.kml
CloudFactory to Review/marvin mwangi/VEN0007.kml
CloudFactory to Review/marvin mwangi/CHN0025.kml
CloudFactory to Review/marvin mwangi/CHN0052.kml
CloudFactory to Review/marvin mwangi/USA0070.kml
CloudFa