# Consolidate Sentinel-2 Model Deployment Scores into a Single GeoJSON File

## Import libraries

In [None]:
import os
import glob

import geopandas as gpd
import pandas as pd

import boto3

from IPython.display import clear_output

## Inputs and parameters

In [None]:
# Key prefix inside the S3 bucket; it is prepended to every object name that
# is downloaded or uploaded.
s3_path = 'S2-RGB-macro-localization-model-deployment4/'

# Names of the score archives to fetch. Each name is appended to `s3_path`
# to form the S3 key and to `LOCAL_DIR` to form the local path.
score_tar_files = [
    'S2-deployment-chip-scores-CHN-10km-nowater-2020-set1.tar',
    'S2-deployment-chip-scores-CHN-10km-nowater-2020-set2.tar',
    'S2-deployment-chip-scores-CHN-10km-nowater-2020-set3.tar',
    'S2-deployment-chip-scores-CHN-10km-nowater-2020-set4.tar',
]

# Local working directory. The trailing slash is required because paths are
# built by plain string concatenation.
LOCAL_DIR = '/scratch/'

In [None]:
# S3 resource and the bucket handle used for every download and upload in
# this notebook. No credentials are passed here, so boto3 has to locate them
# through its default configuration.
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

## Outputs

In [None]:
# File name of the consolidated output. It is used both as the local
# (relative) path when writing and, prefixed with `s3_path`, as the S3 key
# when uploading.
consolidated_score_gjson = 'S2-deployment-chip-scores-CHN-10km-nowater-2020-consolidated.geojson'

## Download deployment results and untar

In [None]:
for f in score_tar_files:
    bucket.download_file(s3_path+f,
                         LOCAL_DIR+f)
    
    !tar -xf {LOCAL_DIR+f} -C {LOCAL_DIR}

## Get list of GeoJSON files

In [None]:
# Build one flat list of GeoJSON paths across all archives. The glob pattern
# swaps the ".tar" suffix for "/*.geojson", i.e. it looks inside a directory
# under LOCAL_DIR named after the archive.
# NOTE(review): assumes each tarball extracts into such a directory -- confirm.
gjson_file_list = []
for tar_name in score_tar_files:
    pattern = LOCAL_DIR + tar_name.replace('.tar', '/*.geojson')
    gjson_file_list.extend(glob.glob(pattern))

In [None]:
# Report how many GeoJSON score files were found across all archives.
print(f'Number of GeoJSON files: {len(gjson_file_list)}')

## Merge data into single GeoDataFrame

In [None]:
# Load the first GeoJSON score file into a GeoDataFrame. Indexing with [0]
# raises IndexError when no GeoJSON files were found.
full_scene_gdf = gpd.read_file(gjson_file_list[0])

In [None]:
# Merge all score files into one GeoDataFrame.
#
# Every file in `gjson_file_list` is read exactly once here and the frames are
# concatenated in a single call. The result therefore does not depend on what
# `full_scene_gdf` held before this cell ran, no file's rows are counted
# twice, and re-running the cell does not add rows. A single concat also
# avoids re-copying the accumulated frame on every iteration.
n_files = len(gjson_file_list)
score_gdfs = []
for i, gjson_file in enumerate(gjson_file_list, start=1):
    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print('Merging score file ', str(i), ' out of ', str(n_files))

    score_gdfs.append(gpd.read_file(gjson_file))

# pd.concat raises ValueError if no frames were collected.
full_scene_gdf = pd.concat(score_gdfs, ignore_index=True, sort=False)

## Remove duplicate chips

* Sort by `s2_eo_cloud_cover` (ascending)
* Drop duplicates by `geometry`, `cmtv4p1_uid`, `stlv4p1_uid`
* Keep the first record

This removes duplicates by choosing the chip with the lowest scene cloud coverage, but retains duplicated chips if more than one plant intersects with the chip.

In [None]:
# Rows are treated as duplicates only when all three of these columns match,
# so a geometry that appears with different uid values is kept once per
# uid combination.
dedup_keys = ['geometry', 'cmtv4p1_uid', 'stlv4p1_uid']

# Sort by cloud cover first so that keep='first' retains, within each group
# of duplicates, the row that sorts earliest on `s2_eo_cloud_cover`.
unq_chip_gdf = (
    full_scene_gdf
    .sort_values('s2_eo_cloud_cover')
    .drop_duplicates(subset=dedup_keys, keep='first', ignore_index=True)
)

In [None]:
# Row count of the merged frame, before duplicate chips are removed.
print(len(full_scene_gdf))

In [None]:
# Row count after duplicate chips have been removed.
print(len(unq_chip_gdf))

## Write consolidated scores to GeoJSON and upload to S3

In [None]:
# Write the de-duplicated chips to a GeoJSON file. The path is relative, so
# the file is created in the current working directory.
unq_chip_gdf.to_file(consolidated_score_gjson, driver='GeoJSON')

In [None]:
# Upload the local GeoJSON file to the bucket; the object key is the file
# name prefixed with `s3_path`.
bucket.upload_file(consolidated_score_gjson,
                   s3_path+consolidated_score_gjson)

In [None]:
# Delete the local copy of the consolidated GeoJSON file.
os.remove(consolidated_score_gjson)