# Defines Landsat 8 Deployment Chip Centers for 10km Grid

This notebook creates a GeoJSON file defining chip centers at which to deploy the Landsat 8 TIR macro-localization model.

Landsat 8 scenes with the same grid id taken on different dates do not map to exactly the same projected extents, but identical extents are required when combining these images into the 3-band dataset for deployment. This code therefore defines a per-scene grid of tile centroids that we can use to create chips of the desired size, centered at the same lat/long.

* Uses deployment regions defined from 10km Grid output from previous step
* Uses Landsat scenes from catalog defined in previous step to define a scene-aligned grid
* Outputs centers of tiles as deployment chip centroids

## Import required libraries

In [None]:
from earthai.all import *
import earthai.chipping.strategy as chp
import pyspark.sql.functions as F
import geopandas as gpd
import pandas as pd
import os
import boto3
import shutil
import glob
from shapely import wkt

## Define input and output files and parameters

### Parameters

* `chip_size` is the size of chips (length) to create (in pixels)
* `pred_thresh` is the prediction threshold for selecting deployment grid cells

In [None]:
# Chip edge length in pixels (1.05 km for Landsat 8)
chip_size = 35
# Prediction threshold for selecting deployment grid cells
pred_thresh = 0.002

### Input files

* `deployment_gjson` is the GeoJSON file of the deployment region (output of the previous step)
* `catalog_csv` is a csv file of the catalog returned from EOD

In [None]:
# Both input file names embed the prediction threshold used upstream
deployment_gjson = f'../../resources/macro-loc-model-deployment/L8-deployment-region-CHN-10km-pthsh{pred_thresh}.geojson'
catalog_csv = f'../../resources/macro-loc-model-deployment/L8-deployment-catalog-CHN-10km-pthsh{pred_thresh}.csv'

### Output files and paths

* `output_path` defines directory to write data to
* `chip_extents_gjson_prefix` is output prefix for GeoJSON files of chip extents
* `chip_centroids_gjson_prefix` is an output prefix for GeoJSON files with centroids of chip extents
* `s3_path` is the key prefix (folder) within the S3 bucket under which the tar files are uploaded

In [None]:
# Directory that receives all per-scene GeoJSON output
output_path = '../../resources/macro-loc-model-deployment/l8-chip-centers/'

# File-name prefixes; the scene id and '.geojson' are appended when writing
chip_extents_gjson_prefix = f'L8-deployment-chip-extents-CHN-10km-pthsh{pred_thresh}_'
chip_centroids_gjson_prefix = f'L8-deployment-chip-centroids-CHN-10km-pthsh{pred_thresh}_'

# Prefix prepended to the uploaded tar file names to form their S3 keys
s3_path = 'L8-TIR-macro-localization-model-deployment'

## Load in and join deployment region to Landsat 8 catalog

### Load in deployment region

In [None]:
# Read the deployment-region GeoJSON into a GeoDataFrame
macro_deployment_gdf = gpd.read_file(deployment_gjson)

### Load in Landsat 8 catalog

In [None]:
# Read the catalog CSV; its geometry columns are parsed from WKT text below
site_cat_pdf = pd.read_csv(catalog_csv, index_col=False)

# The 'geometry' column becomes the active geometry of the GeoDataFrame
footprint_geoms = site_cat_pdf.geometry.apply(wkt.loads)
site_cat_gdf = gpd.GeoDataFrame(site_cat_pdf, geometry=footprint_geoms, crs='EPSG:4326')

# Convert the remaining WKT columns to shapely geometries as well
for wkt_col in ('eod_epsg4326_geometry_simplified', 'proj_geometry'):
    site_cat_gdf[wkt_col] = site_cat_gdf[wkt_col].apply(wkt.loads)

### Join

In [None]:
# Spatially join deployment regions (left) to catalog entries (right);
# 'inner' is the sjoin default, spelled out here. The name site_cat_gdf is
# rebound to the joined result.
site_cat_gdf = gpd.sjoin(macro_deployment_gdf, site_cat_gdf, how='inner')

In [None]:
# Summary counts for the joined catalog
reg_cnt = site_cat_gdf['index'].nunique()              # distinct 'index' values
l8_scene_cnt = site_cat_gdf['eod_grid_id'].nunique()   # distinct scene grid ids
cat_cnt = len(site_cat_gdf)                            # joined rows

for label, count in (
    ('Number of Geometries in deployment region: ', reg_cnt),
    ('Number of Landsat 8 scenes in deployment regions: ', l8_scene_cnt),
    ('Number of catalog entries: ', cat_cnt),
):
    print(label, count)

## Initialize Spark

Set the number of partitions to be proportional to catalog size.

In [None]:
# Scale the partition count with catalog size (one partition per ~4 catalog
# rows), but never below 1: round() yields 0 for a catalog of two rows or
# fewer, and a zero partition count is not a usable Spark setting.
partitions = max(1, round(len(site_cat_gdf) / 4))
spark = create_earthai_spark_session(**{
    "spark.default.parallelism": partitions,
    "spark.sql.shuffle.partitions": partitions,
})

## Read and create image chips for 10km grid

* Uses chip reader to create uniform, same-sized chips covering the deployment region w/ SceneAlignedGrid
* Filter out blank chips at edge of scenes
* Handle rare edge case where the returned chip is smaller than the specified size (when reaching the edge of a scene)
* Compute tile extents in EPSG:4326
* Find distinct Landsat-8 grid + tile rows

Loops over the scenes and creates separate GeoJSON files for each scene. This avoids building very large vector files and is more resilient.

In [None]:
# When False, scenes whose centroid file already exists are skipped
overwrite = False

# Scene grid ids in sorted order; the chip loop indexes into this array
l8_unq_scenes = site_cat_gdf.sort_values('eod_grid_id').eod_grid_id.unique()

# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when it is already there, replacing the check-then-mkdir pair.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Process one scene at a time; each scene gets its own pair of GeoJSON files
# (chip extents and chip centroids).
for i in range(l8_scene_cnt):

    scene_id = l8_unq_scenes[i]
    output_gjson = output_path + chip_centroids_gjson_prefix + scene_id + '.geojson'

    # An existing centroid file means the scene is done, unless overwriting
    if os.path.isfile(output_gjson) and not overwrite:
        print('Skipping scene ', scene_id, ' (file exists)')
        continue

    # Catalog rows belonging to this scene
    site_cat_i = site_cat_gdf[site_cat_gdf.eod_grid_id == scene_id]

    # Read BQA chips on the scene-aligned grid
    scene_grid = chp.SceneAlignedGrid(chip_size, chip_size)
    site_chip_unq = spark.read.chip(site_cat_i, ['BQA'], chipping_strategy=scene_grid)
    site_chip_unq = site_chip_unq.select('index', 'eod_grid_id', 'BQA')

    # Keep only full-size chips: data-cell count must equal chip_size squared
    site_chip_unq = site_chip_unq.withColumn('tot_cell_count', rf_data_cells('BQA'))
    site_chip_unq = site_chip_unq.filter(F.col('tot_cell_count') == chip_size * chip_size)

    # Keep only chips whose minimum BQA value exceeds 1
    # (per the notebook notes, this removes blank chips at scene edges)
    site_chip_unq = site_chip_unq.withColumn('BQA_min', rf_tile_min('BQA'))
    site_chip_unq = site_chip_unq.filter(F.col('BQA_min') > 1.0)

    # Chip extent as a geometry reprojected from the tile CRS to EPSG:4326
    extent_4326 = st_reproject(st_geometry(rf_extent('BQA')), rf_crs('BQA'), lit('EPSG:4326'))
    site_chip_unq = site_chip_unq.withColumn('tile_extent_4326', extent_4326)

    # Drop the raster and helper columns, then de-duplicate the remaining rows
    site_chip_unq = site_chip_unq.drop('BQA', 'tot_cell_count', 'BQA_min').distinct()

    # Collect the result to the driver as a pandas data frame
    site_chip_pdf = site_chip_unq.toPandas()
    chp_cnt = len(site_chip_pdf)

    # Nothing to write if the scene produced no chips
    if chp_cnt == 0:
        print('Skipping scene ', scene_id, ' (no coverage)')
        continue

    # Chip ids: scene id plus a 1-based, zero-padded sequence number
    tile_id = [scene_id + '-' + str(seq).zfill(5) for seq in range(1, chp_cnt + 1)]

    # Write the chip extents to GeoJSON
    tile_geom_gdf = gpd.GeoDataFrame(
        {
            'region_id': site_chip_pdf['index'],
            'scene_id': site_chip_pdf.eod_grid_id,
            'tile_id': tile_id,
            'tile_extent': site_chip_pdf.tile_extent_4326,
        },
        geometry='tile_extent',
        crs='EPSG:4326',
    )
    tile_geom_gdf.to_file(output_path + chip_extents_gjson_prefix + scene_id + '.geojson',
                          driver='GeoJSON')

    # Add the centroid column to the same frame (the extents file is already
    # written), switch the geometry to it, and write the centroids to GeoJSON
    tile_geom_gdf['tile_cntr'] = tile_geom_gdf.geometry.centroid
    tile_centroid_gdf = tile_geom_gdf.set_geometry('tile_cntr').drop(columns='tile_extent')
    tile_centroid_gdf.to_file(output_gjson, driver='GeoJSON')

    print('Done creating ', chp_cnt, ' chip centroids for scene ', scene_id,
          ' (', i+1, ' out of ', l8_scene_cnt, ')')

## Tar up files and upload to S3

### Chip centroids

In [None]:
# Centroid GeoJSON files sitting directly in the output directory
chip_cntr_files = glob.glob(f'{output_path}*centroids*.geojson')
# Subdirectory named after the centroid prefix, up to its first underscore
chip_cntr_subdir = output_path + chip_centroids_gjson_prefix.partition('_')[0]

In [None]:
# Gather the per-scene centroid files into their own subdirectory.
# makedirs(exist_ok=True) replaces the check-then-mkdir pair.
os.makedirs(chip_cntr_subdir, exist_ok=True)
for f in chip_cntr_files:
    # Give the full destination path: moving into a directory raises when a
    # file of the same name is already there (e.g. after re-running with
    # overwrite=True), whereas an explicit file destination lets the new file
    # replace the stale copy (os.rename semantics on POSIX).
    shutil.move(f, os.path.join(chip_cntr_subdir, os.path.basename(f)))

In [None]:
# Shell command that tars the centroid subdirectory; -C makes tar resolve the
# subdirectory name relative to output_path
unix_code = ' '.join([
    'tar',
    '-C', output_path,
    '-cvf', chip_cntr_subdir.split('/')[-1] + '.tar',
    chip_cntr_subdir.split('/')[-1],
])

In [None]:
# Run the tar command and stop on failure, so that a missing or partial
# archive is never uploaded and the source files are not cleaned up.
tar_status = os.system(unix_code)
if tar_status != 0:
    raise RuntimeError(f'tar command failed with exit status {tar_status}: {unix_code}')

In [None]:
# S3 handle and target bucket (the bucket is reused for the extents upload)
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

# Local file: the bare tar name (a path relative to the working directory).
# S3 key: the same name prefixed by s3_path.
bucket.upload_file(
    '{}.tar'.format(chip_cntr_subdir.split('/')[-1]),
    '{}/{}.tar'.format(s3_path, chip_cntr_subdir.split('/')[-1]),
)

### Chip extents

In [None]:
# Extent GeoJSON files sitting directly in the output directory
chip_ext_files = glob.glob(f'{output_path}*extents*.geojson')
# Subdirectory named after the extents prefix, up to its first underscore
chip_ext_subdir = output_path + chip_extents_gjson_prefix.partition('_')[0]

In [None]:
# Gather the per-scene extent files into their own subdirectory.
# makedirs(exist_ok=True) replaces the check-then-mkdir pair.
os.makedirs(chip_ext_subdir, exist_ok=True)
for f in chip_ext_files:
    # Give the full destination path: moving into a directory raises when a
    # file of the same name is already there (e.g. after re-running with
    # overwrite=True), whereas an explicit file destination lets the new file
    # replace the stale copy (os.rename semantics on POSIX).
    shutil.move(f, os.path.join(chip_ext_subdir, os.path.basename(f)))

In [None]:
# Shell command that tars the extents subdirectory; -C makes tar resolve the
# subdirectory name relative to output_path
unix_code = ' '.join([
    'tar',
    '-C', output_path,
    '-cvf', chip_ext_subdir.split('/')[-1] + '.tar',
    chip_ext_subdir.split('/')[-1],
])

In [None]:
# Run the tar command and stop on failure, so that a missing or partial
# archive is never uploaded and the source files are not cleaned up.
tar_status = os.system(unix_code)
if tar_status != 0:
    raise RuntimeError(f'tar command failed with exit status {tar_status}: {unix_code}')

In [None]:
# Local file: the bare tar name (a path relative to the working directory).
# S3 key: the same name prefixed by s3_path.
bucket.upload_file(
    '{}.tar'.format(chip_ext_subdir.split('/')[-1]),
    '{}/{}.tar'.format(s3_path, chip_ext_subdir.split('/')[-1]),
)

## Clean up large files on local

In [None]:
# Delete the whole local output directory, including the centroid and extent
# subdirectories created under it. Run only after the uploads have succeeded.
shutil.rmtree(output_path)

In [None]:
# Delete the two local tar files (addressed by bare name, as when uploading)
for tarred_subdir in (chip_cntr_subdir, chip_ext_subdir):
    os.remove(tarred_subdir.split('/')[-1] + '.tar')