# Creation of RGB Sentinel-2 Chips for Landcover

This notebook creates the Sentinel-2 RGB image chips for landcover from the EarthAI catalog.

* Sentinel-2, red, green, and blue bands
* Chips are 3-km on a side
* Most recent cloud-free data

In [None]:
# Import required packages
from earthai.init import *
import earthai.chipping.strategy

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

from pyrasterframes.rf_types import TileUDT
from pyspark.sql.functions import udf

import geopandas as gpd
import pandas as pd
import folium
import folium.plugins

import os
import rasterio
from rasterio.plot import show

import boto3

%matplotlib inline

## Get landcover locations in China

In [None]:
# Source GeoJSON holding the landcover polygons in China
land_geojson = '/home/jovyan/sfi-asset-level-data/src/main/resources/cement_steel_land_geoms/landcover_datasetv4_UTM_CHINA_exactlocPOLYS_3000m.geojson'

# Load the polygons, keep only the identifier and geometry columns, reproject to
# WGS84 (lon/lat) for use w/ Earth OnDemand API, and expose the identifier as 'uid'
land_gpd = (
    gpd.read_file(land_geojson)[['id', 'geometry']]
    .to_crs('OGC:CRS84')
    .rename(columns={'id': 'uid'})
)

# Display the sites (a total of 1977 landcover polygons is expected)
land_gpd

In [None]:
# Plot landcover polygons on an interactive map centered on the mean polygon centroid
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
land_polys = folium.features.GeoJson(land_gpd.to_json(), style_function=style_function)

# Compute the centroids once; the map location is given as [y, x] (lat, lon)
land_centroids = land_gpd.geometry.centroid
m = folium.Map(location=[land_centroids.y.mean(), land_centroids.x.mean()],
               zoom_start=4)

# add_child is the supported call; add_children is its deprecated alias
m.add_child(land_polys)
m

## Get catalog of Sentinel-2 scenes that intersect with landcover

In [None]:
# Query the EarthAI Catalog for Sentinel-2 L2A scenes intersecting the landcover polygons.
# The search covers a recent 2-month window (June - July 2020); widen it if more
# high-quality data is needed.
#
# No maximum cloud cover filter is imposed (max_cloud_cover=100): the sites are small, so
# a scene with high overall cloud coverage may still be clear over the region we need.
# The highest-quality scenes are selected after the masking steps below.
s2_search_start = '2020-06-01'
s2_search_end = '2020-07-31'

catalog_2020 = earth_ondemand.read_catalog(
    land_gpd.geometry,
    collections='sentinel2_l2a',
    start_datetime=s2_search_start,
    end_datetime=s2_search_end,
    max_cloud_cover=100,
)

In [None]:
# Spatially join the landcover sites to the catalog scenes
land_cat_2020 = gpd.sjoin(land_gpd, catalog_2020)

# Summary counts: distinct sites that have imagery, and distinct S2 scenes ('id')
land_2020_plnt_cnt = land_cat_2020['uid'].nunique()
land_2020_S2_cnt = land_cat_2020['id'].nunique()

# Report for June - July 2020
print("June - July 2020")
print("----------------------------------------------")
print("Number of landcover sites with S2 imagery: ", land_2020_plnt_cnt)
print("Total number of catalog entries:", land_cat_2020['uid'].count())
print("Number of unique S2 scenes:", land_2020_S2_cnt)

In [None]:
# Plot landcover polygons together with the S2 scene boundaries - testing
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
land_polys = folium.features.GeoJson(land_gpd.to_json(), style_function=style_function)

# Compute the centroids once; the map location is given as [y, x] (lat, lon)
land_centroids = land_gpd.geometry.centroid
m = folium.Map(location=[land_centroids.y.mean(), land_centroids.x.mean()],
               zoom_start=3)

# S2 scene boundaries, styled separately so the landcover style is not clobbered
s2_gpd2p = catalog_2020[['id', 'geometry']]
s2_style_function = lambda x: {'fillColor': '#32a852', 'color': '#32a852'}
s2_polys = folium.features.GeoJson(s2_gpd2p.to_json(), style_function=s2_style_function)

# Scene footprints go first so the landcover polygons are drawn on top of them;
# add_child is the supported call (add_children is its deprecated alias)
m.add_child(s2_polys)
m.add_child(land_polys)
m

## Read and create image chips for landcover

In [None]:
# Read chips with a centroid-centered extent, limited to the Red, Green, and Blue bands.
# Want 3000 m per side; at 10 m pixels that is a 300-cell chip.
rgb_band_cols = ['B04_10m', 'B03_10m', 'B02_10m']
land_chip_rgb_2020 = (
    spark.read.chip(land_cat_2020, catalog_col_names=rgb_band_cols,
                    chipping_strategy=earthai.chipping.strategy.CentroidCentered(300))
    .select('uid', 'id', 'datetime', *rgb_band_cols)
    .repartition('uid')
)

# The scene classification (SCL_20m) is read separately to address limitations in the
# chip reader; 3000 m / 20 m = 150-cell chip.
land_chip_scl_2020 = (
    spark.read.chip(land_cat_2020, catalog_col_names=['SCL_20m'],
                    chipping_strategy=earthai.chipping.strategy.CentroidCentered(150))
    .select('uid', 'id', 'SCL_20m')
    .repartition('uid')
)

# Join the two on site and scene, then upsample SCL_20m onto the 10 m grid of B04_10m
land_chip_2020 = (
    land_chip_rgb_2020
    .join(land_chip_scl_2020, on=['uid', 'id'], how='inner')
    .withColumn('SCL_10m', rf_resample('SCL_20m', 'B04_10m'))
    .drop('SCL_20m')
)

In [None]:
# Mask chips w/ the scene classification layer (SCL)
# Example notebook discussing masking in more detail: /home/jovyan/examples/tutorials/geo-ops/masking-sentinel2.ipynb
#
# Sentinel-2 Level-2A scene classification description:
#   https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm
#
# A tile must have a NoData value defined before a mask can be applied. Sentinel-2 measurement
# bands have a cell type of uint16raw, which indicates that no NoData value is defined. Each band
# is therefore first converted to uint16, whose NoData value is 0, so any zero-valued cell in a
# measurement band is considered NoData (fill areas).
#
# Each converted band is then masked wherever SCL_10m holds one of the unwanted classes, and
# finally the total cell count, the number of unmasked cells, and the masked fraction are computed.

# Scene classifications to mask: nodata, saturated/defective, and clouds
bad_scl_values = [0, 1, 8, 9, 10]

# (output band, source band, masked output band)
band_columns = [
    ('Red', 'B04_10m', 'Red_masked'),
    ('Green', 'B03_10m', 'Green_masked'),
    ('Blue', 'B02_10m', 'Blue_masked'),
]

land_chip_2020_masked = land_chip_2020

# Pass 1: give every band a NoData-capable cell type
for band_col, source_col, _ in band_columns:
    land_chip_2020_masked = land_chip_2020_masked.withColumn(
        band_col, rf_convert_cell_type(source_col, 'uint16'))

# Pass 2: mask every band by the SCL classes
for band_col, _, masked_col in band_columns:
    land_chip_2020_masked = land_chip_2020_masked.withColumn(
        masked_col, rf_mask_by_values(band_col, 'SCL_10m', bad_scl_values))

# Cell counts before/after masking (taken from the Red band) and the masked fraction
land_chip_2020_masked = (
    land_chip_2020_masked
    .withColumn('tot_cell_count', rf_data_cells('Red'))
    .withColumn('unmsk_cell_count', rf_data_cells('Red_masked'))
    .withColumn('mask_fraction',
                (1.0 - F.col('unmsk_cell_count')/F.col('tot_cell_count')))
)

In [None]:
# Find the tile(s) for each landcover site that has the highest number of unmasked cells
# (And handle rare edge-case where returned chip is less than 300 x 300)

# Drop undersized chips FIRST (a full chip has 300 x 300 = 90000 data cells). Computing the
# per-site maximum before this filter could select an undersized chip as the "best" one, and
# the size filter would then discard the site entirely even though full-size chips exist.
land_chip_2020_full = land_chip_2020_masked.filter(F.col('tot_cell_count') == 90000)

# Per-site maximum of unmasked cells among the full-size chips
land_2020_maxcnt = land_chip_2020_full.groupby('uid') \
                                      .agg(F.max('unmsk_cell_count').alias('max_unmsk_cell_count'))

# Keep the chip(s) that reach the per-site maximum. The join result goes to a new name rather
# than back into land_chip_2020_masked, so this cell can be re-run without stacking joins.
land_chip_2020_fltr = land_chip_2020_full.join(land_2020_maxcnt, 'uid', 'inner') \
                                         .filter(F.col('unmsk_cell_count') == F.col('max_unmsk_cell_count'))

In [None]:
# If there's >1 tile per landcover site, keep exactly one record per site
# Take the Red, Green, and Blue bands with NoData, unmasked

# Only keep chips that are less than 25% masked
land_chip_2020_c25 = land_chip_2020_fltr.filter(col('mask_fraction') < 0.25)

# Pick the record deterministically: F.first() depends on row order after the shuffle, so
# repeated runs could return different scenes. Rank each site's candidates by most recent
# 'datetime' (scene 'id' breaks ties) and keep the top one, so every output column comes
# from the same row.
# NOTE(review): assumes 'datetime' sorts chronologically - confirm its type in the catalog
uid_recency_window = Window.partitionBy('uid').orderBy(F.col('datetime').desc(), F.col('id'))
land_chip_2020_unq = land_chip_2020_c25.withColumn('chip_rank', F.row_number().over(uid_recency_window)) \
                                       .filter(F.col('chip_rank') == 1) \
                                       .select('uid', 'id', 'datetime', 'Red', 'Green', 'Blue')

## Write chips out as GeoTIFFs

In [None]:
# Output path (putting on scratch to test)
output_path = '/scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731'

# Create a column of unique filenames
# Format: [uid]_landcover_v4_S2_RGB_20200601_20200731
# F.lit is used (rather than a bare lit) so the call does not depend on a wildcard import
land_chip_2020_unq = land_chip_2020_unq.withColumn('file_path_name',
                                                   F.concat_ws('_', F.col('uid'), F.lit('landcover_v4_S2_RGB_20200601_20200731')))

In [None]:
# Write each chip to output_path, named by the 'file_path_name' column, with a catalog
# requested and the site id, scene id, and acquisition datetime passed as metadata
chip_metadata_cols = ['uid', 'id', 'datetime']
land_chip_2020_unq.write.chip(
    output_path,
    filenameCol='file_path_name',
    metadata=chip_metadata_cols,
    catalog=True,
)

In [None]:
!ls -lR /scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731

In [None]:
# Check out what's in one of the chips for fun: dataset metadata, dataset tags,
# per-band tags / color interpretation, and a plot of the first band

# Only consider GeoTIFFs, sorted for a repeatable choice
# NOTE(review): the write step requested a catalog, so the directory presumably holds
# non-raster files as well - confirm the chip file extension
tiffs = sorted(name for name in os.listdir(output_path)
               if name.lower().endswith(('.tif', '.tiff')))

with rasterio.open(os.path.join(output_path, tiffs[0])) as src:
    for k, v in src.meta.items():
        print(k, '\t\t', v)

    print('\n', 'T A G S :')
    for k, v in src.tags().items():
        print(k, '\t\t', v)

    print('\n B A N D S :')
    for b in range(1, src.count + 1):
        # Iterate key/value pairs so each band tag prints its own value (previously the
        # stale 'v' left over from the dataset-tag loop was printed)
        for k, v in src.tags(b).items():
            print("\tBand", b, '\t\t', k, '\t\t', v)
        print("\tBand", b, '\t\t', src.colorinterp[b-1])
    ax = show(src.read(1), transform=src.transform)

In [None]:
!tar -cvf /scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar /scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731

In [None]:
!ls -lh /scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar

_The workflow below is temporary: AWS credentials are not working in the beta environment. It copies the tar file to local storage, and a separate script then uploads it to S3 from a different instance._

In [None]:
# Copy the tar archive from scratch into the repository's resources directory
!cp /scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar /home/jovyan/sfi-asset-level-data/src/main/resources/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar

## Upload tar file to S3 bucket

In [None]:
# Push the tar archive to the shared S3 bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

# Local archive on scratch and the object key it is stored under in the bucket
local_tar_path = '/scratch/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar'
s3_object_key = 'S2-macroloc-model/ALD_S2_RGB_landcover_chips_v4_20200601_20200731.tar'

bucket.upload_file(local_tar_path, s3_object_key)

In [None]:
# List the destination S3 prefix to check what has been uploaded
! aws s3 ls sfi-shared-assets/S2-macroloc-model/