# Creation of RGB Sentinel-2 Chips for Cement Plants

This notebook creates the Sentinel-2 RGB image chips for cement plants from the EarthAI catalog.

* Cement plants with exact locations in China
* Sentinel-2, red, green, and blue bands
* Chips are 3 km on a side
* Most recent cloud-free data

In [None]:
# Import required packages
from earthai.init import *
import earthai.chipping.strategy

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

from pyrasterframes.rf_types import TileUDT
from pyspark.sql.functions import udf

import geopandas as gpd
import pandas as pd
import folium
import folium.plugins

import os
import rasterio
from rasterio.plot import show

import boto3

%matplotlib inline

## Get cement plant locations in China

In [None]:
# GeoJSON of China cement plants with exact locations (polygon per plant)
cement_geojson = '/home/jovyan/sfi-asset-level-data/src/main/resources/cement_steel_land_geoms/cement_datasetv4_UTM_CHINA_exactlocPOLYS_1200m.geojson'

# Load the polygons, trim to the identifying columns plus geometry, and
# reproject to WGS84 lon/lat for use with the Earth OnDemand API
keep_columns = ['uid', 'latitude', 'longitude', 'geometry']
cement_gpd = gpd.read_file(cement_geojson)[keep_columns].to_crs('OGC:CRS84')

# Display the plants (a total of 404 is expected)
cement_gpd

In [None]:
# Plot cement polygons on an interactive folium map
# Magenta fill and outline for the plant polygons
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
cement_polys = folium.features.GeoJson(cement_gpd.to_json(), style_function=style_function)

# Centre the map on the mean of the polygon centroids (lat, lon order for folium)
m = folium.Map(location=[cement_gpd.geometry.centroid.y.mean(), 
                         cement_gpd.geometry.centroid.x.mean()],
               zoom_start=4)

# `add_children` is deprecated in folium/branca; `add_child` is the supported equivalent
m.add_child(cement_polys)
m

## Get catalog of Sentinel-2 scenes that intersect with cement plants

In [None]:
# Search the EarthAI Catalog for Sentinel-2 L2A scenes that intersect the cement polygons.
# The search window covers the two most recent months (June - July 2020); widen it if
# more candidates are needed to find high-quality data.
#
# No maximum cloud cover filter is imposed (max_cloud_cover=100): the sites are small, so
# a scene with high overall cloud coverage can still be relatively clear over the small
# region needed. The highest-quality scenes are selected after the masking steps below.
search_start = '2020-06-01'
search_end = '2020-07-31'

catalog_2020 = earth_ondemand.read_catalog(
    cement_gpd.geometry,
    collections='sentinel2_l2a',
    start_datetime=search_start,
    end_datetime=search_end,
    max_cloud_cover=100,
)

In [None]:
# Spatially join the cement plant sites to the catalog of S2 scenes
cement_cat_2020 = gpd.sjoin(cement_gpd, catalog_2020)

# Summary counts for the June - July 2020 search
cement_2020_plnt_cnt = cement_cat_2020['uid'].nunique()   # plants with S2 imagery
cement_2020_S2_cnt = cement_cat_2020['id'].nunique()      # distinct S2 scenes
catalog_entry_cnt = cement_cat_2020['uid'].count()        # rows in the joined table

print("June - July 2020")
print("----------------------------------------------")
print("Number of cement plants with S2 imagery: ", cement_2020_plnt_cnt)
print("Total number of catalog entries:", catalog_entry_cnt)
print("Number of unique S2 scenes:", cement_2020_S2_cnt)

In [None]:
# Plot cement polygons together with the S2 scene footprints - testing
# Cement polygons in magenta
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
cement_polys = folium.features.GeoJson(cement_gpd.to_json(), style_function=style_function)

# Centre the map on the mean of the polygon centroids (lat, lon order for folium)
m = folium.Map(location=[cement_gpd.geometry.centroid.y.mean(), 
                         cement_gpd.geometry.centroid.x.mean()],
               zoom_start=3)

# S2 scene boundaries in green; only id + geometry are needed for drawing
s2_gpd2p = catalog_2020[['id','geometry']]
style_function = lambda x: {'fillColor': '#32a852', 'color': '#32a852'}
s2_polys = folium.features.GeoJson(s2_gpd2p.to_json(), style_function=style_function)

# `add_children` is deprecated in folium/branca; `add_child` is the supported equivalent.
# Scene footprints are added first so the plant polygons are drawn on top of them.
m.add_child(s2_polys)
m.add_child(cement_polys)
m

## Read and create image chips for cement plants

In [None]:
# Read chips with an extent centred on each polygon centroid.
# RGB bands at 10 m: 3000 m per side / 10 m pixels = 300 cells per side.
# Only the columns of interest are kept, and rows are repartitioned by plant.
cement_chip_2020 = (
    spark.read.chip(
        cement_cat_2020,
        catalog_col_names=['B04_10m', 'B03_10m', 'B02_10m'],
        chipping_strategy=earthai.chipping.strategy.CentroidCentered(300),
    )
    .select('uid', 'latitude', 'longitude', 'id', 'datetime',
            'B04_10m', 'B03_10m', 'B02_10m')
    .repartition('uid')
)

# The scene classification (SCL) band is read separately at 20 m:
# 3000 m / 20 m = 150 cells per side. This addresses limitations in the chip reader.
cement_chip_scl_2020 = (
    spark.read.chip(
        cement_cat_2020,
        catalog_col_names=['SCL_20m'],
        chipping_strategy=earthai.chipping.strategy.CentroidCentered(150),
    )
    .select('uid', 'id', 'SCL_20m')
    .repartition('uid')
)

# Join on plant + scene, then upsample SCL to the 10 m grid of the red band
cement_chip_2020 = (
    cement_chip_2020
    .join(cement_chip_scl_2020, on=['uid', 'id'], how='inner')
    .withColumn('SCL_10m', rf_resample('SCL_20m', 'B04_10m'))
    .drop('SCL_20m')
)

In [None]:
# Mask chips using the Sentinel-2 scene classification (SCL)
# Example notebook discussing masking in more detail: /home/jovyan/examples/tutorials/geo-ops/masking-sentinel2.ipynb
#
# Sentinel-2 Level-2A scene classification description:
#   https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm
#
# A tile must have a NoData value defined before a mask can be applied. The Sentinel-2
# measurement bands have cell type uint16raw (no NoData value), so each band is first
# converted to uint16, whose NoData value is 0. Any zero-valued cell in a measurement
# band is therefore treated as NoData; in Sentinel-2 these correspond to fill areas.
#
# Each band is then masked wherever the SCL class is one of the values listed below,
# and finally the total data-cell count, unmasked cell count, and masked fraction
# are computed from the red band.

# Scene classification values to mask: nodata, saturated/defective, and clouds
bad_scl_values = [0, 1, 8, 9, 10]

# (output column, source band) pairs; order fixes the order of the added columns
rgb_bands = [('Red', 'B04_10m'), ('Green', 'B03_10m'), ('Blue', 'B02_10m')]

cement_chip_2020_masked = cement_chip_2020

# Step 1: give every band a NoData value
for out_name, src_band in rgb_bands:
    cement_chip_2020_masked = cement_chip_2020_masked.withColumn(
        out_name, rf_convert_cell_type(src_band, 'uint16'))

# Step 2: mask every band by the unwanted SCL classes
for out_name, _ in rgb_bands:
    cement_chip_2020_masked = cement_chip_2020_masked.withColumn(
        out_name + '_masked', rf_mask_by_values(out_name, 'SCL_10m', bad_scl_values))

# Step 3: cell counts and masked fraction
cement_chip_2020_masked = (
    cement_chip_2020_masked
    .withColumn('tot_cell_count', rf_data_cells('Red'))
    .withColumn('unmsk_cell_count', rf_data_cells('Red_masked'))
    .withColumn('mask_fraction',
                1.0 - F.col('unmsk_cell_count') / F.col('tot_cell_count'))
)

In [None]:
# Find the tile(s) for each plant that have the highest number of unmasked cells,
# considering only full-size chips (300 x 300 = 90000 data cells).
#
# The full-size filter is applied BEFORE the per-plant maximum is computed. Otherwise an
# undersized chip (rare edge case) could hold the maximum, and the plant would then be
# dropped entirely even though good full-size chips exist for it.
#
# `cement_chip_2020_masked` is not rebound here, so the cell can be re-run safely
# (re-joining onto an already-joined frame would duplicate `max_unmsk_cell_count`).

cement_chip_2020_full = cement_chip_2020_masked.filter(F.col('tot_cell_count') == 90000)

# Per-plant maximum of unmasked cells among the full-size chips
cement_2020_maxcnt = cement_chip_2020_full.groupby('uid') \
                                          .agg(F.max('unmsk_cell_count').alias('max_unmsk_cell_count'))

# Keep only the chip(s) that reach the per-plant maximum; ties are resolved downstream
cement_chip_2020_fltr = cement_chip_2020_full.join(cement_2020_maxcnt, 'uid', 'left') \
                                             .filter(F.col('unmsk_cell_count') == F.col('max_unmsk_cell_count'))

In [None]:
# Keep one chip per plant, restricted to chips that are less than 25% masked.
# The output carries the Red, Green, and Blue bands with NoData defined but unmasked.
#
# When several chips tie for a plant, take the most recent one (ties on datetime are
# broken by scene id). A groupby + F.first() would return an arbitrary row here, because
# row order after a shuffle is not deterministic.

cement_chip_2020_c25 = cement_chip_2020_fltr.filter(F.col('mask_fraction') < 0.25)

# Rank the chips of each plant: newest first, then by scene id for a stable tie-break
pick_order = Window.partitionBy('uid').orderBy(F.col('datetime').desc(), F.col('id'))

cement_chip_2020_unq = (
    cement_chip_2020_c25
    .withColumn('_pick', F.row_number().over(pick_order))
    .filter(F.col('_pick') == 1)
    .select('uid', 'latitude', 'longitude', 'id', 'datetime', 'Red', 'Green', 'Blue')
)

## Write chips out as GeoTIFFs

In [None]:
# Destination directory for the chips (on scratch for testing)
output_path = '/scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731'

# Add a column of unique filenames, one per plant
# Format: [uid]_cement_v4_S2_RGB_20200601_20200731
filename_suffix = F.lit('cement_v4_S2_RGB_20200601_20200731')
cement_chip_2020_unq = cement_chip_2020_unq.withColumn(
    'file_path_name',
    F.concat_ws('_', F.col('uid'), filename_suffix),
)

In [None]:
# Write the chips to `output_path`, naming each file from the `file_path_name` column
# and attaching the plant / scene identifiers listed below as metadata
chip_metadata = ['uid', 'latitude', 'longitude', 'id', 'datetime']

cement_chip_2020_unq.write.chip(
    output_path,
    filenameCol='file_path_name',
    catalog=True,
    metadata=chip_metadata,
)

In [None]:
!ls -lR /scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731

In [None]:
# Check out what's in one of the chips: dataset metadata, dataset tags, and band info.
# Only GeoTIFF files are considered, since the output directory may also hold non-raster
# files (e.g. a catalog). The list is sorted so the same chip is inspected on every run.
tiffs = sorted(f for f in os.listdir(output_path)
               if f.lower().endswith(('.tif', '.tiff')))
if not tiffs:
    raise FileNotFoundError('No GeoTIFF chips found in ' + output_path)

with rasterio.open(os.path.join(output_path, tiffs[0])) as src:
    # Dataset-level metadata (driver, dtype, size, CRS, transform, ...)
    for k, v in src.meta.items():
        print(k, '\t\t', v)
        
    print('\n', 'T A G S :')
    for k, v in src.tags().items():
        print(k, '\t\t', v)
        
    print('\n B A N D S :')
    for b in range(1, src.count + 1):
        # Iterate key/value pairs of this band's own tags; iterating keys only would
        # print a stale `v` left over from the dataset-tags loop above
        for k, v in src.tags(b).items():
            print("\tBand", b, '\t\t', k, '\t\t', v)
        print("\tBand", b, '\t\t', src.colorinterp[b-1])

    # Quick look at the first band, drawn in map coordinates
    ax = show(src.read(1), transform=src.transform)

In [None]:
!tar -cvf /scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731.tar /scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731

In [None]:
!ls -lh /scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731.tar

_The workflow below is temporary because AWS credentials are not working in the beta environment. It copies the tar file to local storage; a separate script then uploads it to S3 from a different instance._

In [None]:
!cp /scratch/ALD_S2_RGB_cement_chips_v4_20200601_20200731.tar /home/jovyan/sfi-asset-level-data/src/main/resources/ALD_S2_RGB_cement_chips_v4_20200601_20200731.tar

## Upload tar file to S3 bucket