# Deployment of RGB Sentinel-2 Macrolocalization Model - 2020

This notebook deploys the RGB Sentinel-2 macrolocalization models for cement and steel plants for the year 2020 on known plants

## Import required libraries

In [None]:
!pip install fastai==1.0.61

In [None]:
from earthai.all import *
import earthai.chipping.strategy as chp
import pyspark.sql.functions as pys
from pyspark.sql.functions import lit, col, udf

import geopandas as gpd
import pandas as pd
import rasterio

import os
import shutil
import boto3
import glob

from fastai import *
from fastai.vision import *

## Create Spark Session

* Important to do this before defining the udfs for scoring
* Set number of partitions on par with the number of catalog items per scene

In [None]:
# Number of Spark partitions; used below for both the default parallelism and
# the shuffle partition count, and as the default `repartition_size` of create_chips.
partitions = 2500
# The Spark option names contain dots, so they are passed via dict unpacking
# rather than as literal keyword arguments.
spark = create_earthai_spark_session(**{
    "spark.default.parallelism": partitions,
    "spark.sql.shuffle.partitions": partitions,
})

## Define input/output files and paths, and parameters

### Parameters

* `chip_size` is the size of chips (length) to create (in pixels)
* `unmsk_frac` is the minimum threshold on the fraction of unmasked cells required to keep site in sample
* `year` defines the year of selected scenes
* `month` defines the month of selected scenes (format: January = "01", February = "02", etc.)

In [None]:
# Side length of each chip in pixels for the 10 m bands
chip_size = 300 # 3 km for Sentinel-2
# Minimum fraction of unmasked cells a chip must have to be kept
unmsk_frac = 0.75

# Both are kept as strings: they are concatenated into query dates and the output file name.
# `month` must be zero-padded ("01" .. "12") because it is used as a lookup key in eod_read_catalog.
year = '2020'
month = '06'

### Input files and paths

* `s3_path` defines S3 high-level folder for S2 RGB macro-localization data
* `cement_site_geojson` is GeoJSON of cement plants with exact locations
* `steel_site_geojson` is GeoJSON of steel plants with exact locations
* `MODEL_PATH` is the path on S3 to the Densenet161 multiclass model
* `LOCAL_DIR` specifies where to keep files locally for analysis

In [None]:
# High-level S3 folder for this deployment's data
s3_path = 'S2-RGB-macro-localization-model-deployment'

# GeoJSON files with the known plant locations (paths are relative to this notebook)
cement_site_geojson = "../../resources/macro-loc-model-build/cement_exact_china_v4.1.geojson"
steel_site_geojson = "../../resources/macro-loc-model-build/steel_exact_china_v4.1.geojson"

# S3 object key of the exported model; its base name (without ".pkl") is reused
# as the local directory name when the model is downloaded.
MODEL_PATH = 'S2-RGB-macro-localization-model-build3/S2-RGB-model-results3/densenet161_multiclass_final.pkl'

# Local directory prefix; it is string-concatenated with other names, so the trailing slash is required.
LOCAL_DIR = '/scratch/'

### Output files and paths

* `output_score_file` defines the output GeoJSON of scores for known plants

In [None]:
# Output GeoJSON path; the file name is suffixed with the year and month (YYYYMM) being scored.
output_score_file = ''.join([
    '../../resources/macro-loc-model-deployment/S2-known-plant-chip-fastai-scores-CHN-10km-pthsh0.002_',
    year,
    month,
    '.geojson',
])

## Download Model and Define Scoring Function

In [None]:
# S3 resource handle; credentials are resolved by boto3 from the environment.
s3 = boto3.resource('s3')
# Bucket object used by download_model to fetch the exported model file.
bucket = s3.Bucket('sfi-shared-assets')

### Download model and load learner

In [None]:
def download_model(MODEL_PATH):
    """Download the exported model from S3 into a local, model-specific directory.

    The local directory is ``LOCAL_DIR`` followed by the base name of
    ``MODEL_PATH`` with ".pkl" removed. It is created if it does not exist,
    and the S3 object is saved inside it as ``export.pkl``.

    Parameters
    ----------
    MODEL_PATH : str
        Key of the model file within the module-level ``bucket``.
    """
    # Base name of the S3 key without the ".pkl" suffix, e.g. ".../foo.pkl" -> "foo"
    model_name = MODEL_PATH.split("/")[-1].replace(".pkl", "")
    model_dir = LOCAL_DIR + model_name

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    bucket.download_file(MODEL_PATH, model_dir + "/export.pkl")

In [None]:
# Fetch the model from S3; it is saved as export.pkl under LOCAL_DIR/<model base name>/
download_model(MODEL_PATH)

In [None]:
# Load the learner from the directory populated by download_model.
# NOTE(review): relies on load_learner picking up "export.pkl" in that directory by default — confirm for the pinned fastai version.
multi_model = load_learner(LOCAL_DIR + MODEL_PATH.split("/")[-1].replace(".pkl", ""))

### Define scoring function for PNGs

In [None]:
def score_pngs(path):
    """Score every PNG chip under ``path`` with the module-level ``multi_model``.

    Images are discovered by building a fastai ``ImageDataBunch`` from the
    ``all`` sub-folder of ``path``; each image is then passed individually to
    ``multi_model.predict``.

    Chip metadata is parsed from the file path: ``uid`` is the part of the
    file name before the first underscore, and ``site_type`` is the text
    between the last underscore of the path and the following dot.

    Parameters
    ----------
    path : str
        Folder containing an ``all`` sub-folder of PNG chips.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns ``uid``, ``site_type``,
        ``cement_prob`` (class index 0 of the prediction) and
        ``steel_prob`` (class index 2 of the prediction).
    """
    # Data bunch is only used to enumerate and open the images
    bunch = ImageDataBunch.from_folder(path, train='all', bs=16, num_workers=0, seed=42)
    data = bunch.normalize(imagenet_stats)

    chip_paths = []
    class_probs = []
    for idx in range(len(data.train_ds)):
        # Third element of the prediction holds the per-class probabilities
        prediction = multi_model.predict(data.train_ds.x[idx])
        class_probs.append(to_np(prediction[2]))
        chip_paths.append(str(data.items[idx]))

    return pd.DataFrame({
        'uid': [p.split('/')[-1].split('_')[0] for p in chip_paths],
        'site_type': [p.split('_')[-1].split('.')[0] for p in chip_paths],
        'cement_prob': [probs[0].item() for probs in class_probs],
        'steel_prob': [probs[2].item() for probs in class_probs],
    })

## Define EOD Catalog Read and Chipping Functions

### Get catalog of Sentinel-2 scenes that intersect with chip centroids

Queries EarthAI Catalog to find S2 scenes that intersect with chip centroids.

* Returns specified scenes for year, month
* Join back to chip centroids for chipping

In [None]:
def eod_read_catalog(geom, year, month):
    """Query the EarthAI catalog for Sentinel-2 L2A scenes in one calendar month.

    The query window runs from the first to the last day of the month. The
    last day is computed from the calendar, so February in a leap year
    (e.g. 2020) correctly ends on the 29th.

    Parameters
    ----------
    geom : geopandas.GeoDataFrame
        Site geometries; used as the catalog query geometry and as the left
        side of the spatial join.
    year : str
        Four-digit year, e.g. ``'2020'``.
    month : str
        Zero-padded month, ``"01"`` .. ``"12"``.

    Returns
    -------
    The spatial join of ``geom`` with the catalog when the catalog has at
    least one item; otherwise the (empty) catalog result is returned as-is.

    Raises
    ------
    KeyError
        If ``month`` is not one of ``"01"`` .. ``"12"``.
    """
    # Local import keeps this cell runnable without touching the notebook's import cell
    import calendar

    # Keep the original contract: only zero-padded month strings are accepted
    valid_months = ["%02d" % m for m in range(1, 13)]
    if month not in valid_months:
        raise KeyError(month)

    # monthrange returns (weekday of first day, number of days in month)
    last_day = calendar.monthrange(int(year), int(month))[1]
    start_date = "{}-{}-01".format(year, month)
    end_date = "{}-{}-{:02d}".format(year, month, last_day)

    # Query catalog; no cloud-cover filtering here (chip quality is assessed later)
    site_cat = earth_ondemand.read_catalog(
        geo=geom,
        start_datetime=start_date,
        end_datetime=end_date,
        max_cloud_cover=100,
        collections='sentinel2_l2a'
        )

    # Attach the site attributes to each intersecting scene
    if len(site_cat) > 0:
        site_cat = gpd.sjoin(geom, site_cat)

    return(site_cat)

## Create Image Chips

* Read and create image chips for 10km grid
* Select highest quality chips per site

In [None]:
def create_chips(site_cat, chip_size=300, unmsk_frac=0.75, repartition_size=partitions):
    """Select the best-quality scene per site and read its RGB chip.

    Works in two passes over ``site_cat``:

    1. Read the ``SCL_20m`` band as centroid-centered chips of
       ``chip_size / 2`` cells, mask cells whose SCL value is in
       ``bad_scl_values``, and keep only full-size chips with at least
       ``unmsk_frac`` of their cells unmasked. For each ``uid`` the scene
       (``id``) with the most unmasked cells is selected.
    2. Read the ``B04_10m``, ``B03_10m`` and ``B02_10m`` bands for the
       selected scenes as centroid-centered chips of ``chip_size`` cells and
       convert them to uint16 ``Red``, ``Green`` and ``Blue`` tiles.

    Parameters
    ----------
    site_cat : (Geo)DataFrame
        Catalog joined to sites; must contain ``uid``, ``id``, ``site_type``
        and ``datetime`` columns, which are selected or merged on below.
    chip_size : int
        Chip side length in cells of the 10 m bands.
    unmsk_frac : float
        Minimum fraction of unmasked cells for a chip to be kept.
    repartition_size : int
        Number of Spark partitions for both passes. The default is bound to
        the value of the global ``partitions`` at the time this function is defined.

    Returns
    -------
    Spark DataFrame with columns ``uid``, ``site_type``, ``id``, ``datetime``,
    ``Red``, ``Green`` and ``Blue``.
    """
    
    # Reads in 20-m QA band to use in masking and selection of best scenes
    # Filter out blank chips at edge of scenes
    # Handle rare edge case where returned chip is less than specified size (when reach edge of a scene)
    # Mask chips by QA band and compute count of unmasked cells
    # Remove chips with less than a minimum fraction of unmasked cells    
    # The SCL band is read at half the chip size (20 m cells vs. 10 m cells for the RGB bands)
    chip_size_scl = int(chip_size/2)   
    # SCL classes treated as unusable.
    # NOTE(review): in the Sentinel-2 L2A scene classification these are no data (0), saturated/defective (1),
    # dark area (2), cloud shadow (3), cloud medium/high probability (8, 9) and thin cirrus (10) — confirm.
    bad_scl_values = [0, 1, 2, 3, 8, 9, 10]
    site_chips = spark.read.chip(site_cat, ['SCL_20m'],
                                 chipping_strategy=chp.CentroidCentered(chip_size_scl)) \
                          .select('uid', 'id', 'SCL_20m') \
                          .withColumn('mask', rf_make_constant_tile(1, chip_size_scl, chip_size_scl, 'uint16')) \
                          .withColumn('tot_cell_count', rf_data_cells('SCL_20m')) \
                          .filter(pys.col('tot_cell_count') == chip_size_scl*chip_size_scl) \
                          .withColumn('mask', rf_mask_by_values('mask', 'SCL_20m', bad_scl_values)) \
                          .withColumn('unmsk_cell_count', rf_data_cells('mask')) \
                          .filter(pys.col('unmsk_cell_count') >= unmsk_frac*chip_size_scl*chip_size_scl) \
                          .repartition(repartition_size, 'uid', 'id')
    
    # Find the chip(s) with the highest number of unmasked cells
    # If there's >1 chip (a tie) take the first record  
    # The per-chip counts are collected to the driver as a pandas DataFrame
    chpinf_pdf = site_chips.select('uid', 'id', 'unmsk_cell_count').toPandas()
    # Group on a copy of uid so that 'uid' itself stays available as a column for the merge below
    chpinf_pdf['grpid'] = chpinf_pdf['uid']    
    site_maxcnt = chpinf_pdf.sort_values('unmsk_cell_count', ascending=False) \
                            .groupby(['grpid']).first() \
                            .drop('unmsk_cell_count', axis=1)
    
    # Read in the RGB bands (B04 = red, B03 = green, B02 = blue) for the highest quality chip
    # The inner merge restricts the catalog to the selected (uid, id) pairs
    site_cat = site_cat.merge(site_maxcnt, on=['uid', 'id'], how='inner')
    # Each band is rescaled and multiplied by 65535, then stored as uint16
    # NOTE(review): assumes rf_rescale maps each tile to the 0-1 range — confirm against the RasterFrames docs
    site_chip_unq = spark.read.chip(site_cat, ['B04_10m','B03_10m','B02_10m'],
                                chipping_strategy=chp.CentroidCentered(chip_size)) \
                     .select('uid', 'site_type', 'id', 'datetime', 'B04_10m', 'B03_10m', 'B02_10m') \
                     .withColumn('Red', 
                                 rf_convert_cell_type(
                                     rf_local_multiply(
                                         rf_rescale(rf_convert_cell_type('B04_10m', 'uint16')), 65535), 'uint16')) \
                     .withColumn('Green', 
                                 rf_convert_cell_type(
                                     rf_local_multiply(
                                         rf_rescale(rf_convert_cell_type('B03_10m', 'uint16')), 65535), 'uint16')) \
                     .withColumn('Blue', 
                                 rf_convert_cell_type(
                                     rf_local_multiply(
                                         rf_rescale(rf_convert_cell_type('B02_10m', 'uint16')), 65535), 'uint16')) \
                     .drop('B04_10m', 'B03_10m', 'B02_10m') \
                     .repartition(repartition_size, 'uid')
    
    return(site_chip_unq)

## Convert GeoTIFFs to PNGs

In [None]:
def convert_image(tif_filename, png_filename):
    """Rewrite a GeoTIFF as a PNG with the same raster profile.

    All bands are read from ``tif_filename`` and written to ``png_filename``
    using the source profile with only the driver switched to ``PNG``.

    Parameters
    ----------
    tif_filename : str
        Path of the GeoTIFF to read.
    png_filename : str
        Path of the PNG to write.
    """
    with rasterio.open(tif_filename) as src:
        # Reuse the source profile so size, band count and dtype carry over
        png_profile = src.profile
        png_profile['driver'] = 'PNG'
        pixels = src.read()

    with rasterio.open(png_filename, 'w', **png_profile) as dst:
        dst.write(pixels)

## Create PNGs from RasterFrame

In [None]:
def png_from_rf(rf):
    """Export the chips in a RasterFrame as PNG files under ``pngs/all``.

    Any existing ``geotiffs`` and ``pngs`` directories in the working
    directory are deleted first. The chips are written as GeoTIFFs into
    ``geotiffs`` (named by the ``file_path_name`` column) and each GeoTIFF is
    then converted to a PNG of the same base name in ``pngs/all``, the folder
    layout that ``score_pngs`` reads.

    Parameters
    ----------
    rf : Spark DataFrame
        RasterFrame of chips containing a ``file_path_name`` column.
    """
    # Start from a clean slate so stale chips from a previous run are never scored
    for stale_dir in ('geotiffs', 'pngs'):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    # Create GeoTIFFs from RasterFrame
    rf.write.chip('geotiffs', filenameCol='file_path_name', catalog=False)

    # Create output paths for PNGs to fit Fastai structure
    for out_dir in ('pngs', 'pngs/all'):
        os.mkdir(out_dir)

    # Convert each GeoTIFF and write the PNG alongside its siblings
    for tif_path in glob.glob('geotiffs/*.tif'):
        png_path = tif_path.replace('.tif', '.png').replace('geotiffs/', 'pngs/all/')
        convert_image(tif_path, png_path)

## Define Output Function

* Writes out scores to GeoJSON file

In [None]:
def write_chip_scores(rf, pdf, year, month):
    """Join chip footprints with model scores and write them to GeoJSON.

    The footprint of each chip is the extent of its ``Red`` tile, reprojected
    from the tile CRS to EPSG:4326. The result is written to the module-level
    ``output_score_file``.

    Parameters
    ----------
    rf : Spark DataFrame
        RasterFrame of chips with ``uid``, ``site_type`` and ``Red`` columns.
    pdf : pandas.DataFrame
        Scores keyed by ``uid`` and ``site_type``.
    year, month :
        Values stored in the ``year`` and ``month`` columns of the output.
    """
    # Chip footprint as a geometry in geographic coordinates
    footprint_col = st_reproject(st_geometry(rf_extent('Red')),
                                 rf_crs('Red'),
                                 pys.lit('EPSG:4326'))
    extent_pdf = rf.withColumn('geometry', footprint_col)
    extent_pdf = extent_pdf.select('uid', 'site_type', 'geometry').toPandas()

    # Stamp every row with the period that was scored
    extent_pdf['year'] = year
    extent_pdf['month'] = month
    extent_gdf = gpd.GeoDataFrame(extent_pdf, geometry='geometry', crs='EPSG:4326')

    # Inner join: only chips that have both a footprint and a score are written
    scored_gdf = pd.merge(extent_gdf, pdf, how='inner', on=['uid', 'site_type'])

    scored_gdf.to_file(output_score_file, driver='GeoJSON')

## Read in Cement and Steel Plant sites

In [None]:
# Load the known cement plant locations and tag each row with its site type
cement_site_gdf = gpd.read_file(cement_site_geojson)
cement_site_gdf['site_type'] = 'cement'
# Same for the known steel plant locations
steel_site_gdf = gpd.read_file(steel_site_geojson)
steel_site_gdf['site_type'] = 'steel'
# Stack both site types into a single frame with a fresh 0..n-1 index
chip_cntr_gdf = pd.concat([cement_site_gdf, steel_site_gdf], ignore_index=True)
# Total site count; reused later to size the Spark repartitioning
chip_cnt = len(chip_cntr_gdf)
print("Total count of sites: ", chip_cnt)

In [None]:
# Sanity check: per-type site counts should add up to the total printed above
print("Number of cement sites: ", len(chip_cntr_gdf[chip_cntr_gdf['site_type'] == 'cement']))
print("Number of steel sites: ", len(chip_cntr_gdf[chip_cntr_gdf['site_type'] == 'steel']))

## Create Chips and Score

For each site:

* Get catalog of Sentinel-2 scenes that intersect with chip centroids
* Read and create image chips
* Score models
* Write scores out to file

In [None]:
# Get catalog of Sentinel-2 scenes that intersect with chip centroids
site_cat_yyyymm = eod_read_catalog(chip_cntr_gdf, year, month)
    
# Read and create image chips
# Partition count is scaled to the number of sites (about four sites per partition)
# NOTE(review): round(chip_cnt/4) is 0 when there are fewer than three sites — confirm Spark accepts that or guard it
site_chip_yyyymm_unq = create_chips(site_cat_yyyymm, 
                                      chip_size=chip_size, 
                                      unmsk_frac=unmsk_frac,
                                      repartition_size=round(chip_cnt/4))
# file_path_name ("<uid>_<site_type>") becomes the chip file name; score_pngs parses both fields back out of it.
# The frame is cached because it is used twice below (PNG export and writing the scores).
site_chip_yyyymm_unq = site_chip_yyyymm_unq.withColumn('file_path_name', 
                                                pys.concat_ws('_', pys.col('uid'), pys.col('site_type'))) \
                                           .cache()

In [None]:
# Write out temporary PNGs to score
# This deletes and recreates the local 'geotiffs' and 'pngs' directories in the working directory
png_from_rf(site_chip_yyyymm_unq)

In [None]:
# Score PNGs
# 'pngs' is the directory created by png_from_rf; the images live in its 'all' sub-folder
deployment_scores_pdf = score_pngs('pngs')

In [None]:
# Write scores to GeoJSON file
# Chip footprints come from the cached RasterFrame; output goes to output_score_file
write_chip_scores(site_chip_yyyymm_unq, deployment_scores_pdf, year, month)