# Deployment of TIR Landsat 8 Macrolocalization Model - 2020

This notebook deploys the TIR Landsat 8 macrolocalization models for cement and steel plants for the year 2020.

## Import required libraries

In [None]:
!pip install fastai==1.0.61

In [None]:
from earthai.all import *
import earthai.chipping.strategy as chp
import pyspark.sql.functions as pys
from pyspark.sql.functions import lit, col, udf

import geopandas as gpd
import pandas as pd
import rasterio

import os
import shutil
import boto3
import glob

from fastai import *
from fastai.vision import *

## Create Spark Session

* Important to do this before defining the udfs for scoring
* Set number of partitions on par with the number of catalog items per scene

In [None]:
# Partition count used for both Spark's default parallelism and its shuffle
# partitions; it is also the default `repartition_size` of `create_chips`.
partitions = 2500
# Session must exist before any cell that calls `spark.read.chip`
spark = create_earthai_spark_session(**{
    "spark.default.parallelism": partitions,
    "spark.sql.shuffle.partitions": partitions,
})

## Define input/output files and paths, and parameters

### Parameters

* `chip_size` is the size of chips (length) to create (in pixels)
* `unmsk_frac` is the minimum threshold on the fraction of unmasked cells required to keep site in sample
* `year` defines the year for layer 1 (thermal band, in January); layers 2 and 3 (thermal band, in January and April, respectively) are `year - 1`

In [None]:
# Side length of each chip, in pixels
chip_size = 35 # 1.05 km for Landsat 8
# Minimum fraction of unmasked cells a chip needs in order to be kept
unmsk_frac = 0.75

# Year of the most recent (January) layer, kept as a string because it is
# concatenated into date strings; the other two layers use the preceding year
year = '2020'

### Input files and paths

* `s3_path` defines S3 high-level folder for L8 TIR macro-localization data
* `cement_site_geojson` is GeoJSON of cement plants with exact locations
* `steel_site_geojson` is GeoJSON of steel plants with exact locations
* `CEMENT_MODEL_PATH` is the path on S3 to the Densenet161 cement model
* `STEEL_MODEL_PATH` is the path on S3 to the Resnet50 steel model
* `LOCAL_DIR` specifies where to put files locally for analysis

In [None]:
# High-level S3 folder for the L8 TIR macro-localization deployment data
# NOTE(review): not referenced by any other cell visible in this notebook
s3_path = 'L8-TIR-macro-localization-model-deployment'

# GeoJSON files with the exact locations of known cement and steel plants
cement_site_geojson = "../../resources/macro-loc-model-build/cement_exact_china_v4.1.geojson"
steel_site_geojson = "../../resources/macro-loc-model-build/steel_exact_china_v4.1.geojson"

# Keys of the trained models within the S3 bucket
CEMENT_MODEL_PATH = 'L8-TIR-macro-localization-model-build3/L8-TIR-model-results3/densenet161_cement_binary_final.pkl'
STEEL_MODEL_PATH = 'L8-TIR-macro-localization-model-build3/L8-TIR-model-results3/resnet50_steel_binary_final.pkl'

# Local directory prefix under which downloaded models are stored;
# it is string-concatenated with the model name, so it must end with "/"
LOCAL_DIR = '/scratch/'

### Output files and paths

* `output_score_file` defines the output GeoJSON file of scores for known plants

In [None]:
# Output GeoJSON of model scores for the known plants; the file name ends with the deployment year
output_score_file = '../../resources/macro-loc-model-deployment/L8-known-plant-chip-fastai-scores-CHN-10km-pthsh0.002_'+year+'.geojson'

## Download Models and Define Scoring Functions

In [None]:
# Handle to the S3 bucket that `download_model` fetches the trained models from
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

### Download models and load learners

In [None]:
def download_model(MODEL_PATH):
    """Download a model file from S3 into its own local directory.

    The object is fetched from the module-level ``bucket`` and saved as
    ``export.pkl`` inside ``LOCAL_DIR + <model file name without ".pkl">``,
    the same directory that is later handed to ``load_learner``.

    Parameters
    ----------
    MODEL_PATH : str
        Key of the model file within ``bucket``. Its last ``/``-separated
        component (minus ``.pkl``) names the local directory.
    """
    # Derive the target directory once instead of rebuilding the expression
    model_dir = LOCAL_DIR + MODEL_PATH.split("/")[-1].replace(".pkl", "")

    # exist_ok makes re-running the cell safe without a separate existence check
    os.makedirs(model_dir, exist_ok=True)
    bucket.download_file(MODEL_PATH, model_dir + "/export.pkl")

In [None]:
# Fetch both trained models from S3 into their local directories
download_model(CEMENT_MODEL_PATH)
download_model(STEEL_MODEL_PATH)

In [None]:
# Load each learner from the local directory that `download_model` populated
# (same directory-name derivation as in `download_model`).
# NOTE(review): load_learner presumably unpickles export.pkl - only load models from a trusted bucket
cement_model = load_learner(LOCAL_DIR + CEMENT_MODEL_PATH.split("/")[-1].replace(".pkl", ""))
steel_model = load_learner(LOCAL_DIR + STEEL_MODEL_PATH.split("/")[-1].replace(".pkl", ""))

### Define scoring function for PNGs

In [None]:
def score_pngs(path):
    """Score every PNG chip under ``path`` with the cement and steel learners.

    Images are loaded through a Fastai ``ImageDataBunch`` whose training
    folder is ``all``. Each chip is run through the module-level
    ``cement_model`` and ``steel_model``; the chip's ``uid`` and ``site_type``
    are parsed from its file name, which is expected to look like
    ``<uid>_<site_type>.<ext>``.

    Parameters
    ----------
    path : str
        Folder containing the ``all`` sub-folder of PNG chips.

    Returns
    -------
    pandas.DataFrame
        One row per chip with columns ``uid``, ``site_type``, ``cement_prob``
        and ``steel_prob``.
    """
    # Build the Fastai data bunch and apply ImageNet normalization
    data = ImageDataBunch.from_folder(path, train='all', bs=16, num_workers=0, seed=42)
    data = data.normalize(imagenet_stats)

    # Column-wise accumulators, in the order of the output columns
    columns = {'uid': [], 'site_type': [], 'cement_prob': [], 'steel_prob': []}

    for idx in range(len(data.train_ds)):

        # Cement score: element 0 of the third item returned by predict
        # NOTE(review): presumably the "cement" class sits at index 0 - confirm against the learner's classes
        cement_out = cement_model.predict(data.train_ds.x[idx])
        columns['cement_prob'].append(to_np(cement_out[2])[0].item())

        # Steel score: element 1 of the third item returned by predict
        # NOTE(review): presumably the "steel" class sits at index 1 - confirm against the learner's classes
        steel_out = steel_model.predict(data.train_ds.x[idx])
        columns['steel_prob'].append(to_np(steel_out[2])[1].item())

        # Chip metadata parsed from the file name
        chip_path = str(data.items[idx])
        columns['uid'].append(chip_path.split('/')[-1].split('_')[0])
        columns['site_type'].append(chip_path.split('_')[-1].split('.')[0])

    return pd.DataFrame(columns)

## Define EOD Catalog Read and Chipping Functions

### Get catalog of Landsat 8 scenes that intersect with chip centroids

Queries EarthAI Catalog to find L8 scenes that intersect with chip centroids.

* Returns the scenes found for each of:
  * January Year 2
  * January Year 1
  * April Year 1
* Joins each result back to the chip centroids for chipping

In [None]:
def _read_site_catalog(geom, start_datetime, end_datetime):
    """Query Landsat 8 scenes for one date window and join them back to the sites."""
    site_cat = earth_ondemand.read_catalog(
        geo=geom,
        start_datetime=start_datetime,
        end_datetime=end_datetime,
        max_cloud_cover=100,
        collections='landsat8_l1tp'
    )
    # Only join when scenes were found; an empty result is returned as-is
    if len(site_cat) > 0:
        site_cat = gpd.sjoin(geom, site_cat)
    return site_cat


def eod_read_catalog(geom, year):
    """Read the Landsat 8 catalogs for the three date windows used by the models.

    Parameters
    ----------
    geom : geopandas.GeoDataFrame
        Site geometries; used both as the catalog query geometry and as the
        left side of the spatial join.
    year : str or int
        "Year 2", the most recent year. "Year 1" is the year before it.

    Returns
    -------
    dict
        Keys ``site_cat_year2_01`` (January of year 2), ``site_cat_year1_01``
        (January of year 1) and ``site_cat_year1_04`` (April of year 1). Each
        value is the catalog spatially joined to ``geom``, or the empty
        catalog itself when the query returned no items.
    """
    # Accept an int year as well as a string
    year2 = str(year)
    year1 = str(int(year2) - 1)

    # (result key, window start, window end), queried in this order
    windows = (
        ('site_cat_year2_01', year2 + '-01-01', year2 + '-01-31'),
        ('site_cat_year1_01', year1 + '-01-01', year1 + '-01-31'),
        ('site_cat_year1_04', year1 + '-04-01', year1 + '-04-30'),
    )

    return {key: _read_site_catalog(geom, start, end) for key, start, end in windows}


## Create Image Chips

* Read and create image chips for 10km grid
* Select highest quality chips per site

In [None]:
def create_chips(site_cat, chip_size=35, unmsk_frac=0.75, col_suffix='JY2', repartition_size=partitions):
    """Build one thermal (B10) chip per site from its least-masked scene.

    Works in two passes: first the QA band (BQA) is chipped for every
    site/scene pair and used to count unmasked cells; then, for each ``uid``,
    only the scene with the most unmasked cells is chipped again for band B10.

    Parameters
    ----------
    site_cat :
        Catalog of site/scene pairs. It is merged with a pandas frame on
        ``uid`` and ``id`` and passed to ``spark.read.chip`` for the ``BQA``
        and ``B10`` bands; ``site_type`` and ``datetime`` are selected from
        the chipped result.
    chip_size : int
        Chip side length in cells; chips are centered on each site's centroid.
    unmsk_frac : float
        Minimum fraction of the ``chip_size*chip_size`` cells that must remain
        unmasked for a chip to be kept.
    col_suffix : str
        Suffix appended (after ``_``) to the output ``B10``, ``id`` and
        ``datetime`` column names.
    repartition_size : int
        Number of partitions for both ``repartition`` calls. The default is
        the module-level ``partitions`` value at function definition time.

    Returns
    -------
    Spark DataFrame
        Columns ``uid``, ``site_type``, ``id_<suffix>``, ``datetime_<suffix>``
        and ``B10_<suffix>``.
    """
    
    # Read the QA band as chip_size x chip_size chips centered on each site centroid
    # Keep only chips with a full chip_size*chip_size data cells (handles short chips at scene edges)
    # Keep only chips whose minimum BQA value exceeds 1 (filters blank chips at scene edges)
    # Mask a constant tile by QA bits (fill, cloud, cloud shadow, cirrus) and count unmasked cells
    # Remove chips with less than the minimum fraction of unmasked cells
    site_chips = spark.read.chip(site_cat, ['BQA'],
                                 chipping_strategy=chp.CentroidCentered(chip_size)) \
                      .select('uid', 'id', 'BQA') \
                      .withColumn('mask', rf_make_constant_tile(1, chip_size, chip_size, 'uint16')) \
                      .withColumn('tot_cell_count', rf_data_cells('BQA')) \
                      .filter(pys.col('tot_cell_count') == chip_size*chip_size) \
                      .withColumn('BQA_min', rf_tile_min('BQA')) \
                      .filter(pys.col('BQA_min') > 1.0) \
                      .withColumn('mask', # designated fill = yes
                                  rf_mask_by_bit('mask', 'BQA', 0, 1)) \
                      .withColumn('mask', # cloud = yes
                                  rf_mask_by_bit('mask', 'BQA', 4, 1)) \
                      .withColumn('mask', # cloud shadow conf is medium or high
                                  rf_mask_by_bits('mask', 'BQA', 7, 2, [2, 3])) \
                      .withColumn('mask', # cirrus conf is medium or high
                                  rf_mask_by_bits('mask', 'BQA', 11, 2, [2, 3])) \
                      .withColumn('unmsk_cell_count', rf_data_cells('mask')) \
                      .filter(pys.col('unmsk_cell_count') >= unmsk_frac*chip_size*chip_size) \
                      .repartition(repartition_size, 'uid', 'id')
    
    # Collect the per-chip counts to the driver as a pandas frame
    # Find the chip(s) with the highest number of unmasked cells
    # If there's >1 chip (a tie) take the first record
    # 'grpid' duplicates 'uid' so that 'uid' survives the groupby as a regular column
    chpinf_pdf = site_chips.select('uid', 'id', 'unmsk_cell_count').toPandas()
    chpinf_pdf['grpid'] = chpinf_pdf['uid']    
    site_maxcnt = chpinf_pdf.sort_values('unmsk_cell_count', ascending=False) \
                            .groupby(['grpid']).first() \
                            .drop('unmsk_cell_count', axis=1)
    
    # Restrict the catalog to the selected (uid, id) pairs
    # Read in thermal band for highest quality chip
    # B10 is passed through rf_rescale, multiplied by 65535 and stored as uint16
    site_cat = site_cat.merge(site_maxcnt, on=['uid', 'id'], how='inner')
    site_chips_unq = spark.read.chip(site_cat, ['B10'],
                                     chipping_strategy=chp.CentroidCentered(chip_size)) \
                          .select('uid', 'site_type', 'id', 'datetime', 'B10') \
                          .withColumn('B10'+'_'+col_suffix,
                                      rf_convert_cell_type(rf_local_multiply(rf_rescale(rf_convert_cell_type('B10', 'uint16')), 
                                                                             65535), 'uint16')) \
                          .drop('B10') \
                          .withColumnRenamed('id', 'id'+'_'+col_suffix) \
                          .withColumnRenamed('datetime', 'datetime'+'_'+col_suffix) \
                          .repartition(repartition_size, 'uid')
    
    return(site_chips_unq)

## Convert GeoTIFFs to PNGs

In [None]:
def convert_image(tif_filename, png_filename):
    """Rewrite a raster file as a PNG, keeping the rest of its profile.

    Parameters
    ----------
    tif_filename : str
        Path of the raster to read (a GeoTIFF in this notebook).
    png_filename : str
        Path of the PNG to write.
    """
    with rasterio.open(tif_filename) as src:

        # Reuse the source profile, switching only the output driver
        png_profile = src.profile
        png_profile.update(driver='PNG')

        # Read all bands before the output file is created
        bands = src.read()

        with rasterio.open(png_filename, 'w', **png_profile) as sink:
            sink.write(bands)

## Create PNGs from RasterFrame

In [None]:
def png_from_rf(rf):
    """Write the chips of a RasterFrame as PNGs under ``pngs/all``.

    Any existing ``geotiffs`` and ``pngs`` directories in the working
    directory are deleted first. The chips are written as GeoTIFFs into
    ``geotiffs`` (named by the ``file_path_name`` column), and each GeoTIFF is
    then converted to a PNG with the same file stem.

    Parameters
    ----------
    rf :
        RasterFrame with a ``file_path_name`` column; written out through
        ``rf.write.chip``.
    """
    # Start from clean temporary output directories
    for stale_dir in ('geotiffs', 'pngs'):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    # Create GeoTIFFs from RasterFrame
    rf.write.chip('geotiffs', filenameCol='file_path_name', catalog=False)

    # 'pngs/all' matches the folder layout that Fastai's from_folder reads
    png_dir = os.path.join('pngs', 'all')
    os.makedirs(png_dir)

    # Derive each PNG name from the file stem only, so '.tif' or 'geotiffs/'
    # occurring inside a stem is never rewritten
    for tif_file in glob.glob('geotiffs/*.tif'):
        stem = os.path.splitext(os.path.basename(tif_file))[0]
        convert_image(tif_file, os.path.join(png_dir, stem + '.png'))

## Define Output Function

* Writes out scores to GeoJSON file

In [None]:
def write_chip_scores(rf, pdf, year):
    """Attach chip footprints to the model scores and write them as GeoJSON.

    Parameters
    ----------
    rf :
        RasterFrame with ``uid``, ``site_type`` and a ``B10_JY2`` tile column,
        whose extent supplies each chip's geometry.
    pdf : pandas.DataFrame
        Scores keyed by ``uid`` and ``site_type``.
    year :
        Value stored in the ``year`` column of every output row.

    The result is written to the module-level ``output_score_file``; nothing
    is returned.
    """
    # Chip extent of the B10_JY2 tile, reprojected from the tile's CRS to EPSG:4326
    extent_wgs84 = st_reproject(st_geometry(rf_extent('B10_JY2')),
                                rf_crs('B10_JY2'),
                                pys.lit('EPSG:4326'))

    # Bring the footprints to the driver and tag them with the year
    extents_pdf = rf.withColumn('geometry', extent_wgs84) \
                    .select('uid', 'site_type', 'geometry') \
                    .toPandas()
    extents_pdf['year'] = year
    extents_gdf = gpd.GeoDataFrame(extents_pdf, geometry='geometry', crs='EPSG:4326')

    # Keep only chips that have both a footprint and a score
    joined_gdf = pd.merge(extents_gdf, pdf, how='inner', on=['uid', 'site_type'])

    joined_gdf.to_file(output_score_file, driver='GeoJSON')

## Read in Cement and Steel Plant sites

In [None]:
# Load the known cement and steel plant locations and tag each row with its site type
cement_site_gdf = gpd.read_file(cement_site_geojson)
cement_site_gdf['site_type'] = 'cement'
steel_site_gdf = gpd.read_file(steel_site_geojson)
steel_site_gdf['site_type'] = 'steel'
# Stack both site sets into a single frame with a fresh index
chip_cntr_gdf = pd.concat([cement_site_gdf, steel_site_gdf], ignore_index=True)
# Total site count; used later to size the Spark repartitioning
chip_cnt = len(chip_cntr_gdf)
print("Total count of sites: ", chip_cnt)

In [None]:
# Per-type site counts, taken by summing the boolean match on site_type
print("Number of cement sites: ", (chip_cntr_gdf['site_type'] == 'cement').sum())
print("Number of steel sites: ", (chip_cntr_gdf['site_type'] == 'steel').sum())

## Create Chips and Score

For each site:

* Get catalog of Landsat 8 scenes that intersect with chip centroids
* Read and create image chips
* Join TIR chips at different dates into single RasterFrame and score models
* Write scores out to file

In [None]:
# Get catalog of Landsat 8 scenes that intersect with chip centroids
site_cat_list = eod_read_catalog(chip_cntr_gdf, year)

# Read and create image chips
# ---------------------------

# Roughly one Spark partition per four sites, shared by all three reads.
# Clamped to at least 1: round(chip_cnt/4) is 0 for two or fewer sites,
# and Spark requires a positive partition count.
chip_repartition_size = max(1, round(chip_cnt/4))

# January Year 2
site_chip_year2_01_unq = create_chips(site_cat_list['site_cat_year2_01'],
                                      chip_size=chip_size,
                                      unmsk_frac=unmsk_frac,
                                      col_suffix='JY2',
                                      repartition_size=chip_repartition_size)

# January Year 1
site_chip_year1_01_unq = create_chips(site_cat_list['site_cat_year1_01'],
                                      chip_size=chip_size,
                                      unmsk_frac=unmsk_frac,
                                      col_suffix='JY1',
                                      repartition_size=chip_repartition_size)

# April Year 1
site_chip_year1_04_unq = create_chips(site_cat_list['site_cat_year1_04'],
                                      chip_size=chip_size,
                                      unmsk_frac=unmsk_frac,
                                      col_suffix='AY1',
                                      repartition_size=chip_repartition_size)

In [None]:
# Combine the three TIR chip sets into one row per site. Inner joins keep only
# sites that have a chip for every date. 'file_path_name' ("<uid>_<site_type>")
# names the files written out for each row.
site_chips_joined = (
    site_chip_year2_01_unq
    .join(site_chip_year1_01_unq, on=['uid', 'site_type'], how='inner')
    .join(site_chip_year1_04_unq, on=['uid', 'site_type'], how='inner')
    .withColumn('file_path_name', pys.concat_ws('_', pys.col('uid'), pys.col('site_type')))
    .cache()
)

In [None]:
# Write the joined chips out as temporary PNGs (under 'pngs/all') for scoring
png_from_rf(site_chips_joined)

In [None]:
# Score every PNG under 'pngs' with the cement and steel models
deployment_scores_pdf = score_pngs('pngs')

In [None]:
# Join the scores to the chip footprints and write them to the output GeoJSON file
write_chip_scores(site_chips_joined, deployment_scores_pdf, year)