# Deployment of TIR Landsat 8 Macrolocalization Model - 2020

This notebook deploys the TIR Landsat 8 macrolocalization models for cement and steel plants for the year 2020.

## Import required libraries

In [None]:
!pip install fastai==1.0.61

In [None]:
from earthai.all import *
import earthai.chipping.strategy as chp
import pyspark.sql.functions as pys
from pyspark.sql.functions import lit, col, udf

import geopandas as gpd
import pandas as pd
import rasterio

import os
import shutil
import boto3
import glob
import time

from fastai import *
from fastai.vision import *

## Create Spark Session

* Important to do this before defining the udfs for scoring
* Set number of partitions on par with the number of catalog items per scene

In [None]:
# A single partition count drives both the default parallelism and the
# number of shuffle partitions of the Spark session.
partitions = 2500
spark = create_earthai_spark_session(
    **{
        conf_key: partitions
        for conf_key in ("spark.default.parallelism", "spark.sql.shuffle.partitions")
    }
)

## Define input/output files and paths, and parameters

### Parameters

* `chip_size` is the size of chips (length) to create (in pixels)
* `unmsk_frac` is the minimum threshold on the fraction of unmasked cells required to keep site in sample
* `year` defines the year for layer 1 (thermal band, in January); layers 2 and 3 (thermal band, in January and April, respectively) are `year - 1`
* `scene_subset` is set to 1 or 2. This divides the scoring into two pieces so it can run on two servers at the same time: 1 processes the first set of scenes; 2 processes the second.

In [None]:
# Side length, in pixels, of each square chip.
chip_size = 35 # 1.05 km for Landsat 8
# Minimum fraction of unmasked cells a chip needs in order to be kept.
unmsk_frac = 0.75

# Kept as a string: it is concatenated into query dates and output file names.
year = '2020'

# Which of the two scene lists this run scores (1 or 2).
scene_subset = 2

### Input files and paths

* `s3_path` defines S3 high-level folder for L8 TIR macro-localization data
* `chip_cntr_tar` is the tar with GeoJSON files of chip centroids for the deployment region
* `CEMENT_MODEL_PATH` is the path on S3 to the Densenet161 cement model
* `STEEL_MODEL_PATH` is the path on S3 to the Resnet50 steel model
* `LOCAL_DIR` specifies where to put files locally for analysis

In [None]:
# Key prefix (folder) in the S3 bucket for the deployment inputs and outputs.
s3_path = 'L8-TIR-macro-localization-model-deployment'
# Tar archive, stored under s3_path, holding the chip-centroid GeoJSON files.
chip_cntr_tar = 'L8-deployment-chip-centroids-CHN-10km-pthsh0.002.tar'

# Bucket keys of the exported model pickles; the file name (minus '.pkl')
# becomes the local model folder name.
CEMENT_MODEL_PATH = 'L8-TIR-macro-localization-model-build3/L8-TIR-model-results3/densenet161_cement_binary_final.pkl'
STEEL_MODEL_PATH = 'L8-TIR-macro-localization-model-build3/L8-TIR-model-results3/resnet50_steel_binary_final.pkl'

# Local working directory; paths are built by plain concatenation, so the
# trailing slash is required.
LOCAL_DIR = '/scratch/'

### Output files and paths

* `output_path` defines (temporary) local place of storage
* `output_score_tar` defines the output tar of score GeoJSONs (one for each scene)

In [None]:
# The per-scene GeoJSON prefix is the common stem; the local output folder
# appends the subset number to it, and the tar is named after that folder.
output_gjson_prefix = f'L8-deployment-chip-scores-CHN-10km-pthsh0.002_{year}_'
output_path = f'{output_gjson_prefix}set{scene_subset}'
output_score_tar = f'{output_path}.tar'

In [None]:
# Create the local output folder. exist_ok avoids the check-then-create race
# and keeps this cell safe to re-run after a restart.
os.makedirs(output_path, exist_ok=True)

## Download Models and Define Scoring Functions

In [None]:
# Shared S3 handle; `bucket` is used for every download and upload in this notebook.
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

### Download models and load learners

In [None]:
def download_model(MODEL_PATH):
    """Download a model pickle from the S3 bucket into its own local folder.

    The folder is ``LOCAL_DIR`` plus the pickle's file name without the
    ``.pkl`` extension; the file is always saved there as ``export.pkl``.

    Parameters
    ----------
    MODEL_PATH : str
        Key of the model pickle inside ``bucket``.
    """
    model_name = MODEL_PATH.split("/")[-1].replace(".pkl", "")
    model_dir = LOCAL_DIR + model_name

    # Make the destination folder on first use
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    bucket.download_file(MODEL_PATH, model_dir + "/export.pkl")

In [None]:
# Fetch the cement model first, then the steel model
for model_key in (CEMENT_MODEL_PATH, STEEL_MODEL_PATH):
    download_model(model_key)

In [None]:
# Each learner is loaded from LOCAL_DIR + <pickle name without '.pkl'>, the
# same folder expression download_model uses for its export.pkl.
# NOTE(review): load_learner presumably unpickles export.pkl - only point it
# at trusted model files.
cement_model = load_learner(LOCAL_DIR + CEMENT_MODEL_PATH.split("/")[-1].replace(".pkl", ""))
steel_model = load_learner(LOCAL_DIR + STEEL_MODEL_PATH.split("/")[-1].replace(".pkl", ""))

### Define scoring function for PNGs

In [None]:
def score_pngs(path):
    """Score every image under ``path`` with the cement and steel models.

    Parameters
    ----------
    path : str
        Folder laid out for fastai, with the images in an ``all`` sub-folder.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns ``scene_id``, ``tile_id``,
        ``cement_prob`` and ``steel_prob``. ``tile_id`` is the file name up
        to its first ``.``; ``scene_id`` is the first two ``-``-separated
        fields of the file name.
    """
    # Fastai data bunch over the folder, normalized with the ImageNet stats
    data = (ImageDataBunch.from_folder(path, train='all', bs=16, num_workers=0, seed=42).normalize(imagenet_stats))

    n_images = len(data.train_ds)
    scene_ids, tile_ids = [], []
    cement_scores, steel_scores = [], []

    for idx in range(n_images):

        # NOTE(review): cement reads probability index 0 while steel reads
        # index 1 - presumably this matches each model's class order; confirm.
        cement_pred = cement_model.predict(data.train_ds.x[idx])
        cement_scores.append(to_np(cement_pred[2])[0].item())

        steel_pred = steel_model.predict(data.train_ds.x[idx])
        steel_scores.append(to_np(steel_pred[2])[1].item())

        # Chip metadata is recovered from the image file name
        file_name = str(data.items[idx]).split('/')[-1]
        scene_ids.append('-'.join(file_name.split('-')[0:2]))
        tile_ids.append(file_name.split('.')[0])

    return pd.DataFrame({'scene_id': scene_ids,
                         'tile_id': tile_ids,
                         'cement_prob': cement_scores,
                         'steel_prob': steel_scores})

## Define EOD Catalog Read and Chipping Functions

### Get catalog of Landsat 8 scenes that intersect with chip centroids

Queries EarthAI Catalog to find L8 scenes that intersect with chip centroids.

* Returns the scenes found for:
  * January Year 2
  * January Year 1
  * April Year 1
* Joins each catalog back to the chip centroids for chipping

In [None]:
def eod_read_catalog(geom, grid_id, year):
    """Query the Landsat 8 catalog for the three dates the models need.

    Three date windows are queried: January of ``year``, and January and
    April of the year before. Every non-empty catalog is spatially joined
    back onto ``geom``; an empty catalog is returned as-is so the caller can
    test its length.

    Parameters
    ----------
    geom : geopandas.GeoDataFrame
        Chip centroids; used both as the query geometry and as the left side
        of the spatial join.
    grid_id : str
        Grid ID the query is restricted to.
    year : str
        Four-digit year of the most recent (January) layer.

    Returns
    -------
    dict
        Keys ``site_cat_year2_01``, ``site_cat_year1_01`` and
        ``site_cat_year1_04``.
    """
    prev_year = str(int(year) - 1)

    def _catalog_for(start_date, end_date):
        # One catalog query for a date window, joined to the centroids if non-empty
        catalog = earth_ondemand.read_catalog(
            geo=geom,
            start_datetime=start_date,
            end_datetime=end_date,
            max_cloud_cover=100,
            collections='landsat8_l1tp',
            grid_ids=[grid_id]
        )
        if len(catalog) > 0:
            catalog = gpd.sjoin(geom, catalog)
        return catalog

    # Queries run in this order: January Year 2, January Year 1, April Year 1
    return {
        'site_cat_year2_01': _catalog_for(year+'-01-01', year+'-01-31'),
        'site_cat_year1_01': _catalog_for(prev_year+'-01-01', prev_year+'-01-31'),
        'site_cat_year1_04': _catalog_for(prev_year+'-04-01', prev_year+'-04-30'),
    }


## Create Image Chips

* Read and create image chips for 10km grid
* Select highest quality chips per site

In [None]:
def create_chips(site_cat, chip_size=35, unmsk_frac=0.75, col_suffix='JY2', repartition_size=partitions):
    """Build one thermal (B10) chip per tile, chosen by QA-band quality.

    The function first chips the ``BQA`` band to count unmasked cells per
    candidate chip, keeps the best candidate for each ``tile_id``, and then
    chips ``B10`` only for those winners.

    Parameters
    ----------
    site_cat : pandas/geopandas DataFrame
        Catalog joined to chip centroids; must carry ``tile_id`` and ``id``
        columns (they are the merge keys below).
    chip_size : int
        Chip side length in pixels.
    unmsk_frac : float
        Minimum fraction of unmasked cells a chip needs to be kept.
    col_suffix : str
        Suffix appended to the output ``B10``, ``id`` and ``datetime``
        column names.
    repartition_size : int
        Number of Spark partitions for both repartition calls. The default is
        the value of the global ``partitions`` at the time this function is
        defined.

    Returns
    -------
    Spark DataFrame or None
        Columns ``scene_id``, ``tile_id``, ``id_<suffix>``,
        ``datetime_<suffix>`` and ``B10_<suffix>``; ``None`` when no chip
        passes the quality filters.
    """
    
    # Create uniform, same-sized chips covering the deployment region
    # Filter out blank chips at edge of scenes
    # Handle rare edge case where returned chip is less than specified size (when reach edge of a scene)
    # Mask chips by QA band and compute count of unmasked cells
    # Remove chips with less than a minimum fraction of unmasked cells
    site_chips = spark.read.chip(site_cat, ['BQA'],
                                 chipping_strategy=chp.CentroidCentered(chip_size)) \
                      .select('scene_id', 'tile_id', 'id', 'BQA') \
                      .withColumn('mask', rf_make_constant_tile(1, chip_size, chip_size, 'uint16')) \
                      .withColumn('tot_cell_count', rf_data_cells('BQA')) \
                      .filter(pys.col('tot_cell_count') == chip_size*chip_size) \
                      .withColumn('BQA_min', rf_tile_min('BQA')) \
                      .filter(pys.col('BQA_min') > 1.0) \
                      .withColumn('mask', # designated fill = yes
                                  rf_mask_by_bit('mask', 'BQA', 0, 1)) \
                      .withColumn('mask', # cloud = yes
                                  rf_mask_by_bit('mask', 'BQA', 4, 1)) \
                      .withColumn('mask', # cloud shadow conf is medium or high
                                  rf_mask_by_bits('mask', 'BQA', 7, 2, [2, 3])) \
                      .withColumn('mask', # cirrus conf is medium or high
                                  rf_mask_by_bits('mask', 'BQA', 11, 2, [2, 3])) \
                      .withColumn('unmsk_cell_count', rf_data_cells('mask')) \
                      .filter(pys.col('unmsk_cell_count') >= unmsk_frac*chip_size*chip_size) \
                      .repartition(repartition_size, 'tile_id', 'id')
    
    # Find the chip(s) with the highest number of unmasked cells
    # If there's >1 chip (a tie) take the first record
    # (toPandas collects the per-chip counts to the driver; the tiles themselves are not selected)
    chpinf_pdf = site_chips.select('tile_id', 'id', 'unmsk_cell_count').toPandas()
    # Group on a copy of tile_id so that tile_id itself survives as a column for the merge below
    chpinf_pdf['grpid'] = chpinf_pdf['tile_id']    
    site_maxcnt = chpinf_pdf.sort_values('unmsk_cell_count', ascending=False) \
                            .groupby(['grpid']).first() \
                            .drop('unmsk_cell_count', axis=1)
    
    # Read in thermal band for highest quality chip
    # (the inner merge keeps only the catalog rows of the winning (tile_id, id) pairs)
    site_cat = site_cat.merge(site_maxcnt, on=['tile_id', 'id'], how='inner')
    
    if len(site_cat) > 0:
        # B10 is rescaled, multiplied by 65535 and stored as uint16
        # NOTE(review): presumably rf_rescale maps each tile to [0, 1], so the
        # product spans the full uint16 range - confirm against RasterFrames docs.
        site_chips_unq = spark.read.chip(site_cat, ['B10'],
                                     chipping_strategy=chp.CentroidCentered(chip_size)) \
                          .select('scene_id', 'tile_id', 'id', 'datetime', 'B10') \
                          .withColumn('B10'+'_'+col_suffix,
                                      rf_convert_cell_type(rf_local_multiply(rf_rescale(rf_convert_cell_type('B10', 'uint16')), 
                                                                             65535), 'uint16')) \
                          .drop('B10') \
                          .withColumnRenamed('id', 'id'+'_'+col_suffix) \
                          .withColumnRenamed('datetime', 'datetime'+'_'+col_suffix) \
                          .repartition(repartition_size, 'tile_id')
    else:
        # Nothing survived the quality filters; the caller checks for None
        site_chips_unq = None
    
    return(site_chips_unq)

## Convert GeoTIFFs to PNGs

In [None]:
def convert_image(tif_filename, png_filename):
    """Copy a raster to a new file using rasterio's PNG driver.

    The output reuses the input's rasterio profile with only the ``driver``
    entry switched to ``'PNG'``; all bands are copied unchanged.

    Parameters
    ----------
    tif_filename : str
        Path of the raster to read.
    png_filename : str
        Path of the PNG to write.
    """
    # Grab the profile and the pixel data from the source raster
    with rasterio.open(tif_filename) as src:
        png_profile = src.profile
        png_profile['driver'] = 'PNG'
        pixels = src.read()

    # Write the same pixels out through the PNG driver
    with rasterio.open(png_filename, 'w', **png_profile) as sink:
        sink.write(pixels)

## Create PNGs from RasterFrame

In [None]:
def png_from_rf(rf):
    """Write a RasterFrame out as GeoTIFFs, then convert each one to PNG.

    GeoTIFFs go to ``geotiffs/`` (named by ``tile_id``); PNGs go to
    ``pngs/all/``, the folder layout ``score_pngs`` reads. Both ``pngs`` and
    ``pngs/all`` are created here with ``os.mkdir``, so ``pngs`` must not
    already exist. Elapsed times of both stages are printed.

    Parameters
    ----------
    rf : Spark DataFrame
        RasterFrame with a ``tile_id`` column used for the file names.
    """
    # Stage 1: RasterFrame -> GeoTIFFs
    print('(Writing GeoTIFFs)')
    tif_start = time.time()
    rf.write.chip('geotiffs', filenameCol='tile_id', catalog=False)
    print('(', time.time()-tif_start, ')')
    tif_paths = glob.glob('geotiffs/*.tif')

    # Folder layout expected by Fastai
    os.mkdir('pngs')
    os.mkdir('pngs/all')
    conversions = [
        (tif_path, tif_path.replace('.tif', '.png').replace('geotiffs/', 'pngs/all/'))
        for tif_path in tif_paths
    ]

    # Stage 2: GeoTIFFs -> PNGs
    print('(Writing PNGs)')
    png_start = time.time()
    for tif_path, png_path in conversions:
        convert_image(tif_path, png_path)
    print('(', time.time()-png_start, ')')

## Define Output Function

* Writes out scores to GeoJSON file

In [None]:
def write_chip_scores(rf, pdf, year):
    """Attach chip footprints to the scores and save them as one GeoJSON.

    The footprint of each chip is the extent of its ``B10_JY2`` tile,
    reprojected to EPSG:4326. The output file is written under
    ``output_path`` and named from ``output_gjson_prefix`` plus the
    ``scene_id`` of the first joined row.

    Parameters
    ----------
    rf : Spark DataFrame
        RasterFrame with ``scene_id``, ``tile_id`` and ``B10_JY2`` columns.
    pdf : pandas.DataFrame
        Scores keyed by ``scene_id`` and ``tile_id``.
    year : str
        Value stored in the ``year`` column of the output.
    """
    # Tile extent of the January Year 2 chip, reprojected to lon/lat
    footprint = st_reproject(st_geometry(rf_extent('B10_JY2')),
                             rf_crs('B10_JY2'),
                             pys.lit('EPSG:4326'))
    geo_pdf = rf.withColumn('geometry', footprint) \
                .select('scene_id', 'tile_id', 'geometry') \
                .toPandas()
    geo_pdf['year'] = year
    geo_gdf = gpd.GeoDataFrame(geo_pdf, geometry='geometry', crs='EPSG:4326')

    # Keep only the chips that have both a footprint and a score
    scores_gdf = pd.merge(geo_gdf, pdf, how='inner', on=['scene_id', 'tile_id'])

    # NOTE(review): label 0 must exist, i.e. the join above must be non-empty.
    first_scene = scores_gdf.scene_id[0]
    output_score_file = f'{output_path}/{output_gjson_prefix}{first_scene}.geojson'
    scores_gdf.to_file(output_score_file, driver='GeoJSON')

## Download and Read in Chip Centroids from 10km Grid

In [None]:
# Fetch the chip-centroid tar from the S3 bucket into LOCAL_DIR, keeping its file name.
bucket.download_file(s3_path+'/'+chip_cntr_tar, LOCAL_DIR+chip_cntr_tar)

In [None]:
!tar -xf {LOCAL_DIR+chip_cntr_tar} -C {LOCAL_DIR}

In [None]:
# The directory name is the tar name without its extension; the file list is
# sorted so that the scene order is the same on every run.
chip_cntr_dir = chip_cntr_tar.replace('.tar', '')
chip_cntr_gjsons = sorted(os.listdir(LOCAL_DIR+chip_cntr_dir))

### Split scoring effort in two

In [None]:
# Files whose three-digit number (parsed from the name) is <= 125 go to
# list1; all others go to list2.
list1, list2 = [], []
for f in chip_cntr_gjsons:
    scene_ind3 = int(f.split('.')[1].split('-')[-1][0:3])
    (list1 if scene_ind3 <= 125 else list2).append(f)

In [None]:
# Pick the half of the scenes this run is responsible for. Any other value
# used to leave scene_files/scene_ids undefined and fail later with a
# NameError, so reject it here with a clear message.
if scene_subset == 1:
    subset_files = list1
elif scene_subset == 2:
    subset_files = list2
else:
    raise ValueError('scene_subset must be 1 or 2, got ' + repr(scene_subset))

# Full local path of each centroid file, and the scene ID taken from the file
# name (the text between the last '_' and the following '.').
scene_files = [LOCAL_DIR+chip_cntr_dir+'/'+f for f in subset_files]
scene_ids = [f.split('_')[-1].split('.')[0] for f in subset_files]

In [None]:
# Number of scenes in the selected subset
print(len(scene_ids))

### Fail-safe

If the server crashes, this picks up where we left off, so scenes that were already scored do not have to be rerun.

In [None]:
# Only GeoJSON score files mark a finished scene; anything else that may be
# present in the output folder is ignored.
scored_scene_list = sorted(name for name in os.listdir(output_path)
                           if name.endswith('.geojson'))
if len(scored_scene_list) > 0:
    # The scene ID is the text after the last '_' in the second
    # dot-separated field of the file name.
    last_scored_scene = scored_scene_list[-1].split('.')[1].split('_')[-1]

    # Fail with an explanation instead of the bare list.index() error, e.g.
    # when this cell is run twice or the folder belongs to another subset.
    if last_scored_scene not in scene_ids:
        raise ValueError('Last scored scene ' + repr(last_scored_scene) +
                         ' is not in the current scene list; re-run the'
                         ' subset selection cell or check ' + repr(output_path))

    # Drop everything up to and including the last scored scene
    last_ind = scene_ids.index(last_scored_scene)
    scene_files = scene_files[last_ind+1:]
    scene_ids = scene_ids[last_ind+1:]

In [None]:
# Number of scenes still to be scored after skipping the ones already done
print(len(scene_ids))

### Temporary code to score specific scenes

## Loop over Scenes, Create Chips, and Score

For each scene:

* Get catalog of Landsat 8 scenes that intersect with chip centroids
* Read and create image chips
* Join TIR chips at different dates into single RasterFrame and score models
* Write scores out to file

In [None]:
# Remove scratch folders left behind by an earlier run: png_from_rf creates
# 'pngs' with os.mkdir, which fails if the folder already exists.
for tmp_dir, notice in (('geotiffs', '(Deleting geotiffs)'),
                        ('pngs', '(Deleting pngs)')):
    if os.path.exists(tmp_dir):
        print(notice)
        shutil.rmtree(tmp_dir)

In [None]:
# Start loop over scenes.
# For each scene: query the catalog for the three dates, build the best chip
# per tile for each date, join the three dates, write temporary PNGs, score
# them, and save the scores as GeoJSON. Scenes with a missing date or without
# usable chips are reported and skipped.
for scene_id, scene_file in zip(scene_ids, scene_files):
    
    # Track time
    si_all = time.time()
    
    # Read in chips to GeoDataFrame
    chip_cntr_gdf = gpd.read_file(scene_file)
    chip_cnt = len(chip_cntr_gdf)
    print('Scene ', scene_id, ': Total chip count = ', chip_cnt)
    
    # Spark rejects a partition count of zero, which round(chip_cnt/4) yields
    # for scenes with only one or two chips, so use at least one partition.
    n_partitions = max(1, round(chip_cnt/4))
    
    # Get catalog of Landsat 8 scenes that intersect with chip centroids
    site_cat_list = eod_read_catalog(chip_cntr_gdf, scene_id, year)
    
    # If one or more scenes missing for specified dates, do not score
    if len(site_cat_list['site_cat_year2_01']) == 0 or \
       len(site_cat_list['site_cat_year1_01']) == 0 or \
       len(site_cat_list['site_cat_year1_04']) == 0:
        
        print('Scene ', scene_id, ': Cannot score for year ', year, ' (one or more scenes unavailable)')
        print('Scene ', scene_id, ': Total time = ', (time.time() - si_all)/60., ' min')
    
    # If all dates available, chip and score
    else:
        
        # Read and create image chips
        # ---------------------------
        
        # January Year 2
        print('Scene ', scene_id, ': Creating chips for January ', year)
        st = time.time()
        site_chip_year2_01_unq = create_chips(site_cat_list['site_cat_year2_01'], 
                                              chip_size=chip_size, 
                                              unmsk_frac=unmsk_frac, 
                                              col_suffix='JY2',
                                              repartition_size=n_partitions)
        print('(', time.time() - st, ')')
        
        # January Year 1
        st = time.time()
        print('Scene ', scene_id, ': Creating chips for January ', str(int(year)-1))
        site_chip_year1_01_unq = create_chips(site_cat_list['site_cat_year1_01'], 
                                              chip_size=chip_size, 
                                              unmsk_frac=unmsk_frac, 
                                              col_suffix='JY1',
                                              repartition_size=n_partitions)
        print('(', time.time() - st, ')')
        
        # April Year 1
        st = time.time()
        print('Scene ', scene_id, ': Creating chips for April ', str(int(year)-1))
        site_chip_year1_04_unq = create_chips(site_cat_list['site_cat_year1_04'], 
                                              chip_size=chip_size, 
                                              unmsk_frac=unmsk_frac, 
                                              col_suffix='AY1',
                                              repartition_size=n_partitions)
        print('(', time.time() - st, ')')
        
        # Join TIR chips and score
        if (site_chip_year2_01_unq is not None) and (site_chip_year1_01_unq is not None) and \
           (site_chip_year1_04_unq is not None):
            
            # Join TIR chips; cached because the frame is counted, written to
            # GeoTIFFs and later converted to pandas
            st = time.time()
            site_chips_joined = site_chip_year2_01_unq.join(site_chip_year1_01_unq, 
                                                            on=['scene_id', 'tile_id'], how='inner') \
                                                      .join(site_chip_year1_04_unq, 
                                                            on=['scene_id', 'tile_id'], how='inner') \
                                                      .repartition(n_partitions, 'tile_id') \
                                                      .cache()
            
            try:
                if (site_chips_joined.count() > 0):
                
                    # Write out temporary PNGs to score
                    print('Scene ', scene_id, ': Writing out GeoTIFFs and PNGs to score')
                    png_from_rf(site_chips_joined)
                    print('(', time.time() - st, ')')
                
                    # Score PNGs
                    st = time.time()
                    print('Scene ', scene_id, ': Scoring chips')
                    deployment_scores_pdf = score_pngs('pngs')
                    print('(', time.time() - st, ')')
                
                    # Delete temporary output paths for geotiffs and pngs and all files if they exist
                    if os.path.exists('geotiffs'):
                        print('(Deleting geotiffs)')
                        st = time.time()
                        shutil.rmtree('geotiffs')
                        print('(', time.time()-st, ')')
                    if os.path.exists('pngs'):
                        print('(Deleting pngs)')
                        st = time.time()
                        shutil.rmtree('pngs')
                        print('(', time.time()-st, ')')
            
                    # Write scores to GeoJSON file
                    st = time.time()
                    print('Scene ', scene_id, ': Saving scores in GeoJSON')
                    write_chip_scores(site_chips_joined, deployment_scores_pdf, year)
                    print('(', time.time() - st, ')')
                
                    print('Scene ', scene_id, ': Wrote ', len(deployment_scores_pdf), ' chips for year ', year)
                    print('Scene ', scene_id, ': Total time = ', (time.time() - si_all)/60., ' min')
                    
                else:
                    print('Scene ', scene_id, ': Cannot score for year ', year, ' (no cloud-free chips available)')
                    print('Scene ', scene_id, ': Total time = ', (time.time() - si_all)/60., ' min')
            finally:
                # Release the cached frame so cached data does not pile up
                # across scenes, even if scoring this scene failed
                site_chips_joined.unpersist()
            
        else:
            print('Scene ', scene_id, ': Cannot score for year ', year, ' (no cloud-free chips available)')
            print('Scene ', scene_id, ': Total time = ', (time.time() - si_all)/60., ' min')

## Tar results and upload to S3

In [None]:
# Shell command that packs the whole output folder into a single tar archive
unix_code = ' '.join(['tar', '-cvf', output_score_tar, output_path])

In [None]:
# os.system returns the command's exit status; stop here on failure instead
# of going on to upload a missing or incomplete archive.
tar_status = os.system(unix_code)
if tar_status != 0:
    raise RuntimeError('tar failed with exit status ' + str(tar_status) + ': ' + unix_code)

In [None]:
# Upload the archive to the deployment folder of the bucket under its own name
bucket.upload_file(output_score_tar, f'{s3_path}/{output_score_tar}')