# Creation of TIR Landsat 8 Chips for Land Cover

This notebook creates the Landsat 8 TIR Band 10 image chips for land cover from the EarthAI catalog.

Note that we still may have to convert from TOA to brightness temperature, following these guidelines: https://www.usgs.gov/land-resources/nli/landsat/using-usgs-landsat-level-1-data-product. It's not clear yet that this step is essential.

Documentation on Landsat 8 L1TP:
https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/atoms/files/LSDS-1656_%20Landsat_Collection1_L1_Product_Definition-v2.pdf

In [None]:
# Import required packages
from earthai.init import *
import earthai.chipping.strategy

import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

from pyrasterframes.rf_types import TileUDT
from pyspark.sql.functions import udf

import geopandas as gpd
import pandas as pd
import folium
import folium.plugins

import os
import rasterio
from rasterio.plot import show

import boto3

%matplotlib inline

## Get land cover locations in China

In [None]:
# GeoJSON holding the land cover polygons in China
land_geojson = '/home/jovyan/sfi-asset-level-data/src/main/resources/cement_steel_land_geoms/landcover_datasetv4_UTM_CHINA_exactlocPOLYS_1200m.geojson'

# Load the polygons, keep only the identifier and geometry columns, reproject
# to WGS84 (OGC:CRS84) for use w/ the Earth OnDemand API, and expose the
# identifier as `uid` -- all in one chain so the cell can be re-run safely
land_gpd = (
    gpd.read_file(land_geojson)[['id', 'geometry']]
       .to_crs('OGC:CRS84')
       .rename(columns={'id': 'uid'})
)

# Display the sites (total of 2002 land cover)
land_gpd

In [None]:
# Plot land cover polygons on an interactive folium map
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
land_polys = folium.features.GeoJson(land_gpd.to_json(), style_function=style_function)

# Centre the map on the mean of the polygon centroids (centroids computed once)
land_centroids = land_gpd.geometry.centroid
m = folium.Map(location=[land_centroids.y.mean(),
                         land_centroids.x.mean()],
               zoom_start=4)

# `add_child` is the supported API; `add_children` is a deprecated alias
m.add_child(land_polys)
m

## Get catalog of Landsat 8 scenes that intersect with land cover

In [None]:
# Query EarthAI Catalog to find L8 scenes that intersect with land cover polygons
# To best match exploratory work, we want a high quality image for each of:
#   - January, 2018
#   - January, 2017
#   - April, 2017

# Do NOT impose a maximum cloud cover filter (max_cloud_cover=100): since sites
# are small, a scene with high overall cloud coverage may still be relatively
# clear over the small region we need. The highest quality scenes are selected
# after the masking steps further below.

def _read_l8_catalog(start_datetime, end_datetime):
    """Return the catalog of 'landsat8_l1tp' scenes intersecting the land cover
    polygons between `start_datetime` and `end_datetime`."""
    return earth_ondemand.read_catalog(
        land_gpd.geometry,
        start_datetime=start_datetime,
        end_datetime=end_datetime,
        max_cloud_cover=100,
        collections='landsat8_l1tp'
    )

catalog_2018_01 = _read_l8_catalog('2018-01-01', '2018-01-31')  # January 2018
catalog_2017_01 = _read_l8_catalog('2017-01-01', '2017-01-31')  # January 2017
catalog_2017_04 = _read_l8_catalog('2017-04-01', '2017-04-30')  # April 2017

In [None]:
# Spatially join each month's scene catalog to the land cover sites
land_cat_2018_01 = gpd.sjoin(land_gpd, catalog_2018_01)
land_cat_2017_01 = gpd.sjoin(land_gpd, catalog_2017_01)
land_cat_2017_04 = gpd.sjoin(land_gpd, catalog_2017_04)


def _print_catalog_summary(label, joined_cat):
    """Print coverage statistics for one joined site/scene catalog.

    Returns a tuple (number of distinct sites, number of distinct L8 scenes).
    """
    site_cnt = joined_cat['uid'].nunique()   # sites with at least one scene
    scene_cnt = joined_cat['id'].nunique()   # distinct scene ids
    print(label)
    print("----------------------------------------------")
    print("Number of land cover with L8 imagery: ", site_cnt)
    print("Total number of catalog entries:", joined_cat['uid'].count())
    print("Number of unique L8 scenes:", scene_cnt)
    return site_cnt, scene_cnt


# January 2018
land_2018_01_plnt_cnt, land_2018_01_L8_cnt = _print_catalog_summary("January 2018", land_cat_2018_01)

# January 2017
land_2017_01_plnt_cnt, land_2017_01_L8_cnt = _print_catalog_summary("January 2017", land_cat_2017_01)

# April 2017
land_2017_04_plnt_cnt, land_2017_04_L8_cnt = _print_catalog_summary("April 2017", land_cat_2017_04)

In [None]:
# Plot land polygons together with the January 2018 scene footprints - testing
style_function = lambda x: {'fillColor': '#f003fc', 'color': '#f003fc'}
land_polys = folium.features.GeoJson(land_gpd.to_json(), style_function=style_function)

# Centre the map on the mean of the polygon centroids (centroids computed once)
land_centroids = land_gpd.geometry.centroid
m = folium.Map(location=[land_centroids.y.mean(),
                         land_centroids.x.mean()],
               zoom_start=3)

# Plot L8 scene boundaries
l8_gpd2p = catalog_2018_01[['id','geometry']]
style_function = lambda x: {'fillColor': '#32a852', 'color': '#32a852'}
l8_polys = folium.features.GeoJson(l8_gpd2p.to_json(), style_function=style_function)

# Scene footprints go on first so the site polygons are drawn on top of them.
# `add_child` is the supported API; `add_children` is a deprecated alias.
m.add_child(l8_polys)
m.add_child(land_polys)
m

## Read and create image chips for land cover

In [None]:
def _read_b10_chips(site_catalog):
    """Read centroid-centered chips for every row of a joined site/scene catalog.

    Only Band 10 and the QA band are read, using a CentroidCentered(35) chipping
    strategy. The result keeps just the columns of interest and is repartitioned
    by site `uid`.
    """
    chips = spark.read.chip(
        site_catalog, ['B10', 'BQA'],
        chipping_strategy=earthai.chipping.strategy.CentroidCentered(35))
    return chips.select('uid', 'datetime', 'B10', 'BQA', 'id', 'B10_path') \
                .repartition('uid')


land_chip_2018_01 = _read_b10_chips(land_cat_2018_01)  # January 2018
land_chip_2017_01 = _read_b10_chips(land_cat_2017_01)  # January 2017
land_chip_2017_04 = _read_b10_chips(land_cat_2017_04)  # April 2017

In [None]:
# Mask chips w/ QA Band
# Example notebook discussing masking in more detail: /home/jovyan/examples/tutorials/geo-ops/masking-landsat8.ipynb
# Landsat 8 Collection 1 Tier 1 QA band description:
#   https://www.usgs.gov/land-resources/nli/landsat/landsat-collection-1-level-1-quality-assessment-band?qt-science_support_page_related_con=0#qt-science_support_page_related_con

def _mask_b10_by_bqa(chips):
    """Mask Band 10 tiles with the BQA band and add cell-count statistics.

    A tile must have a NoData value defined before a mask can be applied.
    Landsat 8 measurement bands have a cell type of uint16raw (no NoData), so
    B10 is first converted to uint16, whose NoData value is 0; zero-valued
    cells are then treated as NoData (in Landsat 8 these correspond to the BQA
    fill areas).

    Columns added, in order:
      B10_uint16        B10 converted to uint16 (NoData = 0), otherwise unmasked
      B10_masked        B10_uint16 with the BQA bit masks applied
      tot_cell_count    data cells in the raw B10 tile
      unmsk_cell_count  data cells remaining in B10_masked
      mask_fraction     1 - unmsk_cell_count / tot_cell_count
    """
    with_nodata = chips.withColumn('B10_uint16', rf_convert_cell_type('B10', 'uint16'))

    masked = (
        with_nodata
        # designated fill = yes
        .withColumn('B10_masked', rf_mask_by_bit('B10_uint16', 'BQA', 0, 1))
        # cloud = yes
        .withColumn('B10_masked', rf_mask_by_bit('B10_masked', 'BQA', 4, 1))
        # cloud shadow conf is medium or high
        .withColumn('B10_masked', rf_mask_by_bits('B10_masked', 'BQA', 7, 2, [2, 3]))
        # cirrus conf is medium or high
        .withColumn('B10_masked', rf_mask_by_bits('B10_masked', 'BQA', 11, 2, [2, 3]))
    )

    counted = (
        masked
        .withColumn('tot_cell_count', rf_data_cells('B10'))
        .withColumn('unmsk_cell_count', rf_data_cells('B10_masked'))
    )
    return counted.withColumn(
        'mask_fraction',
        (1.0 - F.col('unmsk_cell_count')/F.col('tot_cell_count')))


land_chip_2018_01_masked = _mask_b10_by_bqa(land_chip_2018_01)  # January 2018
land_chip_2017_01_masked = _mask_b10_by_bqa(land_chip_2017_01)  # January 2017
land_chip_2017_04_masked = _mask_b10_by_bqa(land_chip_2017_04)  # April 2017

In [None]:
# For each site, keep the full-size chip(s) with the highest number of unmasked cells

# Number of cells in a complete 35 x 35 chip; smaller chips are a rare
# edge case and are discarded
FULL_CHIP_CELLS = 35 * 35  # 1225


def _best_full_size_chips(masked_chips):
    """Return, per `uid`, the full-size chip(s) with the most unmasked cells.

    Undersized chips are removed BEFORE the per-site maximum is computed.
    Computing the maximum over all chips (as was done previously) let an
    undersized chip hold the maximum, in which case the site was dropped
    entirely even though a usable full-size chip existed.

    The input frame is not modified or reassigned, so this cell can be re-run
    without accumulating duplicate `max_unmsk_cell_count` columns.

    The result has the input columns plus `max_unmsk_cell_count`; ties are all
    kept (several rows per site are possible).
    """
    full_size = masked_chips.filter(F.col('tot_cell_count') == FULL_CHIP_CELLS)
    site_max = full_size.groupby('uid') \
                        .agg(F.max('unmsk_cell_count').alias('max_unmsk_cell_count'))
    return full_size.join(site_max, 'uid', 'inner') \
                    .filter(F.col('unmsk_cell_count') == F.col('max_unmsk_cell_count'))


land_chip_2018_01_fltr = _best_full_size_chips(land_chip_2018_01_masked)  # January 2018
land_chip_2017_01_fltr = _best_full_size_chips(land_chip_2017_01_masked)  # January 2017
land_chip_2017_04_fltr = _best_full_size_chips(land_chip_2017_04_masked)  # April 2017

In [None]:
# Keep exactly one sufficiently clear chip per site
# Take B10 with NoData, unmasked (the `B10_uint16` column)

# Chips with this fraction of masked cells or more are rejected
MAX_MASK_FRACTION = 0.25


def _one_chip_per_site(chips, suffix):
    """Reduce `chips` to a single row per `uid`, suffixing the output columns.

    When a site still has more than one candidate chip, the one with the
    earliest `datetime` (then smallest scene `id`) is chosen. An explicit
    ordering is used because `F.first` after a `groupby` depends on row order,
    which Spark does not guarantee, so the selected chip could differ from run
    to run.

    Output columns: uid, datetime_<suffix>, B10_<suffix>, id_<suffix>,
    B10_path_<suffix>.
    """
    pick_order = Window.partitionBy('uid').orderBy('datetime', 'id')
    return chips.withColumn('_pick', F.row_number().over(pick_order)) \
                .filter(F.col('_pick') == 1) \
                .select('uid',
                        F.col('datetime').alias('datetime_' + suffix),
                        F.col('B10_uint16').alias('B10_' + suffix),
                        F.col('id').alias('id_' + suffix),
                        F.col('B10_path').alias('B10_path_' + suffix))


# January 2018
land_chip_2018_01_c25 = land_chip_2018_01_fltr.filter(F.col('mask_fraction') < MAX_MASK_FRACTION)
land_chip_2018_01_unq = _one_chip_per_site(land_chip_2018_01_c25, 'Jan18')

# January 2017
land_chip_2017_01_c25 = land_chip_2017_01_fltr.filter(F.col('mask_fraction') < MAX_MASK_FRACTION)
land_chip_2017_01_unq = _one_chip_per_site(land_chip_2017_01_c25, 'Jan17')

# April 2017
land_chip_2017_04_c25 = land_chip_2017_04_fltr.filter(F.col('mask_fraction') < MAX_MASK_FRACTION)
land_chip_2017_04_unq = _one_chip_per_site(land_chip_2017_04_c25, 'Apr17')

## Join TIR chips

In [None]:
# Combine the three per-date chip sets into a single RasterFrame keyed by `uid`.
# Inner joins are used so that only sites with a chip for all three dates survive.
land_chips_joined = (
    land_chip_2018_01_unq
    .join(land_chip_2017_01_unq, on='uid', how='inner')
    .join(land_chip_2017_04_unq, on='uid', how='inner')
)

## Write chips out as GeoTIFFs

In [None]:
# Output path (putting on scratch to test)
output_path = '/scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704'

# Create a column of unique filenames
# Format: [uid]_landcover_v4_B10_201801_201701_201704
# `F.lit` is used explicitly rather than relying on a bare `lit` being
# provided by the `from earthai.init import *` star import.
# Re-running this cell is safe: withColumn replaces an existing column of the same name.
land_chips_joined = land_chips_joined.withColumn(
    'file_path_name',
    F.concat_ws('_', F.col('uid'), F.lit('landcover_v4_B10_201801_201701_201704')))

In [None]:
# Columns to attach to each written chip as metadata: the site `uid` followed
# by the scene id, acquisition datetime and B10 source path for each date
chip_metadata_cols = ['uid'] + [
    field + '_' + date_tag
    for date_tag in ('Jan18', 'Jan17', 'Apr17')
    for field in ('id', 'datetime', 'B10_path')
]

# Write out chips, one file per row named by `file_path_name`, plus a catalog
land_chips_joined.write.chip(output_path,
                             filenameCol='file_path_name',
                             catalog=True,
                             metadata=chip_metadata_cols)

In [None]:
!ls -lR /scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704

In [None]:
# Check out what's in one of the chips for fun: file metadata, dataset-level
# tags, per-band tags, and a plot of the first band

# Keep only GeoTIFFs (the output directory may also hold non-raster files,
# e.g. whatever `catalog=True` wrote -- TODO confirm) and sort them, since
# os.listdir returns entries in arbitrary order
tiffs = sorted(f for f in os.listdir(output_path)
               if f.lower().endswith(('.tif', '.tiff')))
if not tiffs:
    raise FileNotFoundError('No GeoTIFF chips found in ' + output_path)

with rasterio.open(os.path.join(output_path, tiffs[0])) as src:
    for k, v in src.meta.items():
        print(k, '\t\t', v)

    print('\n', 'T A G S :')
    for k, v in src.tags().items():
        print(k, '\t\t', v)

    print('\n B A N D S :')
    for b in range(1, src.count + 1):
        # Iterate over (key, value) pairs of this band's tags; previously only
        # the keys were iterated and a stale `v` from the loop above was printed
        for k, v in src.tags(b).items():
            print("\tBand", b, '\t\t', k, '\t\t', v)
        print("\tBand", b, '\t\t', src.colorinterp[b-1])
    ax = show(src.read(1), transform=src.transform)

In [None]:
!tar -cvf /scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704.tar /scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704

In [None]:
!ls -lh /scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704.tar

_The workflow below is temporary, because AWS credentials are not working in beta: it copies the tar file to local storage, and a separate script then uploads it to S3 from a different instance._

In [None]:
!cp /scratch/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704.tar /home/jovyan/sfi-asset-level-data/src/main/resources/ALD_L8_TIR_landcover_chips_v4_B10_201801_201701_201704.tar

## Upload tar file to S3 bucket