# Track progress of TIR Landsat 8 Macrolocalization Model

This notebook tallies up the progress on scoring the TIR Landsat 8 macrolocalization models for cement and steel plants.

## Import required libraries

In [None]:
import pandas as pd
import geopandas as gpd

import os
import shutil
import boto3
import glob

## Define input/output files and paths, and parameters

### Parameters

* `year` defines the year used for model deployment
* `scene_subset` set to 1 or 2. Scoring was divided in two pieces to run on two servers at the same time. 1 will process the first set of scenes; 2 will process the second. 
* `init_acct_file` set to True or False. If True, will initialize the scene accounting file by computing the total chip count per scene. Only need to do this once per subset.

In [None]:
# Deployment year; kept as a string because it is concatenated into the score
# tar name and the per-year count column name.
year = '2018'
# Which half of the scenes to tally: 1 or 2.
scene_subset = 2
# True: (re)build the scene accounting file; False: load the existing one.
init_acct_file = False

### Input files and paths

* `s3_path` defines S3 high-level folder for L8 TIR macro-localization data
* `chip_cntr_tar` is the tar with GeoJSON files of chip centroids for the deployment region
* `score_tar` defines the tar of score GeoJSONs (one for each scene)
* `LOCAL_DIR` specifies where to put files locally for analysis

In [None]:
# S3 folder (key prefix) holding the L8 TIR macro-localization data
s3_path = 'L8-TIR-macro-localization-model-deployment'

# Tar of chip-centroid GeoJSONs for the deployment region
chip_cntr_tar = 'L8-deployment-chip-centroids-CHN-10km-pthsh0.002.tar'

# Tar of score GeoJSONs for the selected year and scene subset
score_tar = ''.join([
    'L8-deployment-chip-scores-CHN-10km-pthsh0.002_',
    year,
    '_set',
    str(scene_subset),
    '.tar',
])

# Local working directory; keep the trailing slash, since paths are built
# by plain string concatenation
LOCAL_DIR = '/scratch/'

### Output files and paths

* `scene_acct_csv` defines csv file tallying number of chips scored for different years per scene. This is first created when `year = '2020'`, and updated for other years.

In [None]:
# Per-subset CSV that tallies total and scored chip counts for each scene
scene_acct_csv = ''.join([
    '../../resources/macro-loc-model-deployment/',
    'L8-deployment-scene_acct-CHN-10km-pthsh0.002_set',
    str(scene_subset),
    '.csv',
])

## Download and Read in Chip Centroids from 10km Grid

* Only necessary if `init_acct_file = True`

In [None]:
# S3 resource handle (credentials/region handling is not shown in this notebook)
s3 = boto3.resource('s3')
# Bucket object used for every download_file call in this notebook
bucket = s3.Bucket('sfi-shared-assets')

In [None]:
if init_acct_file:

    # Download and unpack tar file
    bucket.download_file(s3_path+'/'+chip_cntr_tar, LOCAL_DIR+chip_cntr_tar)
    !tar -xf {LOCAL_DIR+chip_cntr_tar} -C {LOCAL_DIR}
    
    chip_cntr_dir = chip_cntr_tar.replace('.tar', '')
    chip_cntr_gjsons = os.listdir(LOCAL_DIR+chip_cntr_dir)
    chip_cntr_gjsons.sort()
    
    # Divide data by set
    list1 = []
    list2 = []
    for f in chip_cntr_gjsons:
        scene_ind3 = int(f.split('.')[1].split('-')[-1][0:3])
        if scene_ind3 <= 125:
            list1.append(f)
        else:
            list2.append(f)
            
    if scene_subset == 1:
        scene_files = [LOCAL_DIR+chip_cntr_dir+'/'+f for f in list1]
        scene_ids = [f.split('_')[-1].split('.')[0] for f in list1]
    if scene_subset == 2:
        scene_files = [LOCAL_DIR+chip_cntr_dir+'/'+f for f in list2]
        scene_ids = [f.split('_')[-1].split('.')[0] for f in list2]
        
    print('Total number of scenes: ', len(scene_ids))

## Loop over Scenes, Tally Total Chip Count

* Only if `init_acct_file = True`
* Writes total count of chips per scene out to file
* If `init_acct_file = False`, reads in the scene accounting file

In [None]:
# If init_acct_file = True, create new accounting file and save
if init_acct_file:

    # Define output DataFrame
    scene_acct_pdf = pd.DataFrame(columns = ['scene_id', 'tile_cnt_tot'])
    
    # Loop over scenes
    for scene_id, scene_file in zip(scene_ids, scene_files):
    
        # Read in chip centroids
        chip_cntr_gdf = gpd.read_file(scene_file)
        chip_cnt = len(chip_cntr_gdf)
        
        # Write chip count to DataFrame
        scene_acct_pdf = scene_acct_pdf.append({'scene_id': scene_id,
                                                'tile_cnt_tot': chip_cnt},
                                               ignore_index=True)
        
        print('Scene ', scene_id, ': Total chip count = ', chip_cnt)
        
    # Save results to csv
    scene_acct_pdf.to_csv(scene_acct_csv, index=False)
    
# Otherwise, load in existing file
else:
    scene_acct_pdf = pd.read_csv(scene_acct_csv, index_col=False)
    print('Total number of scenes: ', len(scene_acct_pdf))

## Download and Read in Scores for Tiles in Scenes

In [None]:
# Fetch the score tar for the selected year/subset from S3 into LOCAL_DIR
bucket.download_file(s3_path+'/'+score_tar, LOCAL_DIR+score_tar)
# Unpack it under LOCAL_DIR via an IPython shell escape (only valid inside IPython/Jupyter)
!tar -xf {LOCAL_DIR+score_tar} -C {LOCAL_DIR}

In [None]:
# The tar is expected to unpack into a directory named after the tar itself
score_dir = score_tar.replace('.tar', '')

# glob returns matches in arbitrary order; sort for a deterministic scene order
score_gjsons = sorted(glob.glob(LOCAL_DIR+score_dir+'/*.geojson'))

In [None]:
# Scene id = text between the last underscore and the next dot in each score
# file path. Built in a single pass so that an empty file list yields an empty
# id list instead of leaving `scene_ids` unset or stale.
scene_ids = [f.split('_')[-1].split('.')[0] for f in score_gjsons]
print('Total number of scored scenes: ', len(scene_ids))

In [None]:
# Collect one record per scored scene and build the DataFrame once at the end.
# Row-by-row DataFrame.append is quadratic and no longer exists in
# pandas >= 2.0.
tile_records = []

# Loop over scenes
for scene_id, scene_file in zip(scene_ids, score_gjsons):

    # Read in scores; one row per scored chip
    score_gdf = gpd.read_file(scene_file)
    tile_cnt = len(score_gdf)

    tile_records.append({'scene_id': scene_id, 'tile_cnt_'+year: tile_cnt})

    print('Scene ', scene_id, ': Total scored chip count = ', tile_cnt)

# `columns` keeps the header even when nothing has been scored yet.
# Object dtype keeps the counts as plain ints, so a left merge that introduces
# NaN for unscored scenes does not upcast them to float (written as e.g. 12.0).
tile_acct_pdf = pd.DataFrame(tile_records,
                             columns=['scene_id', 'tile_cnt_'+year],
                             dtype=object)

## Merge scored chip count to total chip count DataFrame and Save

In [None]:
# Attach this year's scored-chip counts to the per-scene totals. Scenes
# without a score file keep their row (left join) and get NaN for the count.
# A count column for this year left over from a previous run is dropped first.
# Otherwise the merge would emit suffixed duplicates ('..._x' / '..._y') and
# the plain 'tile_cnt_<year>' column would no longer exist.
scene_acct_pdf = pd.merge(
    scene_acct_pdf.drop(columns=['tile_cnt_'+year], errors='ignore'),
    tile_acct_pdf,
    how='left',
    on='scene_id',
)

In [None]:
# Scenes with no score file come out of the left merge as NaN; count them as
# 0 scored chips
scene_acct_pdf.loc[
    pd.isna(scene_acct_pdf['tile_cnt_'+year]),
    ['tile_cnt_'+year],
] = 0

In [None]:
# Write the updated accounting table back over the per-subset CSV (no index column)
scene_acct_pdf.to_csv(path_or_buf=scene_acct_csv, index=False)