# Compute Statistics for TIR Landsat 8 Macrolocalization Model Deployment

* Cumulative percent of scored tiles by deployment date

## Import required libraries

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

import os
import shutil
import boto3
import glob

from matplotlib import pyplot as plt

## Define input/output files and paths, and parameters

### Parameters

* `years_sets` defines the (year, subset) pairs of the model deployment to consider
* `pred_thresh` is the prediction threshold for selecting deployment grid cells

In [None]:
# (year, subset) pairs identifying each deployment score archive to process.
# Stored as a list rather than a bare zip(): a zip object is a one-shot
# iterator, so only the first cell that loops over it would see any pairs and
# every later loop would silently do nothing.
years_sets = list(zip(['2020', '2020', '2019', '2019', '2018', '2018'],
                      ['1', '2', '1', '2', '1', '2']))
# Deployment grid cells are kept when their `preds` value is >= this threshold.
pred_thresh = 0.002

### Steps to run

So we don't repeat analysis we've already completed

In [None]:
# Each flag gates one `if <flag>:` cell further down; set a flag to True to
# (re)run that step.
plot_cum_frac = False          # section 1: cumulative scored-chip bar chart
compile_model_scores = False   # section 2: compile chip scores and upload csv
create_score_hists = False     # section 3: build probability histograms
exam_score_hists = False       # section 4: plot histograms / cumulative histograms
exam_fusion_options = False    # section 5: L8 vs proximity-model comparisons
compute_cmt_dist = False       # section 6: compute chip-to-cement-plant distances
exam_cmt_dist = True           # section 6: plot cement distance results
compute_stl_dist = False       # section 7: compute chip-to-steel-plant distances
exam_stl_dist = True           # section 7: plot steel distance results

### Input files and paths

* `s3_path` defines S3 high-level folder for L8 TIR macro-localization data
* `chip_ext_tar` is the tar with GeoJSON files of chip extents for the deployment region
* `score_tar_prefix` defines the prefix of the tar files of score GeoJSONs
* `acct_csv_prefix` prefix of location of scene accounting csv files
* `macro_10km_shp` is a shapefile specifying the 10km grid from the proximity to infrastructure model
* `LOCAL_DIR` specifies where to put files locally for analysis

In [None]:
# S3 key prefix and archive names (joined as s3_path + '/' + <file> below)
s3_path = 'L8-TIR-macro-localization-model-deployment'
chip_ext_tar = 'L8-deployment-chip-extents-CHN-10km-pthsh0.002.tar'
score_tar_prefix = 'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'
# Prefix of the scene accounting csv files; the output file names below are
# derived from it by replacing 'scene_acct'
acct_csv_prefix = '../../resources/macro-loc-model-deployment/L8-deployment-scene_acct-CHN-10km-pthsh0.002'

# GeoJSONs of known cement and steel plant sites
cement_site_geojson = "../../resources/macro-loc-model-build/cement_exact_china_v4.1.geojson"
steel_site_geojson = "../../resources/macro-loc-model-build/steel_exact_china_v4.1.geojson"

# Shapefile of the 10km deployment grid (filtered on its `preds` column)
macro_10km_shp = "../../resources/nt-model/10km_CS_macro/macroloc_cement_steel_CHN_10.shp"

# L8 model scores on chips centered on known plants
knwn_cntr_scores_gjson = '../../resources/macro-loc-model-deployment/L8-known-plant-chip-fastai-scores-CHN-10km-pthsh0.002_2020.geojson'

# Local working directory for downloaded and extracted files
LOCAL_DIR = '/scratch/'

In [None]:
# S3 resource and the bucket used for all downloads/uploads in this notebook
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

### Output files and paths

* `scene_acct_png` is png showing bar chart of scoring effort
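* `prob_hist_csv` is the csv of model probability histograms (all chips and chips intersecting known plants)
* `known_prob_hist_csv`
* `cmt_hist_png` is the png of the cement model probability histograms
* `cmt_cum_hist_png` is the png of the cement model cumulative probability histograms
* `stl_hist_png` is the png of the steel model probability histograms
* `stl_cum_hist_png` is the png of the steel model cumulative probability histograms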

In [None]:
def _acct_sibling(tag, ext):
    """Return `acct_csv_prefix` with 'scene_acct' replaced by *tag*, plus *ext*."""
    return acct_csv_prefix.replace('scene_acct', tag) + ext


def _dist_png(dist_csv, tag):
    """Return the png path for plot *tag* derived from a chip-distance csv path."""
    return dist_csv.replace('chip_dist', tag).replace('.csv', '.png')


# Bar chart of the scoring effort
scene_acct_png = acct_csv_prefix + '_bar_chart.png'

# Histogram table, and the compiled scores csv reduced to its bare file name
prob_hist_csv = _acct_sibling('score_prob_hist', '.csv')
compiled_scores_csv = _acct_sibling('compiled_scores', '.csv').split('/')[-1]

# Plain and cumulative score histograms
cmt_hist_png = _acct_sibling('cement_score_hist', '.png')
cmt_cum_hist_png = _acct_sibling('cement_score_cum_hist', '.png')
stl_hist_png = _acct_sibling('steel_score_hist', '.png')
stl_cum_hist_png = _acct_sibling('steel_score_cum_hist', '.png')

# Cement: L8 score versus proximity-model prediction
cmt_comb_prob_scatter_png = _acct_sibling('cement_comb_score_scatter', '.png')
cmt_comb_prob_hist_png = _acct_sibling('cement_comb_score_hist', '.png')
cmt_knwn_comb_hist_png = _acct_sibling('cement_knwn_comb_score_hist', '.png')

# Steel: L8 score versus proximity-model prediction
stl_comb_prob_scatter_png = _acct_sibling('steel_comb_score_scatter', '.png')
stl_comb_prob_hist_png = _acct_sibling('steel_comb_score_hist', '.png')
stl_knwn_comb_hist_png = _acct_sibling('steel_knwn_comb_score_hist', '.png')

# Cement: chip-to-plant distance table and the figures derived from it
cmt_dist_csv = _acct_sibling('cement_knwn_chip_dist', '.csv')
cmt_dist_2dhist_png = _dist_png(cmt_dist_csv, 'dist_2dhist')
cmt_dist_1dhist_png = _dist_png(cmt_dist_csv, 'dist_1dhist')
cmt_chp_cntr_comp_png = _dist_png(cmt_dist_csv, 'cntr_score_compare')

# Steel: chip-to-plant distance table and the figures derived from it
stl_dist_csv = _acct_sibling('steel_knwn_chip_dist', '.csv')
stl_dist_2dhist_png = _dist_png(stl_dist_csv, 'dist_2dhist')
stl_dist_1dhist_png = _dist_png(stl_dist_csv, 'dist_1dhist')
stl_chp_cntr_comp_png = _dist_png(stl_dist_csv, 'cntr_score_compare')

## 1. Cumulative Count of Scored Chips by Year

In [None]:
if plot_cum_frac:

    # Load the set 1 and set 2 scene accounting tables and stack them
    scene_acct_pdf = pd.concat(
        [pd.read_csv(acct_csv_prefix + set_name + '.csv', index_col=False)
         for set_name in ('set1', 'set2')],
        ignore_index=True)

    # Per-row running totals, accumulated from the most recent year backwards
    scene_acct_pdf['tile_cnt_2019_cum'] = (scene_acct_pdf['tile_cnt_2020']
                                           + scene_acct_pdf['tile_cnt_2019'])
    scene_acct_pdf['tile_cnt_2018_cum'] = (scene_acct_pdf['tile_cnt_2019_cum']
                                           + scene_acct_pdf['tile_cnt_2018'])

    # Grand total of chips, used for the reference line and the fractions
    total_chip_cnt = scene_acct_pdf['tile_cnt_tot'].sum(axis=0)

    # One row per deployment year: yearly and cumulative scored-chip counts
    yearly_cols = ['tile_cnt_2020', 'tile_cnt_2019', 'tile_cnt_2018']
    cum_cols = ['tile_cnt_2020', 'tile_cnt_2019_cum', 'tile_cnt_2018_cum']
    acct_bar_chrt_pdf = pd.DataFrame(
        {'Scored Chips Count':
             [scene_acct_pdf[col].sum(axis=0) for col in yearly_cols],
         'Cumulative Scored Chips Count':
             [scene_acct_pdf[col].sum(axis=0) for col in cum_cols]},
        index=['2020', '2019', '2018'])

    # Draw the grouped bar chart with the total as a horizontal marker, then save
    acct_bar_plot = acct_bar_chrt_pdf.plot.bar(rot=0).get_figure()
    plt.title('Landsat-8 TIR Model Deployment Summary')
    plt.xlabel('Deployment Year')
    plt.ylabel('Number of Chips Scored')
    plt.axhline(total_chip_cnt, xmin=0.75,
                label='Total Chip Count', color='tab:orange')
    plt.legend()
    acct_bar_plot.savefig(scene_acct_png)

    # Express both count columns as fractions of the total and report
    acct_bar_chrt_pdf['Fraction Chips Scored'] = (
        acct_bar_chrt_pdf['Scored Chips Count'] / total_chip_cnt)
    acct_bar_chrt_pdf['Cumulative Fraction Chips Scored'] = (
        acct_bar_chrt_pdf['Cumulative Scored Chips Count'] / total_chip_cnt)
    print('Total number of chips: ', total_chip_cnt)
    print(acct_bar_chrt_pdf[['Cumulative Scored Chips Count',
                             'Cumulative Fraction Chips Scored']])

## 2. Compile model scores from all chips

In [None]:
if compile_model_scores:
    
    # Loop over all years and sets
    for yr, st in years_sets:
        
        # Download tar file from S3 and untar
        score_tar_file = score_tar_prefix+yr+'_set'+st+'.tar'
        bucket.download_file(s3_path+'/'+score_tar_file, LOCAL_DIR+score_tar_file)
        !tar -xf {LOCAL_DIR+score_tar_file} -C {LOCAL_DIR}
        print("Finished downloading and extracting ", score_tar_file)
    
        # Read in GeoJSONs of known cement and steel plants
        cement_site_gdf = gpd.read_file(cement_site_geojson)
        steel_site_gdf = gpd.read_file(steel_site_geojson)
        
        # Read in shapefile of deployment grid, filter by pred_thresh
        macro_10km_gdf = gpd.read_file(macro_10km_shp)
        macro_10km_gdf = macro_10km_gdf[macro_10km_gdf.preds >= pred_thresh]
    
        # Get list of GeoJSONS
        score_dir = score_tar_file.replace('.tar', '')
        score_gjsons = glob.glob(LOCAL_DIR+score_dir+'/*.geojson')
        score_gjsons.sort()
        
        # Loop over all GeoJSONS
        for gf in score_gjsons:
            
            # Read in DataFrame
            score_gdf = gpd.read_file(gf)
            
            # Join 10km preds; take max if tile intersects two grid sections
            score_gdf = gpd.sjoin(score_gdf, macro_10km_gdf, how='left', 
                                  op='intersects')
            score_gdf = score_gdf.drop(['scene_id', 'index_right', 'length', 
                                        'length_w', 'Count_pnt'], axis=1)
            score_gdf['grpid'] = score_gdf['tile_id']
            score_gdf = score_gdf.sort_values('preds', ascending=False) \
                                 .groupby(['grpid']).first()
            
            # Join to known cement plant sites
            score_gdf = gpd.sjoin(score_gdf, cement_site_gdf, how='left', 
                                  op='intersects')
            score_gdf['cement_uid'] = score_gdf.uid
            score_gdf = score_gdf.drop(['index_right', 'uid'], axis=1)
            
            # Join to known steel plant sites
            score_gdf = gpd.sjoin(score_gdf, steel_site_gdf, how='left', 
                                  op='intersects')
            score_gdf['steel_uid'] = score_gdf.uid
            score_gdf = score_gdf.drop(['index_right', 'uid'], axis=1)
        
            print('Done compiling scores for ', len(score_gdf), ' chips in ', gf)
            
            # Append to compiled scores file, droping geom column
            if 'compiled_scores_pdf' in locals():
                compiled_scores_pdf = pd.concat([compiled_scores_pdf, 
                                                 score_gdf.drop('geometry', axis=1)], 
                                                ignore_index=True)
            else:
                compiled_scores_pdf = score_gdf.drop('geometry', axis=1)
            
    # Save results in csv
    compiled_scores_pdf.to_csv(compiled_scores_csv, index=False)
    
    # Upload to S3 (too bit for git)
    bucket.upload_file(compiled_scores_csv, s3_path+'/'+compiled_scores_csv)

## 3. Create histograms of probabilities of model scores

In [None]:
if create_score_hists:

    # Load the compiled chip scores
    compiled_scores_pdf = pd.read_csv(compiled_scores_csv, index_col=False,
                                      low_memory=False)

    # Bin edges shared by every probability histogram
    bedges = np.arange(0, 1.01, 0.01)

    # All chips: a tile appears more than once when >1 plant intersects it,
    # so keep only its first row
    comp_scores_nodups = compiled_scores_pdf.drop_duplicates(subset='tile_id',
                                                             keep='first')

    # Chips intersecting known cement / steel plants
    cmt_prob_knwn = compiled_scores_pdf[compiled_scores_pdf.cement_uid.notnull()]
    stl_prob_knwn = compiled_scores_pdf[compiled_scores_pdf.steel_uid.notnull()]

    # Output column name -> probabilities to histogram (in evaluation order)
    hist_inputs = [
        ('cmt_prb_hist', comp_scores_nodups.cement_prob),
        ('stl_prb_hist', comp_scores_nodups.steel_prob),
        ('cmt_knwn_prb_hist', cmt_prob_knwn.cement_prob),
        ('stl_knwn_prb_hist', stl_prob_knwn.steel_prob),
    ]
    hist_counts = {}
    for col_name, probs in hist_inputs:
        hist_counts[col_name], bin_edges = np.histogram(probs, bins=bedges)

    # Assemble the table (left bin edge plus the four count columns) and save
    model_prob_hist = pd.DataFrame(
        {'bin_left_edge': bin_edges[:-1],
         'cmt_prb_hist': hist_counts['cmt_prb_hist'],
         'cmt_knwn_prb_hist': hist_counts['cmt_knwn_prb_hist'],
         'stl_prb_hist': hist_counts['stl_prb_hist'],
         'stl_knwn_prb_hist': hist_counts['stl_knwn_prb_hist']})
    model_prob_hist.to_csv(prob_hist_csv, index=False)

## 4. Examine scores of known sites compared to all chip scores

In [None]:
def _plot_hist_pair(hist_df, all_col, knwn_col, title, ylabel, out_png,
                    bin_width=0.01):
    """Plot all-chips and known-plant bar charts in two stacked panels.

    The top panel shows `hist_df[all_col]`, the bottom panel
    `hist_df[knwn_col]`, both against `hist_df.bin_left_edge`. The figure is
    saved to `out_png` and then shown.
    """
    fig, (ax1, ax2) = plt.subplots(2, sharey=False)

    ax1.bar(x=hist_df.bin_left_edge, height=hist_df[all_col],
            align='edge', width=bin_width, label='All Chips')
    ax1.set(title=title, ylabel=ylabel)
    ax1.legend()

    ax2.bar(x=hist_df.bin_left_edge, height=hist_df[knwn_col],
            align='edge', width=bin_width, color='tab:orange',
            label='Chips Intersecting Known Plants')
    ax2.set(ylabel=ylabel, xlabel='Model Probability')
    ax2.legend()

    fig.savefig(out_png)
    plt.show()


def _closest_cumsum_row(hist_df, cumsum_col, top_n):
    """Return the row whose cumulative count is closest to (total - top_n)."""
    cum = hist_df[cumsum_col]
    return hist_df.loc[(cum - (cum.max() - top_n)).abs().idxmin()]


def _bin_upper_edge(hist_df, row, bin_width=0.01):
    """Return the upper edge of the bin described by `row` of `hist_df`.

    The upper edge is the next row's left edge; for the last row it is the
    row's own left edge plus `bin_width`.
    """
    pos = hist_df.index.get_loc(row.name)
    edges = hist_df.bin_left_edge
    if pos + 1 < len(edges):
        return edges.iloc[pos + 1]
    return edges.iloc[pos] + bin_width


if exam_score_hists:

    # Read in histograms
    model_prob_hist = pd.read_csv(prob_hist_csv, index_col=False)

    # Histograms for cement, then steel
    _plot_hist_pair(model_prob_hist, 'cmt_prb_hist', 'cmt_knwn_prb_hist',
                    'Cement Model Probability Distribution', 'Count',
                    cmt_hist_png)
    _plot_hist_pair(model_prob_hist, 'stl_prb_hist', 'stl_knwn_prb_hist',
                    'Steel Model Probability Distribution', 'Count',
                    stl_hist_png)

    # Compute cumulative histograms
    model_prob_hist['cmt_prb_cumsum'] = model_prob_hist.cmt_prb_hist.cumsum()
    model_prob_hist['stl_prb_cumsum'] = model_prob_hist.stl_prb_hist.cumsum()
    model_prob_hist['cmt_knwn_prb_cumsum'] = model_prob_hist.cmt_knwn_prb_hist.cumsum()
    model_prob_hist['stl_knwn_prb_cumsum'] = model_prob_hist.stl_knwn_prb_hist.cumsum()

    # Cumulative histograms for cement
    _plot_hist_pair(model_prob_hist, 'cmt_prb_cumsum', 'cmt_knwn_prb_cumsum',
                    'Cement Model Probability Cumulative Distribution',
                    'Cumulative Count', cmt_cum_hist_png)

    # The cumulative count in a row includes that row's own bin, so the chips
    # left over (the "top 1000") lie above the bin's UPPER edge. Reporting the
    # lower edge would also pull in every chip of that bin.
    cmt_thr1000 = _closest_cumsum_row(model_prob_hist, 'cmt_prb_cumsum', 1000)
    print('Cement probability threshold to grab top 1000:',
          _bin_upper_edge(model_prob_hist, cmt_thr1000))

    # Cumulative histograms for steel
    _plot_hist_pair(model_prob_hist, 'stl_prb_cumsum', 'stl_knwn_prb_cumsum',
                    'Steel Model Probability Cumulative Distribution',
                    'Cumulative Count', stl_cum_hist_png)

    stl_thr1000 = _closest_cumsum_row(model_prob_hist, 'stl_prb_cumsum', 1000)
    print('Steel probability threshold to grab top 1000:',
          _bin_upper_edge(model_prob_hist, stl_thr1000))

## 5. Evaluate options for L8 and Proximity Model fusion

In [None]:
def _fusion_scatter(all_df, avg_df, knwn_df, prob_col, title, out_png):
    """Scatter L8 probability (`prob_col`) against proximity `preds` and save.

    Three series are drawn: every chip in `all_df`, the averages in `avg_df`,
    and the chips intersecting known plants in `knwn_df`.
    """
    fig, _ = plt.subplots(1)
    plt.plot(all_df.preds, all_df[prob_col],
             color='silver', marker='.', linestyle='', label='All Chips')
    plt.plot(avg_df.preds, avg_df[prob_col],
             color='steelblue', marker='x', linestyle='',
             label='Average (All Chips)')
    plt.plot(knwn_df.preds, knwn_df[prob_col],
             color='seagreen', marker='x', linestyle='', label='Known Plants')
    plt.xlim([0, 0.3])
    plt.xlabel('Probability (Proximity Model)')
    plt.ylabel('Probability (Landsat 8 TIR Model)')
    plt.legend()
    plt.title(title)
    fig.savefig(out_png)


def _quantile_step_hists(grouped, prob_col, nbins, ylabel, title, out_png):
    """Draw one step histogram of `prob_col` per group in `grouped` and save.

    Each legend label is the string form of the group key being iterated, so
    labels cannot drift out of step with the data they describe.
    """
    fig, _ = plt.subplots(1)
    edge_colors = ['k', 'b', 'g', 'r']
    for color, (grp_key, grp_df) in zip(edge_colors, grouped):
        plt.hist(grp_df[prob_col], bins=nbins,
                 color=color, edgecolor=color,
                 label=str(grp_key), histtype='step')
    plt.legend(title='Proximity Probability Interval')
    plt.xlabel('Landsat 8 TIR Model Probability')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xlim([0, 1])
    fig.savefig(out_png)


if exam_fusion_options:

    # Read in combined score results
    compiled_scores_pdf = pd.read_csv(compiled_scores_csv, index_col=False,
                                      low_memory=False)

    # Bin data into quantiles according to pred
    qbins = [0, 0.25, 0.5, 0.75, 1]
    compiled_scores_pdf['pred_grp'] = pd.qcut(compiled_scores_pdf.preds, q=qbins)

    # Drop duplicate tiles for the population averages
    comp_scores_nodups = compiled_scores_pdf.drop_duplicates(subset='tile_id',
                                                             keep='first')
    # Average L8 probabilities per value of the `index` column
    avg_prob_vs_pred = comp_scores_nodups[['index', 'preds', 'cement_prob', 'steel_prob']] \
        .groupby(['index']).mean()

    # All (de-duplicated) chips grouped by proximity quantile
    comp_grp_scores = comp_scores_nodups.groupby('pred_grp')

    # Cement
    # ------
    cmt_prob_knwn = compiled_scores_pdf[~compiled_scores_pdf.cement_uid.isnull()]

    _fusion_scatter(comp_scores_nodups, avg_prob_vs_pred, cmt_prob_knwn,
                    'cement_prob', 'Cement Model Probabilities',
                    cmt_comb_prob_scatter_png)
    _quantile_step_hists(comp_grp_scores, 'cement_prob', 50,
                         'Count (All Chips)',
                         'Cement Model Probability Distribution',
                         cmt_comb_prob_hist_png)
    cmt_prob_knwn_grps = cmt_prob_knwn.groupby('pred_grp')
    _quantile_step_hists(cmt_prob_knwn_grps, 'cement_prob', 10,
                         'Count (Chips Intersecting Known Plants)',
                         'Cement Model Probability Distribution',
                         cmt_knwn_comb_hist_png)

    print('Number of Cement Plants per Quantile:')
    print(cmt_prob_knwn['pred_grp'].value_counts())

    # Steel
    # -----
    stl_prob_knwn = compiled_scores_pdf[~compiled_scores_pdf.steel_uid.isnull()]

    _fusion_scatter(comp_scores_nodups, avg_prob_vs_pred, stl_prob_knwn,
                    'steel_prob', 'Steel Model Probabilities',
                    stl_comb_prob_scatter_png)
    _quantile_step_hists(comp_grp_scores, 'steel_prob', 50,
                         'Count (All Chips)',
                         'Steel Model Probability Distribution',
                         stl_comb_prob_hist_png)
    stl_prob_knwn_grps = stl_prob_knwn.groupby('pred_grp')
    _quantile_step_hists(stl_prob_knwn_grps, 'steel_prob', 10,
                         'Count (Chips Intersecting Known Plants)',
                         'Steel Model Probability Distribution',
                         stl_knwn_comb_hist_png)

    print('Number of Steel Plants per Quantile:')
    print(stl_prob_knwn['pred_grp'].value_counts())

## 6. Study whether distance from chip center to plant affects cement results

In [None]:
if compute_cmt_dist:
    
    # Read in combined score results
    bucket.download_file(s3_path+'/'+compiled_scores_csv, LOCAL_DIR+compiled_scores_csv)
    compiled_scores_pdf = pd.read_csv(LOCAL_DIR+compiled_scores_csv, index_col=False,
                                      low_memory=False)
    
    # Limit to chips covering known cement plants
    cmt_prob_knwn = compiled_scores_pdf[~compiled_scores_pdf.cement_uid.isnull()] \
                                .drop(['index', 'prop_rail', 'prop_water', 'preds',
                                       'steel_prob', 'steel_uid'], axis=1)
    
    # Define groups to find right GeoJson files
    cmt_prob_knwn['scene_id'] = ['-'.join(f.split('-')[0:2]) for f in cmt_prob_knwn.tile_id]
    cmt_prob_knwn['set_id'] = ['set1' if int(sid.split('-')[-1][0:3]) <= 125 else 'set2' for sid in cmt_prob_knwn['scene_id']]
    cmt_prob_knwn_grps = cmt_prob_knwn.groupby(['year', 'set_id', 'scene_id'])
    grps_to_read = list(cmt_prob_knwn_grps.groups.keys())
    
    # Loop over all years and sets and download tar file from S3 and untar
    for yr, st in years_sets:
        score_tar_file = score_tar_prefix+yr+'_set'+st+'.tar'
        bucket.download_file(s3_path+'/'+score_tar_file, LOCAL_DIR+score_tar_file)
        !tar -xf {LOCAL_DIR+score_tar_file} -C {LOCAL_DIR}
        print("Finished downloading and extracting ", score_tar_file)
        
    # Read in GeoJSONs of known cement plants
    cement_site_gdf = gpd.read_file(cement_site_geojson)
    
    # Loop over scene/year/set groups to compile distance data
    calc_crs = "EPSG:3395"
    for group in cmt_prob_knwn_grps:
    
        # Get filename of GeoJson
        score_dir = LOCAL_DIR+'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'+str(group[0][0])+'_'+group[0][1]
        score_gjson = score_dir+'/'+'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'+str(group[0][0])+'_'+group[0][2]+'.geojson'
        print(score_gjson)
        
        # Read in GeoJson file
        score_gdf = gpd.read_file(score_gjson)
    
        # Merge to data group to get geometry, compute centroid of chip,
        # and convert to physical crs
        score_gdf = score_gdf[['tile_id', 'geometry']].merge(group[1], 
                                                         on='tile_id', 
                                                         how='right')
        score_gdf['geometry'] = score_gdf.geometry.centroid
        score_gdf = score_gdf.to_crs(calc_crs)
    
        # Merge known plant points to data group, convert crs
        knwn_plnt_gdf = cement_site_gdf.merge(group[1], left_on='uid', 
                                          right_on='cement_uid', how='right')
        knwn_plnt_gdf = knwn_plnt_gdf.to_crs(calc_crs)
    
        # Be safe, make sure we got the rows aligned
        vals_match = (score_gdf.tile_id == knwn_plnt_gdf.tile_id) & \
                     (score_gdf.cement_prob == knwn_plnt_gdf.cement_prob) & \
                     (score_gdf.cement_uid == knwn_plnt_gdf.cement_uid)
        if sum(vals_match) != len(vals_match):
            print('Oh no! Something is wrong. Work harder.')
            break
    
        # Calculate distance and drop unneeded columns
        score_gdf['dist_m'] = score_gdf.distance(knwn_plnt_gdf)
        score_gdf = score_gdf.drop(['geometry', 'year', 'scene_id', 'set_id'], axis=1)
     
        # Append to full list of all known plants
        if 'cmt_chp_plt_dist_pdf' in locals():
            cmt_chp_plt_dist_pdf = pd.concat([cmt_chp_plt_dist_pdf, score_gdf], 
                                             ignore_index=True)
        else:
            cmt_chp_plt_dist_pdf = score_gdf
    
    # Save to csv
    cmt_chp_plt_dist_pdf.to_csv(cmt_dist_csv, index=False)

In [None]:
if exam_cmt_dist:

    # Read in chips distance file
    cmt_chp_plt_dist_pdf = pd.read_csv(cmt_dist_csv, index_col=False)

    # 2D histogram of distance versus L8 score
    fig, ax = plt.subplots(1)
    h, xedge, yedge, im = plt.hist2d(cmt_chp_plt_dist_pdf.dist_m,
                                     cmt_chp_plt_dist_pdf.cement_prob,
                                     bins=[10, 10],
                                     range=[[0, 1000], [0, 1]])
    plt.title('Cement Plants')
    plt.xlabel('Distance between Chip Center and Cement Plant (m)')
    plt.ylabel('Landsat 8 TIR Model Probability')
    fig.savefig(cmt_dist_2dhist_png)

    # Histogram of distances between chip centers and cement plants.
    # Bar widths come from the 2D histogram's own edges so they stay correct
    # if the bins or range above are changed.
    fig, ax = plt.subplots(1)
    plt.bar(x=xedge[:-1],
            height=h.sum(axis=1),
            align='edge',
            width=np.diff(xedge))
    plt.title('Cement Plants')
    plt.ylabel('Count of Chips')
    plt.xlabel('Distance between Chip Center and Cement Plant (m)')
    # Save before showing, matching the other plotting cells
    fig.savefig(cmt_dist_1dhist_png)
    plt.show()

    # Histogram of model probabilities (don't save - just to validate)
    fig, ax = plt.subplots(1)
    plt.bar(x=yedge[:-1],
            height=h.sum(axis=0),
            align='edge',
            width=np.diff(yedge))
    plt.title('Cement Plants')
    plt.ylabel('Count of Chips')
    plt.xlabel('Landsat 8 TIR Model Probability')
    plt.show()

    # Read in L8 model scores on chips centered on plants
    knwn_cntr_scores_pdf = gpd.read_file(knwn_cntr_scores_gjson)
    # Limit to cement plants. Take an explicit copy so the column added next
    # is written to an independent frame rather than to a slice of the
    # original (which triggers pandas' SettingWithCopyWarning).
    is_cement = knwn_cntr_scores_pdf['site_type'] == 'cement'
    kwnw_cmt_cntr_pdf = knwn_cntr_scores_pdf[is_cement].copy()
    kwnw_cmt_cntr_pdf['cement_cntr_prob'] = kwnw_cmt_cntr_pdf.cement_prob

    # Join to chip scores intersecting with cement plants
    kwnw_cmt_cntr_pdf = pd.merge(cmt_chp_plt_dist_pdf,
                                 kwnw_cmt_cntr_pdf[['uid', 'cement_cntr_prob']],
                                 how='left', right_on='uid', left_on='cement_uid')

    # Scatter plot of scores
    fig, ax = plt.subplots(1)
    plt.plot(kwnw_cmt_cntr_pdf.cement_cntr_prob, kwnw_cmt_cntr_pdf.cement_prob, 'bx')
    plt.title('Cement: Landsat 8 TIR Model Probabilty')
    plt.xlabel('Chips Centered on Known Plants')
    plt.ylabel('Intersecting Chips in Deployment Grid')
    fig.savefig(cmt_chp_cntr_comp_png)

## 7. Study whether distance from chip center to plant affects steel results

In [None]:
if compute_stl_dist:
    
    # Read in combined score results
    bucket.download_file(s3_path+'/'+compiled_scores_csv, LOCAL_DIR+compiled_scores_csv)
    compiled_scores_pdf = pd.read_csv(LOCAL_DIR+compiled_scores_csv, index_col=False,
                                      low_memory=False)
    
    # Limit to chips covering known steel plants
    stl_prob_knwn = compiled_scores_pdf[~compiled_scores_pdf.steel_uid.isnull()] \
                                .drop(['index', 'prop_rail', 'prop_water', 'preds',
                                       'cement_prob', 'cement_uid'], axis=1)
    
    # Define groups to find right GeoJson files
    stl_prob_knwn['scene_id'] = ['-'.join(f.split('-')[0:2]) for f in stl_prob_knwn.tile_id]
    stl_prob_knwn['set_id'] = ['set1' if int(sid.split('-')[-1][0:3]) <= 125 else 'set2' for sid in stl_prob_knwn['scene_id']]
    stl_prob_knwn_grps = stl_prob_knwn.groupby(['year', 'set_id', 'scene_id'])
    grps_to_read = list(stl_prob_knwn_grps.groups.keys())
    
    # Loop over all years and sets and download tar file from S3 and untar
    for yr, st in years_sets:
        score_tar_file = score_tar_prefix+yr+'_set'+st+'.tar'
        bucket.download_file(s3_path+'/'+score_tar_file, LOCAL_DIR+score_tar_file)
        !tar -xf {LOCAL_DIR+score_tar_file} -C {LOCAL_DIR}
        print("Finished downloading and extracting ", score_tar_file)
        
    # Read in GeoJSONs of known steel plants
    steel_site_gdf = gpd.read_file(steel_site_geojson)
    
    # Loop over scene/year/set groups to compile distance data
    calc_crs = "EPSG:3395"
    for group in stl_prob_knwn_grps:
    
        # Get filename of GeoJson
        score_dir = LOCAL_DIR+'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'+str(group[0][0])+'_'+group[0][1]
        score_gjson = score_dir+'/'+'L8-deployment-chip-scores-CHN-10km-pthsh0.002_'+str(group[0][0])+'_'+group[0][2]+'.geojson'
        print(score_gjson)
        
        # Read in GeoJson file
        score_gdf = gpd.read_file(score_gjson)
    
        # Merge to data group to get geometry, compute centroid of chip,
        # and convert to physical crs
        score_gdf = score_gdf[['tile_id', 'geometry']].merge(group[1], 
                                                         on='tile_id', 
                                                         how='right')
        score_gdf['geometry'] = score_gdf.geometry.centroid
        score_gdf = score_gdf.to_crs(calc_crs)
    
        # Merge known plant points to data group, convert crs
        knwn_plnt_gdf = steel_site_gdf.merge(group[1], left_on='uid', 
                                          right_on='steel_uid', how='right')
        knwn_plnt_gdf = knwn_plnt_gdf.to_crs(calc_crs)
    
        # Be safe, make sure we got the rows aligned
        vals_match = (score_gdf.tile_id == knwn_plnt_gdf.tile_id) & \
                     (score_gdf.steel_prob == knwn_plnt_gdf.steel_prob) & \
                     (score_gdf.steel_uid == knwn_plnt_gdf.steel_uid)
        if sum(vals_match) != len(vals_match):
            print('Oh no! Something is wrong. Work harder.')
            break
    
        # Calculate distance and drop unneeded columns
        score_gdf['dist_m'] = score_gdf.distance(knwn_plnt_gdf)
        score_gdf = score_gdf.drop(['geometry', 'year', 'scene_id', 'set_id'], axis=1)
     
        # Append to full list of all known plants
        if 'stl_chp_plt_dist_pdf' in locals():
            stl_chp_plt_dist_pdf = pd.concat([stl_chp_plt_dist_pdf, score_gdf], 
                                             ignore_index=True)
        else:
            stl_chp_plt_dist_pdf = score_gdf
    
    # Save to csv
    stl_chp_plt_dist_pdf.to_csv(stl_dist_csv, index=False)

In [None]:
if exam_stl_dist:

    # Read in chips distance file
    stl_chp_plt_dist_pdf = pd.read_csv(stl_dist_csv, index_col=False)

    # 2D histogram of distance versus L8 score
    fig, ax = plt.subplots(1)
    h, xedge, yedge, im = plt.hist2d(stl_chp_plt_dist_pdf.dist_m,
                                     stl_chp_plt_dist_pdf.steel_prob,
                                     bins=[10, 10],
                                     range=[[0, 1000], [0, 1]])
    plt.title('Steel Plants')
    plt.xlabel('Distance between Chip Center and Steel Plant (m)')
    plt.ylabel('Landsat 8 TIR Model Probability')
    fig.savefig(stl_dist_2dhist_png)

    # Histogram of distances between chip centers and steel plants.
    # Bar widths come from the 2D histogram's own edges so they stay correct
    # if the bins or range above are changed.
    fig, ax = plt.subplots(1)
    plt.bar(x=xedge[:-1],
            height=h.sum(axis=1),
            align='edge',
            width=np.diff(xedge))
    plt.title('Steel Plants')
    plt.ylabel('Count of Chips')
    plt.xlabel('Distance between Chip Center and Steel Plant (m)')
    # Save before showing, matching the other plotting cells
    fig.savefig(stl_dist_1dhist_png)
    plt.show()

    # Histogram of model probabilities (don't save - just to validate)
    fig, ax = plt.subplots(1)
    plt.bar(x=yedge[:-1],
            height=h.sum(axis=0),
            align='edge',
            width=np.diff(yedge))
    plt.title('Steel Plants')
    plt.ylabel('Count of Chips')
    plt.xlabel('Landsat 8 TIR Model Probability')
    plt.show()

    # Read in L8 model scores on chips centered on plants
    knwn_cntr_scores_pdf = gpd.read_file(knwn_cntr_scores_gjson)
    # Limit to steel plants. Take an explicit copy so the column added next
    # is written to an independent frame rather than to a slice of the
    # original (which triggers pandas' SettingWithCopyWarning).
    is_steel = knwn_cntr_scores_pdf['site_type'] == 'steel'
    kwnw_stl_cntr_pdf = knwn_cntr_scores_pdf[is_steel].copy()
    kwnw_stl_cntr_pdf['steel_cntr_prob'] = kwnw_stl_cntr_pdf.steel_prob

    # Join to chip scores intersecting with steel plants
    kwnw_stl_cntr_pdf = pd.merge(stl_chp_plt_dist_pdf,
                                 kwnw_stl_cntr_pdf[['uid', 'steel_cntr_prob']],
                                 how='left', right_on='uid', left_on='steel_uid')

    # Scatter plot of scores
    fig, ax = plt.subplots(1)
    plt.plot(kwnw_stl_cntr_pdf.steel_cntr_prob, kwnw_stl_cntr_pdf.steel_prob, 'bx')
    plt.title('Steel: Landsat 8 TIR Model Probabilty')
    plt.xlabel('Chips Centered on Known Plants')
    plt.ylabel('Intersecting Chips in Deployment Grid')
    fig.savefig(stl_chp_cntr_comp_png)