# Compare Scores from RGB Sentinel-2 Macrolocalization Model Derived in Different Ways

## Import required libraries

In [None]:
# Install the pinned fastai release used by this notebook; later cells rely on
# `fastai.vision`, `load_learner` and `ImageDataBunch` from this package.
!pip install fastai==1.0.61

In [None]:
import geopandas as gpd
import pandas as pd

import os
import shutil
import boto3

from matplotlib import pyplot as plt

from fastai import *
from fastai.vision import *

## Define input/output files and paths, and parameters

### Parameters

* `year` defines the year of selected scenes
* `month` defines the month of selected scenes (format: January = "01", February = "02", etc.)

In [None]:
# Scene selection. Both values are strings because they are concatenated
# (year+month, i.e. '202006' here) to build the score-file name and to match a
# token in each chip file name, so `month` must stay zero-padded.
year = '2020'
month = '06'

### Input files and paths

* `s3_path` defines S3 high-level folder for S2 RGB macro-localization data
* `MODEL_PATH` is the path on S3 to the Densenet161 multiclass model
* `LOCAL_DIR` specifies where to keep files locally for analysis

In [None]:
# S3 folder for the deployment data (not referenced by the other cells of this notebook).
s3_path = 'S2-RGB-macro-localization-model-deployment'
# S3 object key of the exported multiclass model; fetched by download_model().
MODEL_PATH = 'S2-RGB-macro-localization-model-build3/S2-RGB-model-results3/densenet161_multiclass_final.pkl'
# Local working directory. Keep the trailing slash: paths below are built by
# plain string concatenation (LOCAL_DIR + name).
LOCAL_DIR = '/scratch/'

In [None]:
# S3 object key of the tar archive of image chips. It is downloaded and unpacked
# into LOCAL_DIR, and its train/ and validate/ class folders are searched for PNGs.
DATA_PATH = 'S2-RGB-macro-localization-model-build3/ALD_S2_RGB_chips_v4p1_train3.tar'

In [None]:
# GeoJSON of scores written by the deployment script for the selected year/month.
png_score_file = (
    '../../resources/macro-loc-model-deployment/'
    f'S2-known-plant-chip-fastai-scores-CHN-10km-pthsh0.002_{year}{month}.geojson'
)

## Download Models and Define Scoring Functions

In [None]:
# S3 resource and bucket handle; `bucket` is used by every download_file() call below.
s3 = boto3.resource('s3')
bucket = s3.Bucket('sfi-shared-assets')

### Download models and load learners

In [None]:
def download_model(MODEL_PATH):
    """Download a pickled model from S3 into its own local folder.

    The object stored under key ``MODEL_PATH`` in the module-level ``bucket``
    is saved as ``export.pkl`` inside ``LOCAL_DIR + <file name without ".pkl">``.
    That folder is what the notebook later passes to ``load_learner``.

    Args:
        MODEL_PATH: S3 object key of the ``.pkl`` model file.
    """
    # Compute the target folder once instead of repeating the expression.
    model_dir = LOCAL_DIR + MODEL_PATH.split("/")[-1].replace(".pkl", "")
    # exist_ok replaces the separate existence check and keeps re-runs safe.
    os.makedirs(model_dir, exist_ok=True)
    bucket.download_file(MODEL_PATH, model_dir + "/export.pkl")

In [None]:
# Fetch the model from S3; it lands in LOCAL_DIR/<model name>/export.pkl.
download_model(MODEL_PATH)

In [None]:
# Load the learner from the folder that download_model() populated.
# NOTE(review): the model is a .pkl file, presumably unpickled on load — only
# load models from a trusted bucket.
multi_model = load_learner(LOCAL_DIR + MODEL_PATH.split("/")[-1].replace(".pkl", ""))

In [None]:
# Download the chip archive into LOCAL_DIR, keeping its original file name.
bucket.download_file(DATA_PATH, LOCAL_DIR + DATA_PATH.split("/")[-1])

In [None]:
# Unpack the downloaded chip archive into LOCAL_DIR.
# The original shelled out to `tar -C /scratch/` with the target hard-coded and
# the exit status ignored; unpack_archive follows LOCAL_DIR and raises on failure.
archive_path = LOCAL_DIR + DATA_PATH.split("/")[-1]
shutil.unpack_archive(archive_path, LOCAL_DIR)

## Get Scores from Saved Models

### Cement

In [None]:
# Staging folder for the selected month, laid out as <root>/train/cement/ so it
# can be read with ImageDataBunch.from_folder(..., train='train').
cement_only_path = LOCAL_DIR+'cement_'+year+month
# One makedirs call creates all three levels; exist_ok keeps a notebook re-run
# from failing with FileExistsError.
os.makedirs(cement_only_path+'/train/cement', exist_ok=True)

In [None]:
# IPython shell capture: each variable receives the output lines of `find`, i.e.
# the paths ending in "png" under the cement class folder of the train and
# validate splits.
# Assumes the archive unpacked into LOCAL_DIR/<tar name without ".tar">/ — TODO confirm.
cement_image_list1 = ! find {LOCAL_DIR+DATA_PATH.split("/")[-1].replace(".tar", "/")+'train/cement'} | grep png$
cement_image_list2 = ! find {LOCAL_DIR+DATA_PATH.split("/")[-1].replace(".tar", "/")+'validate/cement'} | grep png$

In [None]:
# Keep only the chips whose file name carries the selected year+month as its
# fourth underscore-separated token; train chips first, then validate chips.
wanted_stamp = year + month
cement_image_year_list = [
    chip_path
    for chip_path in list(cement_image_list1) + list(cement_image_list2)
    if chip_path.split('/')[-1].split('_')[3] == wanted_stamp
]

In [None]:
# Move (not copy) the selected chips into the staging folder; afterwards they
# are no longer present in the extracted dataset folders.
cement_dest = cement_only_path+'/train/cement/'
for chip_path in cement_image_year_list:
    shutil.move(chip_path, cement_dest)

In [None]:
# Build a data bunch over the staged cement chips (all of them sit under
# train/cement) and normalize with the ImageNet statistics.
data_cement = ImageDataBunch.from_folder(
    cement_only_path,
    train='train',
    bs=16,
    num_workers=0,
    seed=42,
).normalize(imagenet_stats)

In [None]:
# Score every staged cement chip with the multiclass model and record, per chip,
# the probability at index 0 and the uid parsed from the file name.
prob_cement, uid_cement = [], []
cement_cnt = len(data_cement.train_ds)

for idx in range(cement_cnt):
    chip = data_cement.train_ds.x[idx]
    prediction = multi_model.predict(chip)
    # Third element of the prediction holds the per-class values; index 0 is
    # presumably the cement class — TODO confirm against the model's class order.
    class_probs = to_np(prediction[2])
    prob_cement.append(class_probs[0].item())

    # uid is the first underscore-separated token of the chip file name.
    chip_file = str(data_cement.items[idx]).split('/')[-1]
    uid_cement.append(chip_file.split('_')[0])

In [None]:
# One row per scored cement chip: uid, constant site type, and model probability.
fastai_cement_pdf = pd.DataFrame(
    dict(uid=uid_cement, site_type='cement', fastai_prob=prob_cement)
)

In [None]:
# Sanity check: number of cement chips scored for the selected month
# (chips were gathered from both the train and validate folders).
print("Count of cement plants in training set: ", len(fastai_cement_pdf))

### Steel

In [None]:
# Staging folder for the selected month, laid out as <root>/train/steel/ so it
# can be read with ImageDataBunch.from_folder(..., train='train').
steel_only_path = LOCAL_DIR+'steel_'+year+month
# One makedirs call creates all three levels; exist_ok keeps a notebook re-run
# from failing with FileExistsError.
os.makedirs(steel_only_path+'/train/steel', exist_ok=True)

In [None]:
# IPython shell capture: each variable receives the output lines of `find`, i.e.
# the paths ending in "png" under the steel class folder of the train and
# validate splits.
# Assumes the archive unpacked into LOCAL_DIR/<tar name without ".tar">/ — TODO confirm.
steel_image_list1 = ! find {LOCAL_DIR+DATA_PATH.split("/")[-1].replace(".tar", "/")+'train/steel'} | grep png$
steel_image_list2 = ! find {LOCAL_DIR+DATA_PATH.split("/")[-1].replace(".tar", "/")+'validate/steel'} | grep png$

In [None]:
# Keep only the chips whose file name carries the selected year+month as its
# fourth underscore-separated token; train chips first, then validate chips.
wanted_stamp = year + month
steel_image_year_list = [
    chip_path
    for chip_path in list(steel_image_list1) + list(steel_image_list2)
    if chip_path.split('/')[-1].split('_')[3] == wanted_stamp
]

In [None]:
# Move (not copy) the selected chips into the staging folder; afterwards they
# are no longer present in the extracted dataset folders.
steel_dest = steel_only_path+'/train/steel/'
for chip_path in steel_image_year_list:
    shutil.move(chip_path, steel_dest)

In [None]:
# Build a data bunch over the staged steel chips (all of them sit under
# train/steel) and normalize with the ImageNet statistics.
data_steel = ImageDataBunch.from_folder(
    steel_only_path,
    train='train',
    bs=16,
    num_workers=0,
    seed=42,
).normalize(imagenet_stats)

In [None]:
# Score every staged steel chip with the multiclass model and record, per chip,
# the probability at index 2 and the uid parsed from the file name.
prob_steel, uid_steel = [], []
steel_cnt = len(data_steel.train_ds)

for idx in range(steel_cnt):
    chip = data_steel.train_ds.x[idx]
    prediction = multi_model.predict(chip)
    # Third element of the prediction holds the per-class values; index 2 is
    # presumably the steel class — TODO confirm against the model's class order.
    class_probs = to_np(prediction[2])
    prob_steel.append(class_probs[2].item())

    # uid is the first underscore-separated token of the chip file name.
    chip_file = str(data_steel.items[idx]).split('/')[-1]
    uid_steel.append(chip_file.split('_')[0])

In [None]:
# One row per scored steel chip: uid, constant site type, and model probability.
fastai_steel_pdf = pd.DataFrame(
    dict(uid=uid_steel, site_type='steel', fastai_prob=prob_steel)
)

In [None]:
# Sanity check: number of steel chips scored for the selected month
# (chips were gathered from both the train and validate folders).
print("Count of steel plants in training set: ", len(fastai_steel_pdf))

## Get Scores from Deployment Script (PNG)

In [None]:
# Load the deployment-script scores (GeoJSON) for the selected year/month.
png_scores_gdf = gpd.read_file(png_score_file)

In [None]:
# Cement rows of the deployment scores, reduced to uid / site type / cement
# probability, with the probability column renamed to `png_prob`.
cement_mask = png_scores_gdf['site_type'] == 'cement'
tmp = png_scores_gdf[cement_mask]
png_cement_pdf = pd.DataFrame(
    dict(uid=tmp['uid'], site_type=tmp['site_type'], png_prob=tmp['cement_prob'])
)

In [None]:
# Sanity check: number of cement rows in the deployment-script scores.
print("Count of cement plants in png scored set: ", len(png_cement_pdf))

In [None]:
# Steel rows of the deployment scores, reduced to uid / site type / steel
# probability, with the probability column renamed to `png_prob`.
steel_mask = png_scores_gdf['site_type'] == 'steel'
tmp = png_scores_gdf[steel_mask]
png_steel_pdf = pd.DataFrame(
    dict(uid=tmp['uid'], site_type=tmp['site_type'], png_prob=tmp['steel_prob'])
)

In [None]:
# Sanity check: number of steel rows in the deployment-script scores.
print("Count of steel plants in png scored set: ", len(png_steel_pdf))

## Compare

In [None]:
# Join the two score sets per plant on `uid`. The outer join keeps plants scored
# by only one pipeline. Both inputs also carry a `site_type` column, so the
# result holds a suffixed copy from each side.
cement_compare_pdf = pd.merge(
    left=fastai_cement_pdf, right=png_cement_pdf, on='uid', how='outer'
)
steel_compare_pdf = pd.merge(
    left=fastai_steel_pdf, right=png_steel_pdf, on='uid', how='outer'
)

In [None]:
# Scatter of notebook-derived (fastai) score vs deployment-script (PNG) score
# for each cement plant; points on the diagonal mean the two pipelines agree.
# Title corrected: this notebook compares the Sentinel-2 model, not Landsat-8.
plt.title("Cement: Sentinel-2 Model Score Comparison")
plt.xlabel("Fastai Model Score")
plt.ylabel("Comparison Model Score")
plt.plot(cement_compare_pdf.fastai_prob, cement_compare_pdf.png_prob, 'gx', label='PNG Model Scores')
plt.legend()
plt.show()

In [None]:
# Scatter of notebook-derived (fastai) score vs deployment-script (PNG) score
# for each steel plant; points on the diagonal mean the two pipelines agree.
# Title corrected: this notebook compares the Sentinel-2 model, not Landsat-8.
plt.title("Steel: Sentinel-2 Model Score Comparison")
plt.xlabel("Fastai Model Score")
plt.ylabel("Comparison Model Score")
plt.plot(steel_compare_pdf.fastai_prob, steel_compare_pdf.png_prob, 'gx', label='PNG Model Scores')
plt.legend()
plt.show()