In [85]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<img width=50px  src = 'https://apps.fs.usda.gov/lcms-viewer/images/lcms-icon.png'>

# LCMS Map Validation

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/redcastle-resources/lcms-training/blob/main/7-Map_Validation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/redcastle-resources/lcms-training/blob/main/7-Map_Validation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/redcastle-resources/lcms-training/blob/main/7-Map_Validation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>


## Overview


This notebook teaches how to assess the map accuracy of LCMS outputs.

### Objective

In this tutorial, you learn how to assess the map accuracy of LCMS outputs.

This tutorial uses the following Google Cloud services:

- `Google Earth Engine`

The steps performed include:

- Understanding the difference between model and map accuracy
- Simulating map accuracy with k-fold cross-validation

In [195]:
#Module imports
#!python -m pip install geeViz --upgrade
try:
    import geeViz.getImagesLib as getImagesLib
except:
    !python -m pip install geeViz
    import geeViz.getImagesLib as getImagesLib

import geeViz.changeDetectionLib as changeDetectionLib
import geeViz.assetManagerLib as aml
import geeViz.taskManagerLib as tml
import geeViz.gee2Pandas as g2p
import inspect,operator,os
import matplotlib.pyplot as plt
import pandas as pd  


try:
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GroupKFold
    from sklearn.metrics import accuracy_score,classification_report,balanced_accuracy_score,cohen_kappa_score
    from sklearn import metrics 
except:
    !pip install -U scikit-learn
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GroupKFold
    from sklearn.metrics import accuracy_score,classification_report,balanced_accuracy_score,cohen_kappa_score
    from sklearn import metrics 
# from IPython.display import IFrame,display, HTML
# Take the `ee` and `Map` objects exposed by geeViz's getImagesLib so the rest of
# the notebook can refer to them directly
ee, Map = getImagesLib.ee, getImagesLib.Map

# Port used for viewing map outputs; change it here if needed
Map.port = 1235
print('Done')


Done


## Before you begin

### Set your current URL under `workbench_url`
* This will be in your URL/search bar at the top of the browser window you are currently in
* It will look something like `https://1234567890122-dot-us-west3.notebooks.googleusercontent.com/`

### Set a folder to use for all exports under `export_path_root` 
* It will be something like `projects/projectID/assets/someFolder`
* This folder does not have to already exist. If it does not exist, it will be created

In [9]:
# URL of this Workbench instance (copy it from the browser address bar)
# It is assigned to `Map.proxy_url` further down in this notebook
workbench_url = 'https://53c21733d8125e22-dot-us-west3.notebooks.googleusercontent.com'
# Root folder for the training exports; the module-specific asset paths used
# below are built from this value
export_path_root  = 'projects/rcr-gee/assets/lcms-training'

print('Done')

Done


In [177]:
# Asset locations read by this notebook
# The two export paths are created in previous notebooks and must already exist

# Folder holding the extracted TimeSync tables (module 4)
export_timeSync_folder = '{}/lcms-training_module-4_timeSync'.format(export_path_root)

# Collection holding the assembled LCMS outputs (module 6)
export_assembledLCMSOutputs_collection = '{}/lcms-training_module-6_assembledLCMSOutputs'.format(export_path_root)

# Pre-made TimeSync reference data
# Creating this dataset is not covered in this set of notebooks
timeSync_featureCollection = 'projects/lcms-292214/assets/R8/PR_USVI/TimeSync/18_PRVI_AllPlots_TimeSync_Annualized_Table_secLC'

print('Done')

Done


In [178]:
# Point the geeViz map viewer at the Workbench URL set above
Map.proxy_url = workbench_url

# As in Module 5, start by bringing in the reference data
# Raw TimeSync table: its property names identify the fields that are NOT predictors
timeSyncData = ee.FeatureCollection(timeSync_featureCollection)
timeSync_fields = timeSyncData.first().toDictionary().keys().getInfo()

# List the tables found in the TimeSync export folder
assets = ee.data.listAssets({'parent': export_timeSync_folder})['assets']

# You may need to change the permissions for viewing model outputs in geeViz
# Uncomment this if needed
# for asset in assets:aml.updateACL(asset['name'],writers = [],all_users_can_read = True,readers = [])

# Read in each year of extracted TimeSync data and merge into one featureCollection
training_data = ee.FeatureCollection(
    [ee.FeatureCollection(asset['name']) for asset in assets]
).flatten()

# Properties of the first existing LCMS image provide the class names, numbers, and colors
lcms_viz_dict = (
    ee.ImageCollection("USFS/GTAC/LCMS/v2020-6")
    .first()
    .toDictionary()
    .getInfo()
)
print('LCMS class code, names, and colors:',lcms_viz_dict)

# Get the field names for prediction:
# any field that is not in the original TimeSync data is assumed to be a predictor variable
all_fields = training_data.first().toDictionary().keys().getInfo()
predictor_field_names = [name for name in all_fields if name not in timeSync_fields]

# Keep only plots where every predictor is non-null
# (any training plot with missing predictor data will cause the model to fail entirely)
training_data = training_data.filter(ee.Filter.notNull(predictor_field_names))



LCMS class code, names, and colors: {'Change_class_names': ['Stable', 'Slow Loss', 'Fast Loss', 'Gain', 'Non-Processing Area Mask'], 'Change_class_palette': ['3d4551', 'f39268', 'd54309', '00a398', '1b1716'], 'Change_class_values': [1, 2, 3, 4, 5], 'Land_Cover_class_names': ['Trees', 'Tall Shrubs & Trees Mix (SEAK Only)', 'Shrubs & Trees Mix', 'Grass/Forb/Herb & Trees Mix', 'Barren & Trees Mix', 'Tall Shrubs (SEAK Only)', 'Shrubs', 'Grass/Forb/Herb & Shrubs Mix', 'Barren & Shrubs Mix', 'Grass/Forb/Herb', 'Barren & Grass/Forb/Herb Mix', 'Barren or Impervious', 'Snow or Ice', 'Water', 'Non-Processing Area Mask'], 'Land_Cover_class_palette': ['005e00', '008000', '00cc00', 'b3ff1a', '99ff99', 'b30088', 'e68a00', 'ffad33', 'ffe0b3', 'ffff00', 'aa7700', 'd3bf9b', 'ffffff', '4780f3', '1b1716'], 'Land_Cover_class_values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'Land_Use_class_names': ['Agriculture', 'Developed', 'Forest', 'Non-Forest Wetland', 'Other', 'Rangeland or Pasture', 'No

In [182]:
# Now, we'll crosswalk the training fields to numeric codes
# The TimeSync fields are a string by default
# They must be a number for modeling
# Set up lookup dictionaries to convert the names to numeric codes

# TimeSync land cover label -> LCMS Land_Cover class value
# NOTE(review): no label maps to 13 ('Snow or Ice' in the LCMS class list printed above).
# The original literal listed 'BARREN-IMP':12 twice; the duplicate is removed here
# (no behavior change) - confirm whether it was meant to be a snow/ice label.
land_cover_name_code_dict = ee.Dictionary({'TREES':1,
                             'TSHRUBS-TRE':2,
                             'SHRUBS-TRE':3,
                             'GRASS-TREE':4,
                             'BARREN-TRE':5,
                             'TSHRUBS':6,
                             'SHRUBS':7,
                             'GRASS-SHRU':8,
                             'BARREN-SHR':9,
                             'GRASS':10,
                             'BARREN-GRA':11,
                             'BARREN-IMP':12,
                             'WATER':14
                            })

# TimeSync land use label -> LCMS Land_Use class value
land_use_name_code_dict = ee.Dictionary({'Agriculture':1,
                           'Developed':2,
                           'Forest':3,
                           'Non-forest Wetland':4,
                           'Other':5,
                           'Rangeland':6
                          })

# TimeSync change process label -> LCMS Change class value
# (1 = Stable, 2 = Slow Loss, 3 = Fast Loss, 4 = Gain)
change_code_dict = ee.Dictionary({'Debris': 3, 
                                  'Fire': 3, 
                                  'Growth/Recovery': 4, 
                                  'Harvest': 3, 'Hydrology': 3, 
                                  'Mechanical': 3, 
                                  'Other': 3, 
                                  'Spectral Decline': 2, 
                                  'Stable': 1, 
                                  'Structural Decline': 2, 
                                  'Wind/Ice': 3})

# For each LCMS product: the TimeSync field holding the class name ('field')
# and the lookup used to convert that name to a numeric code ('name_code_dict')
reference_field_dict = {'Land_Cover':{'field':'DOM_SEC_LC','name_code_dict':land_cover_name_code_dict},
                        'Land_Use':{'field':'DOM_LU','name_code_dict':land_use_name_code_dict},
                        'Change':{'field':'CP','name_code_dict':change_code_dict,
                                  'fields':['Slow Loss', 'Fast Loss', 'Gain']}
                       }
# Make a function that will get the code for a given name and set it
# We could also use the remap function to accomplish this
def set_class_code(plot,product):
    """Return `plot` with a numeric class code property added for `product`.

    The class name is read from the field listed for `product` in
    `reference_field_dict`, converted to a number with that product's
    `name_code_dict`, and stored under `<field>_Code`.

    Args:
        plot: feature to update (an Earth Engine object providing `get`/`set`).
        product: key of `reference_field_dict` ('Land_Cover', 'Land_Use', or 'Change').

    Returns:
        The result of `plot.set(...)` with the `<field>_Code` property added.
    """
    product_info = reference_field_dict[product]
    source_field = product_info['field']
    class_name = ee.String(plot.get(source_field))
    return plot.set(
        ee.String(source_field).cat('_Code'),
        product_info['name_code_dict'].get(class_name),
    )
            
# set_class_code(training_data.first(),'Land_Cover')
# Add the numeric `<field>_Code` property to every plot, one product at a time
for product in reference_field_dict:
    print('Crosswalking:',product)
    training_data = training_data.map(lambda plot:set_class_code(plot,product))

# Now we will download the training table to a local location

local_model_data_folder = '/tmp/lcms-training/local_modeling'
local_training_csv = os.path.join(local_model_data_folder,'timeSync_training_table.csv')

# Create the folder if it is missing; exist_ok avoids a separate check-then-create step
os.makedirs(local_model_data_folder, exist_ok=True)

# Download the training data from a featureCollection to a local CSV
# This function will automatically break the featureCollection into 5000 feature featureCollections
# if it is larger than the 5000 feature limit set by GEE
# With overwrite = False an existing CSV is reused as-is; delete the file (or set
# overwrite = True) to pick up changes made to the training data
g2p.featureCollection_to_csv(training_data,local_training_csv,overwrite = False)

# Once the table is stored locally, read it in
training_df = pd.read_csv(local_training_csv)

# A bare `training_df.describe()` is only rendered when it is the last expression
# of the cell, so show the summary explicitly
display(training_df.describe())
print('Done')

Crosswalking: Land_Cover
Crosswalking: Land_Use
Crosswalking: Change
/tmp/lcms-training/local_modeling/timeSync_training_table.csv  already exists
Done


In [194]:
# LCMS does not have enough training samples to simply omit 20% or so from training our final models.
# Our assemblage process introduces differences between the model predicted class and the mapped class, and our
# sample is based on a stratified random sample design, so we cannot simply use the out-of-bag samples from the
# random forest model.
# We have to use a method that simulates map accuracy, can account for the likelihood of each sample's inclusion
# (strata weights), and also allows us to introduce any assemblage rules that are not typically part of the
# underlying random forest model.

# Everything produced by the k-fold run is collected here, starting with a copy of the input table
KFoldInfo = {'TrainingData': training_df.copy()}

# Folds are built on PLOTID: GroupKFold never places rows that share a group value
# on both the training and the test side of a fold
groups = training_df['PLOTID'].squeeze()
k = 5
n_jobs = 4
gkf = GroupKFold(n_splits=k)
foldNum = 1
seed = 999
nTrees = 5

# Set up a random forest model (it is only created here; it is not fit inside the loop below)
rf = RandomForestClassifier(n_estimators = nTrees, random_state=seed,oob_score=True,n_jobs = n_jobs)

for train_index, test_index in gkf.split(training_df, training_df, groups):
    print(f'\nFold Number: {foldNum}\n')
    print(len(train_index),len(test_index))

    # Row positions of the training and test samples for this fold
    KFoldInfo[str(foldNum)] = {
        'Indices': {'Train': train_index, 'Test': test_index},
    }

    # TODO: per-fold strata, models, predicted probabilities, and predictions for each
    # product are not stored yet

    # Rows of the table that fall on each side of this fold
    k_train = training_df.iloc[train_index]
    k_test = training_df.iloc[test_index]
    print(len(k_train))

    foldNum += 1


Fold Number: 1

16792 4199
16792

Fold Number: 2

16793 4198
16793

Fold Number: 3

16793 4198
16793

Fold Number: 4

16793 4198
16793

Fold Number: 5

16793 4198
16793


In [None]:
# Another method for computing model accuracy is with cross validation
# This method partitions the data into k parts and leaves one out for each of k iterations
# The held out training points are then used to assess the model accuracy. All held out samples are combined to 
# get the simulated model accuracy
from sklearn.model_selection import cross_val_score

# Build the predictor table (X) and response (y) here so this cell runs on a fresh kernel
# Land_Cover is used as the example product; change `cv_product` to score another one
# NOTE(review): assumes the local CSV keeps the predictor property names and the
# `<field>_Code` property as column names - confirm against the downloaded table
cv_product = 'Land_Cover'
X = training_df[predictor_field_names]
y = training_df[reference_field_dict[cv_product]['field'] + '_Code']

# Split on PLOTID, like the GroupKFold cell above, so rows from one plot are never
# used for both fitting and scoring within the same fold
scores = cross_val_score(rf, X, y, groups=training_df['PLOTID'],
                         cv=GroupKFold(n_splits=10), scoring = 'balanced_accuracy')
print("%0.2f balanced accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))