# **Step 2: K-fold cross-validation**

In [None]:
# import Google Earth Engine API
import ee
# Trigger the Earth Engine authentication flow for this session.
ee.Authenticate()
# Initialize the Earth Engine client library for the given Cloud project.
# NOTE(review): the project ID appears as '...' here — presumably elided; supply the real project ID before running.
ee.Initialize(project='...')

In [None]:
import geemap #for plotting interactive maps (includes folium)
from ee import batch # for exporting maps/images to google drive
import matplotlib.pyplot as plt # for plotting the histogram
import seaborn as sns # for plotting the histogram
import pandas as pd # for plotting the histogram
import geopandas as gpd
import numpy as np # for further image calculation
import math #for tile processing
import folium
import os
from math import ceil

In [None]:
# Load the two training-data assets created in step 1 as FeatureCollections.
training_data_1, training_data_2 = (
    ee.FeatureCollection(asset_id)
    for asset_id in (
        'projects/.../assets/training_data_1',
        'projects/.../assets/training_data_2',
    )
)

In [None]:
# Join the two training collections into a single FeatureCollection that the
# cross-validation cells below split into folds.
training_data = training_data_1.merge(training_data_2)

In [None]:
# Add a 'random' column with random numbers from 0 to 1 to every feature; the
# cross-validation cells below assign folds by slicing this column into
# equal-width intervals. No seed is passed here, so the API's default applies.
training_data = training_data.randomColumn('random')

In [None]:
# Feature properties passed to the classifier as inputProperties.
bands = [
    'ndvi',
    'ndwi',
    'elevation',
    'tpi',
    'slope',
    'tcg',
    'NDPI',
    'VV_gamma0',
    'VH_gamma0',
    'reip',
    'ari',
    'ecoregion',
]

In [None]:
# Parameters for k-fold cross-validation
num_folds = 5

# List to hold evaluation results for each fold (confusion matrices)
confusion_matrices = []

# Fixed class axis shared by every fold: 0 .. highest 'class' label found in
# the full training set. Without an explicit order, errorMatrix sizes the
# matrix from the largest value present in the collection it is called on, so
# a test fold that happens to miss the highest class(es) would produce a
# smaller matrix and a CSV whose rows/columns do not line up with the other
# folds. The values are cast to int to match the casts applied to the labels.
max_class = ee.Number(training_data.aggregate_max('class')).int()
class_order = ee.List.sequence(0, max_class).map(
    lambda value: ee.Number(value).int()
)


def _labels_to_int(feature):
    """Return `feature` with 'classification' and 'class' cast to integers."""
    return (
        feature
        .set('classification', ee.Number(feature.get('classification')).int())
        .set('class', ee.Number(feature.get('class')).int())
    )


def _confusion_matrix_to_fc(matrix):
    """Turn an ee.ConfusionMatrix into a FeatureCollection for CSV export.

    One geometry-less feature is produced per matrix row. Each feature carries
    the row's values as properties 'predicted_0', 'predicted_1', ... (column
    index) plus an 'actual' property of the form 'actual_<row index>'.
    """
    rows = matrix.array().toList()  # Server-side list of lists

    def row_to_feature(actual_class):
        row = ee.List(rows.get(actual_class))
        # Property names 'predicted_0', 'predicted_1', ... from integer indices
        column_names = ee.List.sequence(0, row.length().subtract(1)).map(
            lambda i: ee.String("predicted_").cat(ee.Number(i).toInt().format())
        )
        row_dict = ee.Dictionary.fromLists(column_names, row)
        actual_label = ee.String("actual_").cat(ee.Number(actual_class).toInt().format())
        return ee.Feature(None, row_dict.set('actual', actual_label))

    return ee.FeatureCollection(
        ee.List.sequence(0, rows.length().subtract(1)).map(row_to_feature)
    )


# Create the k-folds
for fold in range(num_folds):
    # Bounds of the 'random' interval [lower, upper) that defines this test fold
    lower = fold / num_folds
    upper = (fold + 1) / num_folds

    # Test fold is the current fold; training folds are all the others
    testing_fold = training_data.filter(ee.Filter.gte('random', lower)) \
                                .filter(ee.Filter.lt('random', upper))

    training_fold = training_data.filter(ee.Filter.Or(
        ee.Filter.lt('random', lower),
        ee.Filter.gte('random', upper)
    ))

    # Train the classifier on the training fold
    classifier = ee.Classifier.smileRandomForest(numberOfTrees=50).train(
        features=training_fold,
        classProperty='class',
        inputProperties=bands
    )

    # Apply the classifier to the testing fold
    classified_test = testing_fold.classify(classifier)

    # Ensure class labels and predictions are integers before computing the
    # confusion matrix (both casts done in a single pass over the collection)
    classified_test = classified_test.map(_labels_to_int)

    # Compute the confusion matrix on the testing fold with the fixed class axis
    confusion_matrix = classified_test.errorMatrix('class', 'classification', class_order)

    # Round and convert confusion matrix values to integers
    rounded_confusion_matrix = confusion_matrix.array().round().int()

    # Convert back to ConfusionMatrix object for export purposes
    final_confusion_matrix = ee.ConfusionMatrix(rounded_confusion_matrix)

    # One feature per matrix row, ready for table export
    matrix_fc = _confusion_matrix_to_fc(final_confusion_matrix)

    # Store the final confusion matrix in the list for potential further use
    confusion_matrices.append(final_confusion_matrix)

    # Export the confusion matrix for this fold to Google Drive as a CSV file
    export_task = ee.batch.Export.table.toDrive(
        collection=matrix_fc,
        description=f'ConfusionMatrix_global_50trees_Fold_{fold + 1}',
        fileNamePrefix=f'confusion_matrix_global_fold_{fold + 1}',
        fileFormat='CSV',
        folder="earth_engine_exports"
    )

    # Start the export task
    export_task.start()

    # Print a message for tracking purposes
    print(f"Export started for confusion matrix of fold {fold + 1}.")


In [None]:
# Parameters for k-fold cross-validation for each mountain region
num_folds = 5
ecoregion_mapping = {
    "Andes": [590, 589, 588, 493, 460, 608],
    "Himalayas": [751, 768, 770],
    "Alps": [689],
    "Rockies": [353, 367, 438],
    "Global": [590, 589, 588, 493, 460, 751, 768, 689, 353, 367, 608, 438, 770]
}

# List to hold confusion matrices for each fold and ecoregion
ecoregion_confusion_matrices = []

# List to collect all test predictions for the overall confusion matrix
# (declared here but not filled by this cell)
all_classified_test = []

# Fixed class axis shared by every fold and ecoregion: 0 .. highest 'class'
# label found in the full training set. Without an explicit order, errorMatrix
# sizes the matrix from the largest value present in the collection it is
# called on, so an ecoregion subset that lacks the highest class(es) would
# produce a smaller matrix and a CSV whose rows/columns do not line up with
# those of the other regions and folds. The values are cast to int to match
# the casts applied to the labels.
max_class = ee.Number(training_data.aggregate_max('class')).int()
class_order = ee.List.sequence(0, max_class).map(
    lambda value: ee.Number(value).int()
)


def _labels_to_int(feature):
    """Return `feature` with 'classification' and 'class' cast to integers."""
    return (
        feature
        .set('classification', ee.Number(feature.get('classification')).int())
        .set('class', ee.Number(feature.get('class')).int())
    )


def _confusion_matrix_to_fc(matrix):
    """Turn an ee.ConfusionMatrix into a FeatureCollection for CSV export.

    One geometry-less feature is produced per matrix row. Each feature carries
    the row's values as properties 'predicted_0', 'predicted_1', ... (column
    index) plus an 'actual' property of the form 'actual_<row index>'.
    """
    rows = matrix.array().toList()  # Server-side list of lists

    def row_to_feature(actual_class):
        row = ee.List(rows.get(actual_class))
        # Property names 'predicted_0', 'predicted_1', ... from integer indices
        column_names = ee.List.sequence(0, row.length().subtract(1)).map(
            lambda i: ee.String("predicted_").cat(ee.Number(i).toInt().format())
        )
        row_dict = ee.Dictionary.fromLists(column_names, row)
        actual_label = ee.String("actual_").cat(ee.Number(actual_class).toInt().format())
        return ee.Feature(None, row_dict.set('actual', actual_label))

    return ee.FeatureCollection(
        ee.List.sequence(0, rows.length().subtract(1)).map(row_to_feature)
    )


# Create the k-folds
for fold in range(num_folds):
    print(f"Processing fold {fold + 1}...")

    # Bounds of the 'random' interval [lower, upper) that defines this test fold
    lower = fold / num_folds
    upper = (fold + 1) / num_folds

    # Test fold is the current fold; training folds are all the others
    testing_fold = training_data.filter(ee.Filter.gte('random', lower)) \
                                .filter(ee.Filter.lt('random', upper))

    training_fold = training_data.filter(ee.Filter.Or(
        ee.Filter.lt('random', lower),
        ee.Filter.gte('random', upper)
    ))

    # Train the classifier on the training fold
    classifier = ee.Classifier.smileRandomForest(numberOfTrees=50).train(
        features=training_fold,
        classProperty='class',
        inputProperties=bands
    )

    # Apply the classifier to the testing fold
    classified_test = testing_fold.classify(classifier)

    # Ensure class labels and predictions are integers before computing the
    # confusion matrices (both casts done in a single pass over the collection)
    classified_test = classified_test.map(_labels_to_int)

    # Iterate over ecoregions and compute confusion matrices for each ecoregion
    for ecoregion_name, ecoregion_ids in ecoregion_mapping.items():
        print(f"Processing ecoregion: {ecoregion_name} in fold {fold + 1}...")

        # Filter the testing fold for the current ecoregion
        ecoregion_test = classified_test.filter(ee.Filter.inList('ecoregion', ecoregion_ids))

        # Check if there are samples in the ecoregion test set. getInfo() is a
        # synchronous request, so this line waits for the server's answer.
        if ecoregion_test.size().getInfo() == 0:
            print(f"No samples for ecoregion {ecoregion_name} in fold {fold + 1}. Skipping...")
            continue

        # Compute the confusion matrix for the ecoregion with the fixed class axis
        ecoregion_confusion_matrix = ecoregion_test.errorMatrix('class', 'classification', class_order)

        # Round and convert confusion matrix values to integers
        rounded_ecoregion_confusion_matrix = ecoregion_confusion_matrix.array().round().int()

        # Convert back to ConfusionMatrix object for export purposes
        final_ecoregion_confusion_matrix = ee.ConfusionMatrix(rounded_ecoregion_confusion_matrix)

        # Store the confusion matrix for this fold and ecoregion
        ecoregion_confusion_matrices.append({
            "fold": fold + 1,
            "ecoregion": ecoregion_name,
            "confusion_matrix": final_ecoregion_confusion_matrix
        })

        # One feature per matrix row, ready for table export
        matrix_fc_ecoregion = _confusion_matrix_to_fc(final_ecoregion_confusion_matrix)

        # Export the confusion matrix for the ecoregion to Google Drive
        export_task_ecoregion = ee.batch.Export.table.toDrive(
            collection=matrix_fc_ecoregion,
            description=f'ConfusionMatrix_{ecoregion_name}_Fold_{fold + 1}',
            fileNamePrefix=f'confusion_matrix_{ecoregion_name}_fold_{fold + 1}',
            fileFormat='CSV',
            folder="earth_engine_exports"
        )

        # Start the export task for the ecoregion confusion matrix
        export_task_ecoregion.start()

        print(f"Export started for {ecoregion_name} confusion matrix in fold {fold + 1}.")
