# Build Points for Accuracy Assessment
Goal: Drop random points for 4 classes (Development, Tree Canopy, Vegetation, Wetlands) where change occurred and where no change occurred.

In [None]:
# imports
import numpy as np
import rasterio as rio
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import math

# --- inputs -----------------------------------------------------------------
# directory holding the raster truth data (one accuracy raster per land cover)
folder = r"C:\Users\smcdonald\Documents\Data\GEE\Wicomico\AccuracyData"

# directory holding the LandTrendr result rasters
lt_folder = r"C:\Users\smcdonald\Documents\Data\GEE\Wicomico\LandTrendr"

# --- sampling design --------------------------------------------------------
# the 4 land covers (LCs) of interest
lcs = [
    'dev',
    'TC',
    'veg',
    'wet',
]

# samples drawn per class AND per change type
# (e.g. 50 no-change TC points plus 50 changed TC points)
num_samples = 50

# --- LandTrendr outputs -----------------------------------------------------
# bands/indices LandTrendr was run on (one result raster each)
lt_results = [
    'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6', 'SR_B7',
    'NDVI', 'NBR', 'NDMI', 'NDBI',
    'TCB', 'TCG', 'TCW',
]

# band names inside every LT result raster, in raster band order
bands = [
    'stYr',
    'endYr',
    'stVal',
    'endVal',
    'mag',
    'dur',
    'rate',
]

### Helper Functions

In [None]:
def random_sample(ary, values, num_samples):
    """
    random_sample randomly select cells of the array whose value is one of the values passed.
                  Every matching cell is equally likely and no cell is selected twice.

    At least four candidate cells are required per requested sample. When fewer
    are available the sample size is reduced to floor(candidates / 4); if that
    leaves no samples at all, nothing is drawn. A message is printed in both cases.

    Parameters
    ----------
    ary : np.array
        2D array to build points from
    values : list
        raster values to sample
    num_samples : int
        number of samples to select

    Returns
    -------
    list
        list of (row, col) tuples indexing the original array; empty when no
        samples could be drawn
    """
    # row/col indices of every cell holding one of the requested values
    candidates = np.nonzero(np.isin(ary, values))
    rows, cols = candidates[0], candidates[1]
    n_cells = len(rows)

    # validate there are enough cells to sample
    if n_cells == 0:
        print(f"\tNo cells to sample - skipping")
        return []

    ns = num_samples
    if n_cells < num_samples * 4:
        # keep a 4:1 ratio of candidate cells to samples
        ns = n_cells // 4
        if ns < 1:
            print(f"\tNot enough cells to sample {n_cells} - skipping")
            return []
        print(f"\tNot enough cells to sample - reduced to {ns} from {n_cells} cells")

    # uniform draw without replacement over the candidate positions; an
    # explicit probability vector is unnecessary because all weights are equal
    picks = np.random.choice(n_cells, ns, replace=False)

    # list of (row, col) tuples
    return list(zip(rows[picks], cols[picks]))

def build_points(src, ary, samples, lc):
    """
    build_points Convert (row, col) raster indices to cell-center points. Constructs a geopandas
                 geodataframe of the points and extracts the value of the array at each one.
                 Expects the array to have values 1,2,3, where 1 is no change, 2 is gain, and
                 3 is loss (for the specified lc). Any other value is reported with a printed
                 message and its 'lc' label is left empty.

    Parameters
    ----------
    src : rasterio object
        open source object of the raster; its metadata supplies the CRS and affine transform
    ary : numpy.array
        array used to create samples.
    samples : list
        list of (row, col) raster indices
    lc : str
        name of lc type that was sampled. Used for naming conventions.

    Returns
    -------
    gpd.GeoDataFrame
        Point database of the samples with columns 'value' (raster value sampled) and
        'lc' (lc, f"{lc}_gain" or f"{lc}_loss"), in the CRS of the raster.
    """
    # CRS and affine transform come from the raster metadata
    meta = src.meta
    crs = meta['crs']
    transform = meta['transform']

    # Use an EPSG string when the CRS maps to one; otherwise fall back to WKT so
    # rasters without an EPSG code still produce a georeferenced result.
    gdf_crs = None
    if crs is not None:
        epsg_code = crs.to_epsg()
        gdf_crs = f"EPSG:{epsg_code}" if epsg_code is not None else crs.to_wkt()

    # iterate through the samples to construct their x,y points
    points = []  # list to store points
    values = []  # list of the raster value, same index as points
    for sample in samples:
        row, col = sample

        # the affine transform maps (col, row) to (x, y); the half-cell offset
        # moves the point from the cell's upper-left corner to its center
        x, y = transform * (col + 0.5, row + 0.5)
        points.append(Point((x, y)))

        # extract raster value
        val = ary[row][col]
        if val not in [1, 2, 3]:
            print(f"ERROR SAMPLE: {val}. {sample}")
        values.append(val)

    # convert to geodataframe
    point_gdf = gpd.GeoDataFrame(geometry=points, data={'value': values}, crs=gdf_crs)

    # define string name; values outside 1-3 are left without a label
    point_gdf['lc'] = point_gdf['value'].map({1: lc, 2: f"{lc}_gain", 3: f"{lc}_loss"})

    # return points
    return point_gdf

def extract_values_at_points(boi, gdf):
    """
    extract_values_at_points Sample the LandTrendr raster of one band/index at every point of the
                             geodataframe and store the sampled values as new columns, one per
                             entry of the module-level `bands` list. The geodataframe passed in
                             is modified in place and also returned.

    New columns are named f"{prefix}_{band}", where the prefix is the band of interest
    itself, or only its last '_'-separated part when the name contains 'SR'
    (e.g. 'SR_B1' -> 'B1').

    Parameters
    ----------
    boi : str
        Name of band of interest. Used to construct raster path to sample.
    gdf : gpd.GeoDataFrame
        Point database.

    Returns
    -------
    gpd.GeoDataFrame
        Point database with added columns of sampled raster values.
    """
    # column prefix: surface reflectance names keep only their trailing part
    prefix = boi.split('_')[-1] if 'SR' in boi else boi
    new_cols = [f"{prefix}_{band_name}" for band_name in bands]

    # x,y pairs of every point, in row order
    geoms = gdf["geometry"]
    coord_list = list(zip(geoms.x, geoms.y))

    # sample the LT result raster of this band/index at each coordinate
    # NOTE(review): assumes the raster has exactly len(bands) bands - confirm
    with rio.open(f"{lt_folder}/LT_Wico_{boi}.tif") as src:
        gdf[new_cols] = list(src.sample(coord_list))

    # return the points with data extracted
    return gdf


### Iterate the Rasters and Randomly Sample Points

In [None]:
# list of geodataframes, one per land cover
gdf_list = []

# iterate over the land covers
for lc in lcs:
    print(f"Starting {lc}...")
    ras_path = f"{folder}/Wico_AA_{lc}_30m.tif"

    # open the raster
    with rio.open(ras_path) as src:
        # read in the array (first band)
        ary = src.read(1)

        # pull samples for change (raster values 2 and 3)
        chg_samples = random_sample(ary, [2, 3], num_samples)

        # pull samples for no change (raster value 1)
        st_samples = random_sample(ary, [1], num_samples)

        # merge samples into a single list of indices, create the points and
        # extract the cell values
        gdf_list.append(build_points(src, ary, chg_samples + st_samples, lc))

# concat gdfs into single gdf; ignore_index gives every point a unique row
# label instead of repeating 0..n-1 once per land cover
sample_points = pd.concat(gdf_list, ignore_index=True).pipe(gpd.GeoDataFrame)

# extract LT results for the points
for b in lt_results:
    sample_points = extract_values_at_points(b, sample_points)

# write points
sample_points.to_file(f"{folder}/AA_points.shp")


### Identify max values per spectral index to normalize between 0 and 1 (or -1 and 1)

In [None]:
# to normalize data - define max absolute value per band/index
# TODO this should be moved up when reading these in the first time
normalize_ = {}

# 1-based raster band numbers of the start and end values, looked up from the
# `bands` list so they follow the band order defined there
value_bands = [bands.index(name) + 1 for name in ('stVal', 'endVal')]

for b in lt_results:
    # normalized difference indices are already normalized - no max needed
    if b in ['NDVI', 'NBR', 'NDMI', 'NDBI']:
        continue

    # read in ary
    ras_path = f"{lt_folder}/LT_Wico_{b}.tif"
    with rio.open(ras_path) as src:
        mx = 0.0
        for i in value_bands:  # start and end values
            # read the band for this iteration
            ary = src.read(i)

            # largest absolute value in the band, ignoring NaNs; cast to float
            # first so abs() of the most negative integer cannot overflow
            m1 = max(abs(float(np.nanmax(ary))), abs(float(np.nanmin(ary))))
            ary = None  # release the array before reading the next band

            # update max
            if m1 > mx:
                mx = m1

        # set max value in the dict
        normalize_[b] = mx

        print(b, mx)

### Normalize sample point change magnitudes

In [None]:
# reload the points from disk when they are not in memory; looking the name up
# through globals() also covers a fresh session where it was never defined
if globals().get('sample_points') is None:
    sample_points = gpd.read_file(f"{folder}/AA_points.shp")

# normalize magnitudes: divide each magnitude column by the max value found for
# its band/index and store the result in a new "<column>_n" column
for n, max_val in normalize_.items():
    # surface reflectance columns are named after the trailing part only
    # (e.g. 'SR_B1' -> 'B1_mag')
    if 'SR_' in n:
        c = f"{n.split('_')[-1]}_mag"
    else:
        c = f"{n}_mag"

    sample_points.loc[:, f"{c}_n"] = sample_points[c] / max_val

### Calculate Producer's and User's Accuracies
Calculate PA and UA for change versus no change for each class.
Construct matrix for each band/index and lc combination.

In [None]:

import os

# smallest absolute magnitude that counts as change detected by LandTrendr
chg_threshold = 0.05


def _ratio(numerator, denominator):
    """Return numerator / denominator rounded to 4 places, or NaN when the denominator is 0."""
    return round(numerator / denominator, 4) if denominator else np.nan


# make sure the output folder for the matrices exists
matrix_folder = f"{folder}/tables/matrices"
os.makedirs(matrix_folder, exist_ok=True)

# table to store results
# count names are <LandTrendr result>_<truth>, e.g. c_nc = LT says change, truth says no change
df = pd.DataFrame(columns=['landcover', 'index', 'nc_nc', 'c_c', 'nc_c', 'c_nc', 'com_nc', 'com_c', 'om_nc', 'om_c'])

for lc in lcs:
    # truth labels: lc itself is no change, lc_gain / lc_loss are change
    truth_nc = sample_points['lc'] == lc
    truth_c = sample_points['lc'].isin([f"{lc}_gain", f"{lc}_loss"])

    for b in lt_results:
        # magnitude column to test: normalized ("_n") except for the
        # normalized difference indices, which are used as-is
        if 'SR' in b:
            boi = f"{b.split('_')[-1]}_mag_n"
        elif b in ['NDVI', 'NDMI', 'NDBI', 'NBR']:
            boi = f"{b}_mag"
        else:
            boi = f"{b}_mag_n"
        index_name = boi.split('_')[0]

        # LandTrendr result per point; NaN magnitudes fall in neither group
        abs_mag = sample_points[boi].abs()
        lt_nc = abs_mag < chg_threshold
        lt_c = abs_mag >= chg_threshold

        # calculate number of agreement samples
        nc_nc = int((truth_nc & lt_nc).sum())
        c_c = int((truth_c & lt_c).sum())

        # calculate disagreement samples
        c_nc = int((truth_nc & lt_c).sum())
        nc_c = int((truth_c & lt_nc).sum())

        # user's accuracy (1 - commission error): correct / all points LT put in that class
        com_nc = _ratio(nc_nc, nc_nc + nc_c)
        com_c = _ratio(c_c, c_c + c_nc)

        # producer's accuracy (1 - omission error): correct / all truth points of that class
        om_nc = _ratio(nc_nc, nc_nc + c_nc)
        om_c = _ratio(c_c, c_c + nc_c)

        # add data to table
        df.loc[len(df)] = [lc, index_name, nc_nc, c_c, nc_c, c_nc, com_nc, com_c, om_nc, om_c]

        # create matrix
        # | col truth, rows LT | No change               | Change                      |
        # | No Change          | lc where |band| < 0.05  | lc_chg where |band| < 0.05  |
        # | Change             | lc where |band| >= 0.05 | lc_chg where |band| >= 0.05 |
        matrix = pd.DataFrame({
            # row / result names
            index_name: ["No Change", "Change", "Total", "Producer's Accuracy"],
            # counts per truth class, column total, producer's accuracy
            "No Change": [nc_nc, c_nc, nc_nc + c_nc, om_nc],
            "Change": [nc_c, c_c, nc_c + c_c, om_c],
            # row totals and grand total
            "Total": [nc_nc + nc_c, c_nc + c_c, nc_nc + nc_c + c_nc + c_c, np.nan],
            # user's accuracy per LT class
            "User's Accuracy": [com_nc, com_c, np.nan, np.nan],
        })

        # write matrix
        matrix.to_csv(f"{matrix_folder}/{lc}_{index_name}_matrix.csv", index=False)