This script reads in the tif files and splits them into 128x128 chunks for training the model. Note that chunks with no dNBR data are ignored, since we do not want to train on them.
The in path and out path must both point to the location where the tif files were placed after downloading them from the Google Cloud Storage bucket they were exported to from Earth Engine.

Read in packages

In [1]:
import pandas as pd
import os
import numpy as np
import xarray as xr
import rioxarray
import glob
import random
import geopandas as gpd
from sklearn.utils import shuffle
from MightyMosaic import MightyMosaic


First take the input tif files and chunk them to sizes of 128x128 for the neural network.  

In [None]:
"""arguments are path to .tif files to be chunked, out_path to save files, chunk size, and if desired a threshold"""       

#check if all 0
def is_matrix_all_zeros(matrix):
    """Report whether every element of ``matrix`` is equal to zero.

    ``matrix`` is converted with ``np.array`` first, so lists and nested
    lists are accepted as well as arrays. NaN does not compare equal to
    zero, so an input containing NaN is not considered all zeros.
    """
    values = np.array(matrix)

    # "everything is zero" is the same as "nothing differs from zero"
    return np.logical_not(np.any(values != 0))

#in_path to tif files
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85'
# in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_85'

#out_path base; the chunk-folder suffix is appended below based on the parameters
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85'

# out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_85'

#size is the chunk edge length in pixels, threshold toggles the dnbr threshold
#right now everything is set to use 0 dnbr threshold
size = 128
threshold = True

#'_subs_0_<size>' is the folder for thresholded chunks, '_subs_<size>' for unthresholded ones
if threshold:
    out_path = out_path + '_subs_0_' + str(size)
else:
    out_path = out_path + '_subs_' + str(size)

#exist_ok replaces the separate isdir check, so an already existing folder is not an error
os.makedirs(out_path, exist_ok=True)

#every entry of in_path; the loop that consumes this list keeps only the .tif files
files = os.listdir(in_path)

# files = ['median_6169.tif']

#loop through files
for f in files:

    #only the .tif exports are chunked
    if not f.endswith('.tif'):
        continue

    try:

        #file name without the 'median_' prefix
        f2 = f.replace('median_', '')

        #ID of file; int() raises for names that are not 'median_<number>.tif',
        #which sends the file to the except branch below
        f_id = int(f2.replace('.tif', ''))

        #read in file
        in_mat = rioxarray.open_rasterio(os.path.join(in_path, f))

        #now turn in mat to numpy
        in_mat = in_mat.to_numpy().astype(float)

        #convert to band last
        in_mat = np.moveaxis(in_mat, 0, 2)

        #the first band is used as dnbr; skip the whole file if it is all 0
        if is_matrix_all_zeros(in_mat[:, :, 0]):
            continue

        #mosaic
        mosaic = MightyMosaic.from_array(in_mat, (size, size), overlap_factor=1)

        #take off the last row and column of chunks, which give edge effects as they are filled with no data
        mosaic = mosaic[:-1, :-1, :, :, :]

        #first two dimensions are number of chunks, 3 and 4 are the chunk size (size, size) and last is nbands
        for i in range(mosaic.shape[0]):

            for j in range(mosaic.shape[1]):

                fname = str(i) + '_' + str(j) + '_' + f2.replace('.tif', '.npy')
                out_name = os.path.join(out_path, fname)

                #chunks saved by an earlier run are left untouched
                if os.path.exists(out_name):
                    continue

                #predictors are every band except the last one
                in_mat3 = mosaic[i, j, :, :, :-1]

                #turn nan to -999 in the chunk that is actually saved
                #(the fill used to be applied to in_mat, which left NaN in the saved predictors)
                in_mat3[np.isnan(in_mat3)] = -999

                #if only three predictors 3 is the target band, otherwise it is 9
                target = np.array(mosaic[i, j, :, :, 3], dtype=float)

                #turn nan/inf to 0 before the int cast, casting them to int is undefined
                target[~np.isfinite(target)] = 0
                target = target.astype('int')

                #anything that is not 0 or 1 becomes 0
                target[target < 0] = 0
                target[target > 1] = 0

                #dnbr is the first predictor band
                dnbr = in_mat3[:, :, 0]

                #if dnbr is all 0 for this chunk don't save it
                if is_matrix_all_zeros(dnbr):
                    continue

                #if threshold apply dnbr threshold
                if threshold:
                    target[dnbr < 0] = 0

                #saved array is the predictors with the cleaned target as the last band
                e = np.dstack([in_mat3, target])

                np.save(out_name, e)

                print(f)

    except Exception as err:
        #report the failure instead of hiding it, then carry on with the next file
        print(f"Error processing {f}: {err}")

Make faster with Dask

In [None]:
import os
import numpy as np
import rioxarray
import dask
from dask.diagnostics import ProgressBar
from MightyMosaic import MightyMosaic

# Check if all elements in a matrix are zero
def is_matrix_all_zeros(matrix):
    """Return True when ``matrix`` contains nothing but zeros.

    The input is converted with ``np.array`` before the comparison; NaN
    entries do not count as zero.
    """
    zero_mask = np.equal(np.array(matrix), 0)
    return np.all(zero_mask)

# Folder holding the source GeoTIFFs and the base name of the chunk folder
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85'
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85'

# Chunk edge length in pixels and whether the dNBR threshold is applied
size = 128
threshold = True

# The chunk folder name records both the threshold setting and the chunk size
suffix = f'_subs_0_{size}' if threshold else f'_subs_{size}'
out_path = out_path + suffix

# Create the chunk folder on first use
if not os.path.isdir(out_path):
    os.makedirs(out_path)

# Only the .tif entries of the input folder are processed
files = [name for name in os.listdir(in_path) if name.endswith('.tif')]

# Define the function to process a single file
def _clean_target(raw_target):
    """Return a copy of ``raw_target`` as an integer array holding only 0 and 1."""
    labels = np.array(raw_target, dtype=float)
    # Zero NaN/inf before the cast: converting them to int is undefined and
    # checking an int array with np.isnan afterwards can never find anything.
    labels[~np.isfinite(labels)] = 0
    labels = labels.astype('int')
    # Anything that is not 0 or 1 after truncation is treated as 0.
    labels[labels < 0] = 0
    labels[labels > 1] = 0
    return labels


def process_file(f):
    """Split one GeoTIFF into ``size`` x ``size`` chunks and save each as ``.npy``.

    Reads the module-level settings ``in_path``, ``out_path``, ``size`` and
    ``threshold``. For every chunk the saved array is the predictor bands
    (all bands but the last, NaN replaced by -999) with the cleaned 0/1
    target stacked on as the final band. Files and chunks whose first band
    (used here as dNBR) is entirely zero are skipped, as are chunks whose
    output file already exists.

    Parameters
    ----------
    f : str
        File name inside ``in_path``, expected to look like
        ``median_<number>.tif``.

    Returns
    -------
    None
        Results are written to ``out_path``. Any exception is caught and
        reported with ``print`` so one bad file does not stop the others.
    """
    try:
        f2 = f.replace('median_', '')
        # int() acts as a name check: anything other than a numeric ID raises
        # and is reported by the except branch instead of being chunked.
        int(f2.replace('.tif', ''))

        in_mat = rioxarray.open_rasterio(os.path.join(in_path, f)).to_numpy().astype(float)
        # Move the band axis from first to last.
        in_mat = np.moveaxis(in_mat, 0, 2)

        # Nothing to train on when the first band is zero everywhere.
        if is_matrix_all_zeros(in_mat[:, :, 0]):
            return

        mosaic = MightyMosaic.from_array(in_mat, (size, size), overlap_factor=1)
        # Drop the last row and column of chunks.
        # NOTE(review): presumably these are the padded edge chunks - confirm.
        mosaic = mosaic[:-1, :-1, :, :, :]

        chunk_suffix = f2.replace('.tif', '.npy')
        for i in range(mosaic.shape[0]):
            for j in range(mosaic.shape[1]):
                out_name = os.path.join(out_path, str(i) + '_' + str(j) + '_' + chunk_suffix)

                # Chunks written by an earlier run are left untouched.
                if os.path.exists(out_name):
                    continue

                in_mat3 = mosaic[i, j, :, :, :-1]
                raw_target = mosaic[i, j, :, :, 3]

                in_mat3[np.isnan(in_mat3)] = -999
                target = _clean_target(raw_target)

                dnbr = in_mat3[:, :, 0]
                if is_matrix_all_zeros(dnbr):
                    continue

                if threshold:
                    # Pixels with negative dNBR (including the -999 fill) are labelled 0.
                    target[dnbr < 0] = 0

                np.save(out_name, np.dstack([in_mat3, target]))
                print(f"Processed {f}")
    except Exception as err:
        print(f"Error processing {f}: {err}")

# Queue one delayed process_file call per GeoTIFF
tasks = []
for tif_name in files:
    tasks.append(dask.delayed(process_file)(tif_name))

# Run every queued task, showing a progress bar while they execute
with ProgressBar():
    dask.compute(*tasks)


[                                        ] | 0% Completed | 1.67 s msProcessed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
[                                        ] | 0% Completed | 1.77 sProcessed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
[                                        ] | 0% Completed | 1.87 sProcessed median_8566.tif
Processed median_8566.tif
Processed median_8566.tif
[                                        ] | 0% Completed | 2.79 sProcessed median_9543.tif
Processed median_9543.tif
Processed median_9543.tif
[                                        ] | 0% Completed | 2.89 sProcessed median_9543.tif
Processed median_9543.tif
Processed median_9543.tif
Processed median_9543.tif
Processed median_9543.tif
Processed median_9543.tif
Processed median_

In [1]:
't'

't'

Now take those chunk files and save CSVs listing the file names for the 80/10/10 training, validation, and testing splits.

In [2]:

def listdir_fullpath(d):
    """List the entries of directory ``d``, each joined onto ``d``.

    The entries come back in the order ``os.listdir`` reports them.
    """
    full_paths = []
    for entry in os.listdir(d):
        full_paths.append(os.path.join(d, entry))
    return full_paths

#list of files
# file_names = listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_1985_sent_harm_subs_128')
#sorted first so the split does not depend on the arbitrary order os.listdir returns
file_names = sorted(listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85_subs_0_128'))


#shuffle file_names
file_names = shuffle(file_names, random_state = 555)


print(len(file_names))

random.seed(555)

#get train at 80%
train_files = random.sample(file_names, k=round(len(file_names) * 0.8))

#files not used for training, kept in list order
#(a set difference has no stable order between runs, which would make the seeded sample below unrepeatable)
train_set = set(train_files)
remaining_files = [name for name in file_names if name not in train_set]

#validation files at 10%
val_files = random.sample(remaining_files, k=round(len(file_names) * 0.1))

#test files are whatever is left, roughly 10%
val_set = set(val_files)
test_files = [name for name in remaining_files if name not in val_set]


#convert the lists to pandas dataframes
train = pd.DataFrame({'Files': train_files})
val = pd.DataFrame({'Files': val_files})
test = pd.DataFrame({'Files': test_files})

print(train.shape)
print(val.shape)
print(test.shape)

train.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85_training_files.csv')
val.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85_validation_files.csv')
test.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85_testing_files.csv')


649494
(519595, 1)
(64949, 1)
(64950, 1)


In [1]:
import rioxarray 
import xarray as xr

# Open one of the source tiles and show the values of its band coordinate
tile_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85/median_8.tif'
rds = rioxarray.open_rasterio(tile_path)
band_values = rds.band.values
print(f"Band names: {band_values}")


Band names: [1 2 3 4]


In [2]:
rds

In [1]:
library(terra)


# Open the same tile with terra and list its layer names
rds <- rast('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_85/median_8.tif')

names(rds)

terra 1.7.71

