This script reads in the .tif files and splits them into 128x128 chunks for training the model. Chunks with no dNBR data are ignored, since we would not want to train on them.
in_path and out_path must point to the directory where the .tif files were placed after being exported from Earth Engine and downloaded from the Google Cloud Storage bucket.

Read in packages

In [1]:
import pandas as pd
import os
import numpy as np
import xarray as xr
import rioxarray
import glob
import random
import geopandas as gpd
from sklearn.utils import shuffle
from MightyMosaic import MightyMosaic


The files are currently named median_i.tif.tif; rename them to median_i.tif.

In [3]:
import os
import re

# Directory holding the exported rasters whose names end in '.tif.tif'
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old'

# Take the directory listing once, before anything is renamed
files = os.listdir(in_path)

# Rename every '<name>.tif.tif' entry to 'median_<number>.tif'
for file_name in files:
    # Entries without the doubled extension are left alone
    if not file_name.endswith('.tif.tif'):
        continue

    # The first run of digits in the name is used as the file number
    digits = re.search(r'\d+', file_name)
    if digits is None:
        continue

    tile_number = digits.group(0)
    new_name = f'median_{tile_number}.tif'

    # Source and destination live in the same directory
    old_file = os.path.join(in_path, file_name)
    new_file = os.path.join(in_path, new_name)

    os.rename(old_file, new_file)
    print(f'Renamed: {old_file} -> {new_file}')


Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10014.tif.tif -> /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10014.tif
Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10013.tif.tif -> /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10013.tif
Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10017.tif.tif -> /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10017.tif
Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10020.tif.tif -> /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10020.tif
Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10012.tif.tif -> /explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_old/median_10012.tif
Renamed: /explore/nobackup/people/spotter5/cnn_mapping/nbac_

First take the input tif files and chunk them to sizes of 128x128 for the neural network.  

In [None]:
"""arguments are path to .tif files to be chunked, out_path to save files, chunk size, and if desired a threshold"""       

#check if all 0
def is_matrix_all_zeros(matrix):
    """Return whether every element of ``matrix`` equals zero.

    ``matrix`` is converted with ``np.array`` first, so nested lists are
    accepted as well as arrays.  NaN never compares equal to zero, so any NaN
    makes the result False; an empty input gives True.
    """
    # Element-wise comparison against zero, then reduce over all elements
    zero_mask = np.array(matrix) == 0
    return np.all(zero_mask)

#directory holding the .tif files to be chunked
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old'

#base of the output directory name; a suffix is appended below
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old'

#chunk edge length, and the flag selecting the 0 dnbr threshold variant
#(right now everything is set to use the 0 dnbr threshold)
size = 128
threshold = True

#'<base>_subs_0_<size>' with the threshold, '<base>_subs_<size>' without
suffix = '_subs_0_' if threshold else '_subs_'
out_path = out_path + suffix + str(size)

#create the output directory if it is not there yet
os.makedirs(out_path, exist_ok=True)

files = os.listdir(in_path)

#uncomment to debug a single file
# files = ['median_6169.tif']
#loop through files: cut every .tif in in_path into size x size chunks and
#save each kept chunk to out_path as '<row>_<col>_<id>.npy'
for f in files:

    if not f.endswith('.tif'):
        continue

    try:
        #'median_<id>.tif' -> '<id>.tif'
        f2 = f.replace('median_', '')

        #ID of file; int() doubles as a filename check, since a name that is
        #not 'median_<integer>.tif' raises ValueError and the file is skipped
        f_id = int(f2.replace('.tif', ''))

        #every chunk cut from this file ends with the same '<id>.npy'
        chunk_suffix = f2.replace('.tif', '.npy')

        #read in file as a float array and convert to band last
        in_mat = rioxarray.open_rasterio(os.path.join(in_path, f))
        in_mat = in_mat.to_numpy().astype(float)
        in_mat = np.moveaxis(in_mat, 0, 2)

        #band 0 is treated as dnbr; skip the file when it is all 0
        if is_matrix_all_zeros(in_mat[:, :, 0]):
            continue

        #mosaic
        mosaic = MightyMosaic.from_array(in_mat, (size, size), overlap_factor=1)

        #take off the last row and column of tiles which, per the original
        #note, give edge effects as they are filled with no data
        #NOTE(review): rasters yielding a single tile along an axis appear to
        #raise after this slice (cf. the 'i_begin (0) should be less than
        #i_end (0)' messages in the dask cell output) - confirm whether such
        #rasters should really be dropped
        mosaic = mosaic[:-1, :-1, :, :, :]

        #first two dimensions are number of chunks, 3 and 4 are the chunk
        #size (size, size) and last is nbands
        for i in range(mosaic.shape[0]):

            for j in range(mosaic.shape[1]):

                out_name = os.path.join(out_path, f'{i}_{j}_{chunk_suffix}')

                #never overwrite a chunk written by an earlier run
                if os.path.exists(out_name):
                    continue

                #predictors: every band except the last
                in_mat3 = mosaic[i, j, :, :, :-1]

                #label band, hard-coded as index 3
                #(presumably the last band of a 4-band raster - TODO confirm)
                target = mosaic[i, j, :, :, 3]

                #zero NaN/inf labels BEFORE the integer cast: casting NaN to
                #int is undefined and np.isnan on an int array finds nothing
                target = np.where(np.isfinite(target), target, 0).astype('int')

                #anything that is not exactly 0 or 1 becomes 0
                target[~np.isin(target, [0, 1])] = 0

                #turn nan to -999 in the chunk that is saved (the original
                #filled the full raster in_mat, leaving NaN in the chunk)
                in_mat3[np.isnan(in_mat3)] = -999

                #dnbr is the first band of the chunk
                dnbr = in_mat3[:, :, 0]

                #a chunk with no dnbr signal is not worth training on
                if is_matrix_all_zeros(dnbr):
                    continue

                #if threshold apply dnbr threshold
                if threshold:
                    target[dnbr < 0] = 0

                #predictor bands followed by the cleaned label band
                e = np.dstack([in_mat3, target])

                np.save(out_name, e)

                print(f)

    except Exception as err:
        #report and move on to the next file; KeyboardInterrupt is no longer
        #swallowed, so the loop can be stopped from the notebook
        print(f"Error processing {f}: {err}")


KeyboardInterrupt

Exception ignored in: 'rasterio._env.log_error'
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/deeplearning3/lib/python3.10/logging/__init__.py", line 1467, in info
    def info(self, msg, *args, **kwargs):
KeyboardInterrupt: 

KeyboardInterrupt

Exception ignored in: 'rasterio._env.log_error'
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/deeplearning3/lib/python3.10/logging/__init__.py", line 1467, in info
    def info(self, msg, *args, **kwargs):
KeyboardInterrupt: 


Faster with dask

In [5]:
import os
import numpy as np
import rioxarray
import dask
from dask.diagnostics import ProgressBar
from MightyMosaic import MightyMosaic

# Check if all elements in a matrix are zero
def is_matrix_all_zeros(matrix):
    """Return whether ``matrix`` contains nothing but zeros.

    The input is converted with ``np.array`` before the comparison.  A NaN
    entry is not equal to zero and therefore yields False; an empty input
    yields True.
    """
    return (np.array(matrix) == 0).all()

# Source directory and the base name of the chunk directory
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old'
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old'

# Chunk edge length and the switch for the dNBR threshold variant
size = 128
threshold = True

# '<base>_subs_0_<size>' with the threshold, '<base>_subs_<size>' without
out_path = out_path + ('_subs_0_' if threshold else '_subs_') + str(size)

# Make sure the output directory exists
os.makedirs(out_path, exist_ok=True)

# Only entries ending in '.tif' are queued for processing
files = [name for name in os.listdir(in_path) if name.endswith('.tif')]

# Define the function to process a single file
def process_file(f):
    """Cut one raster into ``size`` x ``size`` chunks and save them as .npy.

    Reads ``in_path/f``, tiles it with MightyMosaic and writes every kept
    chunk to ``out_path`` as ``<row>_<col>_<id>.npy``.  Chunks whose output
    file already exists are left untouched, and files or chunks whose first
    band is entirely zero are skipped.  Relies on the module-level
    ``in_path``, ``out_path``, ``size`` and ``threshold``.

    Args:
        f: File name inside ``in_path``, expected to look like
            ``median_<integer>.tif``.

    Returns:
        None.  Any exception is caught and reported with ``print`` so that
        one bad file does not abort the rest of the batch.
    """
    try:
        # 'median_<id>.tif' -> '<id>.tif'
        f2 = f.replace('median_', '')

        # Validation only: a name that is not 'median_<integer>.tif' raises
        # ValueError here and is reported by the handler below.
        int(f2.replace('.tif', ''))

        # Every chunk cut from this file ends with the same '<id>.npy'
        chunk_suffix = f2.replace('.tif', '.npy')

        # Float array with the band axis moved to the end
        in_mat = rioxarray.open_rasterio(os.path.join(in_path, f)).to_numpy().astype(float)
        in_mat = np.moveaxis(in_mat, 0, 2)

        # Band 0 is treated as dNBR; nothing to do when it is all zero
        if is_matrix_all_zeros(in_mat[:, :, 0]):
            return

        mosaic = MightyMosaic.from_array(in_mat, (size, size), overlap_factor=1)

        # Drop the last row and column of tiles.
        # NOTE(review): rasters yielding a single tile along an axis appear
        # to raise after this slice (the 'i_begin (0) should be less than
        # i_end (0)' messages in the recorded output) - confirm whether such
        # rasters should really be dropped.
        mosaic = mosaic[:-1, :-1, :, :, :]

        for i in range(mosaic.shape[0]):
            for j in range(mosaic.shape[1]):
                out_name = os.path.join(out_path, f'{i}_{j}_{chunk_suffix}')

                # Never overwrite a chunk written by an earlier run
                if os.path.exists(out_name):
                    continue

                # Predictors: every band except the last
                in_mat3 = mosaic[i, j, :, :, :-1]

                # Label band, hard-coded as index 3
                # (presumably the last band of a 4-band raster - TODO confirm)
                target = mosaic[i, j, :, :, 3]

                # Zero NaN/inf labels BEFORE the integer cast: casting NaN to
                # int is undefined and np.isnan on an int array finds nothing.
                target = np.where(np.isfinite(target), target, 0).astype('int')

                # Anything that is not exactly 0 or 1 becomes 0
                target[~np.isin(target, [0, 1])] = 0

                # Missing predictor values are stored as -999
                in_mat3[np.isnan(in_mat3)] = -999

                dnbr = in_mat3[:, :, 0]

                # A chunk with no dNBR signal is not saved
                if is_matrix_all_zeros(dnbr):
                    continue

                if threshold:
                    # Clear the label wherever dNBR is negative
                    target[dnbr < 0] = 0

                # Predictor bands followed by the cleaned label band
                stacked = np.dstack([in_mat3, target])
                np.save(out_name, stacked)
                print(f"Processed {f}")
    except Exception as err:
        print(f"Error processing {f}: {err}")

# Parallel execution using Dask: wrap process_file once, build one lazy task
# per file, then compute them all together under a progress bar
lazy_process = dask.delayed(process_file)
tasks = [lazy_process(name) for name in files]

with ProgressBar():
    dask.compute(*tasks)


[#                                       ] | 2% Completed | 1.76 s msError processing median_493.tif: j_begin (0) should be less than j_end (0) (0 < 0 is False)
[#                                       ] | 4% Completed | 3.55 sError processing median_559.tif: i_begin (0) should be less than i_end (0) (0 < 0 is False)
[#####                                   ] | 12% Completed | 8.47 sError processing median_740.tif: i_begin (0) should be less than i_end (0) (0 < 0 is False)
[######                                  ] | 16% Completed | 10.82 sError processing median_1166.tif: j_begin (0) should be less than j_end (0) (0 < 0 is False)
[######                                  ] | 16% Completed | 11.04 sError processing median_403.tif: j_begin (0) should be less than j_end (0) (0 < 0 is False)
Error processing median_354.tif: j_begin (0) should be less than j_end (0) (0 < 0 is False)
[##########                              ] | 27% Completed | 16.93 sError processing median_511.tif: j_begin 

In [5]:
't'

't'

Now take those chunk files and save CSVs listing the file names for the 80/10/10 training, validation, and testing splits.

In [2]:

def listdir_fullpath(d):
    """Return every entry of directory ``d`` joined onto ``d``.

    The entries come back in whatever order ``os.listdir`` yields them.
    """
    entries = os.listdir(d)
    return [os.path.join(d, name) for name in entries]

#list of files
# file_names = listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_1985_sent_harm_subs_128')
#sorted first: os.listdir returns entries in arbitrary order, so without a
#fixed starting order the seeded shuffle below would not be reproducible
file_names = sorted(listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old_subs_0_128'))


#shuffle file_names
file_names = shuffle(file_names, random_state = 555)


print(len(file_names))

random.seed(555)

#get train at 80%
train_files = random.sample(file_names, k=round(len(file_names) * 0.8))

#validation files at 10%, drawn from what is left after training
#the remainder is built in file_names order rather than via list(set - set):
#set iteration order for strings changes between interpreter sessions, which
#would make the seeded sample pick different files on every run
train_set = set(train_files)
val_candidates = [name for name in file_names if name not in train_set]
val_files = random.sample(val_candidates, k=round(len(file_names) * 0.1))

#test files are everything not used for training or validation (about 10%)
held_out = train_set.union(val_files)
test_files = [name for name in file_names if name not in held_out]


#convert the lists to pandas dataframes
train = pd.DataFrame({'Files': train_files})
val = pd.DataFrame({'Files': val_files})
test = pd.DataFrame({'Files': test_files})

print(train.shape)
print(val.shape)
print(test.shape)

train.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old_training_files.csv')
val.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old_validation_files.csv')
test.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtbs_old_testing_files.csv')


59942
(47954, 1)
(5994, 1)
(5994, 1)
