Read in packages

In [1]:
import pandas as pd
import os
import numpy as np
import xarray as xr
import rioxarray
import glob
import random
import geopandas as gpd
from sklearn.utils import shuffle
from MightyMosaic import MightyMosaic


First take the input tif files and chunk them to sizes of 128x128 for the neural network.  

In [2]:
"""arguments are path to .tif files to be chunked, out_path to save files, chunk size, and if desired a threshold"""        
#in_path to tif files
in_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_proj'

#out_path which will be manipulated based on parameters below
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/Russia/l8_sent_collection2_proj_nbac'
if not os.path.isdir(out_path):
    os.makedirs(out_path)

#right now everything is set to use 0 dnbr threshold
size = 128
threshold = True

if threshold == True:

    out_path = out_path + '_subs_0_' + str(size)
    if not os.path.isdir(out_path):
        os.makedirs(out_path)

if threshold == False:

    out_path = out_path + '_subs_' + str(size)
    if not os.path.isdir(out_path):
        os.makedirs(out_path)

files = os.listdir(in_path)

#loop through files
for f in files:

    if f.endswith('.tif'):
        
        #file name
        f2 = f.replace('median_', '')
        
        #ID of file
        f_id = int(f2.replace('.tif', ''))
        
      
        #in file
        try:

            #read in file
            in_mat = rioxarray.open_rasterio(os.path.join(in_path, f))

            
            #now turn in mat to numpy, and in mtbs to numpy
            in_mat = in_mat.to_numpy().astype(float)


            #convert to band last
            in_mat = np.moveaxis(in_mat, 0, 2) 

            #mosaic
            mosaic = MightyMosaic.from_array(in_mat, (size,size), overlap_factor=1) 
            
            #take off last dimensions of mosaic which give edge effects as they are filled with no data
            mosaic = mosaic[:-1, :-1, :, :, :]


            #first two dimensions are number of chunks, 3 and 4 are size (256, 266) and last is nbands
            for i in range(mosaic.shape[0]):

                for j in range(mosaic.shape[1]):

                    fname = os.path.join(str(i) + '_' +  str(j)+ '_' + f2.replace('.tif', '.npy'))
                    out_name = os.path.join(out_path, fname)

                    if os.path.exists(out_name) == False:

                        in_mat3 = mosaic[i, j, :, :, :-1]

                        target = mosaic[i, j, :, :, 9]

                        #turn nan to -999
                        in_mat[np.isnan(in_mat)] = -999

                        target = target.astype('int')

                        target[target <0 ] = 0
                        target[target >1 ] = 0

                        #
                        target[~np.isin(target, [0,1])] = 0

                        #turn nan to 0
                        target[np.isnan(target)] = 0

                        #if the target is all 0 don't train on it
                        # if np.all(target == 0) == False:

                        #now get dnbr which is the 6th band
                        dnbr = in_mat3[:, :, 6]

                        #if threshold apply dnbr threshold
                        target[dnbr < 0] = 0

                        e = np.dstack([in_mat3, target])

                        np.save(out_name, e)

                        print(f)

        except:
            pass

median_10026.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10028.tif
median_10029.tif
median_10027.tif
median_10030.tif
median_10030.tif
median_10030.tif
median_10030.tif
median_10030.tif
median_10030.tif
median_10039.tif
median_10039.tif
median_10039.tif
median_10039.tif
median_10041.tif
median_10035.tif
median_10036.tif
median_10036.tif
median_10036.tif
median_10036.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10032.tif
median_10044.tif
median_10085.tif
median_10085.tif
median_10059.tif
median_10059.tif
median_10059.tif
median_10059.tif
median_10063.tif
median_10063.tif
median_10063.tif
median_10063.tif
median_10038.tif
median_10038.tif
median_10038.tif
median_10038.tif
median_10083.tif
median_10045.tif
median_10045.tif
median_10045.tif
median_10045.tif
median_10045.tif
median_10045.t

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
't'

't'

Now take those files and save a csv with file names for the 80/10/10 training, validation, testing parts

In [3]:

def listdir_fullpath(d):
    return [os.path.join(d, f) for f in os.listdir(d)]
#list of files
# file_names = listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/nbac_1985_sent_harm_subs_128')
file_names = listdir_fullpath('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_proj_mtbs_subs_0_128')


#shuffle file_names
file_names = shuffle(file_names, random_state = 555)


print(len(file_names))

random.seed(555)
# #get train at 80%

train_files = random.sample(file_names, k=round(len(file_names) * 0.8))

#files at 10%
val_files = list(set(file_names) - set(train_files))
val_files = random.sample(val_files, k=round(len(file_names) * 0.1))

#combine traini and val
temp = train_files + val_files
#get test files at 10%
test_files = list(set(file_names) - set(temp))


#convert the lists to pandas dataframes
train = pd.DataFrame({'Files': train_files})
val = pd.DataFrame({'Files': val_files})
test = pd.DataFrame({'Files': test_files})

print(train.shape)
print(val.shape)
print(test.shape)

train.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_proj_mtbs_0_128_unburned_training_files.csv')
val.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_proj_mtbs_0_128_unburned_validation_files.csv')
test.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_proj_mtbs_0_128_unburned_testing_files.csv')


55776
(44621, 1)
(5578, 1)
(5577, 1)


In [45]:
print(train)

                                                   Files
0      /explore/nobackup/people/spotter5/cnn_mapping/...
1      /explore/nobackup/people/spotter5/cnn_mapping/...
2      /explore/nobackup/people/spotter5/cnn_mapping/...
3      /explore/nobackup/people/spotter5/cnn_mapping/...
4      /explore/nobackup/people/spotter5/cnn_mapping/...
...                                                  ...
43612  /explore/nobackup/people/spotter5/cnn_mapping/...
43613  /explore/nobackup/people/spotter5/cnn_mapping/...
43614  /explore/nobackup/people/spotter5/cnn_mapping/...
43615  /explore/nobackup/people/spotter5/cnn_mapping/...
43616  /explore/nobackup/people/spotter5/cnn_mapping/...

[43617 rows x 1 columns]


In [7]:
t = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/l8_sent_collection2_crop_0_256_training_files.csv')
t.shape

(98246, 2)

In [5]:
t

NameError: name 't' is not defined

In [19]:
t =  pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/mtn_negative_images_crop_subs_128_validation_files.csv')['Files'].tolist()

s = [i.split('/')[-2].split('_')[-1] for i in t]
np.unique(s)

array(['128'], dtype='<U3')

In [None]:
import pandas as pd
import os
import numpy as np
import xarray as xr
import rioxarray
import glob
import PIL.Image
import random
from sklearn.utils import shuffle
from MightyMosaic import MightyMosaic
import matplotlib.pyplot as plt

in_mat = rioxarray.open_rasterio(os.path.join(in_path, f)).to_numpy()

#convert to band last
in_mat = np.moveaxis(in_mat, 0, 2) 

pl
            


In [6]:
import numpy as np
from MightyMosaic import MightyMosaic

matrix = np.random.rand(250, 250)

size = 128

#mosaic
mosaic = MightyMosaic.from_array(matrix, (size,size), overlap_factor=1) 
print(mosaic.shape)



(2, 2, 128, 128)


In [8]:
 #first two dimensions are number of chunks, 3 and 4 are size (256, 266) and last is nbands
for i in range(mosaic.shape[0]):

    for j in range(mosaic.shape[1]):
        
        print(i, j)

0 0
0 1
1 0
1 1
