In [1]:
# Mount Google Drive inside the Colab runtime. Every data path used later in
# this notebook lives under the '/content/gdrive' mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install scikeras
!pip install --upgrade dask dask_ml distributed

Collecting dask
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 8.9 MB/s 
Collecting distributed
  Using cached distributed-2022.2.0-py3-none-any.whl (837 kB)
Collecting pyyaml>=5.3.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 26.8 MB/s 
Installing collected packages: pyyaml, dask, distributed
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
  Attempting uninstall: dask
    Found existing installation: dask 2.12.0
    Uninstalling dask-2.12.0:
      Successfully uninstalled dask-2.12.0
  Attempting uninstall: distributed
    Found existing installation: distributed 2.4.0
    Uninstalling distributed-2.4.0:
      Successfully uninstalled distributed-2.4.0
Successfully installed dask-2022.2.0 distri

In [3]:
import os
import imageio
import distributed
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier

import dask
import dask.array as da
from dask.distributed import Client
from dask_ml.model_selection import HyperbandSearchCV

# Initialize the Dask client.
# No scheduler address is passed, so dask.distributed falls back to starting
# a local cluster in this runtime (see the dask.distributed.Client docs).
# The handle is kept in `client` for the rest of the notebook.
client = Client()

In [4]:
# Read in data
#
# Layout expected on Drive: one sub-folder per year under `data_dir`, each
# holding the same set of 40x40 raster tiles.

data_dir = '/content/gdrive/MyDrive/tif_data'

# The 2021 folder is used as the reference list of tile file names; every
# other year is read with these same names. Sorted so that the sample axis
# has a reproducible order (os.listdir returns entries in arbitrary order).
tif_lst = sorted(os.listdir(os.path.join(data_dir, '2021')))

array_lst = []
# Sort the year folders so that the time axis of the stacked array is in
# chronological order; the x/y split further down relies on consecutive
# frames being consecutive years.
# (assumes folder names are four-digit years, as '2021' is -- TODO confirm)
for year in sorted(os.listdir(data_dir)):
    # Skip the macOS Finder metadata file that may sit next to the year folders
    if year == '.DS_Store':
        continue

    # Build one lazy read per tile; nothing is loaded from disk yet
    lazy_raster_lst = [dask.delayed(imageio.imread)(os.path.join(data_dir, year, tif))
                       for tif in tif_lst]
    # Wrap each delayed read as a Dask array with the declared shape and dtype
    raster_lst = [da.from_delayed(lazy_raster, shape=(40, 40), dtype='int32')
                  for lazy_raster in lazy_raster_lst]
    # Axis 0 indexes the tiles of this year
    array_lst.append(da.stack(raster_lst, axis=0))

# Stack the years once, after the loop, along a new axis 1:
# (tiles, years, 40, 40). Doing this inside the loop rebuilt the array on
# every iteration and failed on an empty list when '.DS_Store' came first.
raw_dataset = da.stack(array_lst, axis=1)

In [5]:
# One-hot encode the data
#
# Each raster goes from a single value per pixel to three indicator channels,
# one per raw pixel value, in the order listed in `class_values`:
#   channel 0 -> raw value -1 (data is missing)
#   channel 1 -> raw value  0 (no deforestation occurred)
#   channel 2 -> raw value  1 (deforestation occurred)

class_values = da.array([-1, 0, 1])

# Append a trailing axis of length 1 so the comparison below broadcasts
# against the three class values.
expanded = da.stack([raw_dataset], axis=-1)

# The boolean comparison result is stored as 0/1 integers.
dataset = (class_values == expanded).astype('int32')

In [14]:
# Re-chunk the Dask array for model training
#
# Only the first (sample) axis is split, into pieces of `chunk_size`; every
# other axis (years, 40, 40, 3) is kept whole inside each chunk.
# `chunk_size` is reused by the hyper-parameter search cell further down.

chunk_size = 10
full_trailing_axes = dataset.shape[1:]
dataset = dataset.rechunk((chunk_size,) + full_trailing_axes)

In [7]:
# Get the size of the dataset
#
# Axis 0 is the raster (sample) axis, axis 1 the year (frame) axis, and the
# remaining axes describe a single one-hot encoded raster.

dataset_shape = dataset.shape
num_samples, num_frames = dataset_shape[:2]
raster_size = dataset_shape[2:]

print('Dataset size:', dataset_shape)
print('Number of rasters per year:', num_samples)
print('Number of years:', num_frames)
print('Raster size:', raster_size)

Dataset size: (20, 18, 40, 40, 3)
Number of rasters per year: 20
Number of years: 18
Raster size: (40, 40, 3)


In [15]:
# Extract features and outcome variable
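# With n frames indexed 0..n-1: x is frames 0 to n-2 (all but the last),
# and y is frames 1 to n-1 (all but the first), i.e. x shifted forward by one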

def split_x_y(data):
    """Split a frame sequence into next-frame prediction inputs and targets.

    Axis 1 of ``data`` is treated as the frame axis. The inputs drop the last
    frame and the targets drop the first one, so ``targets[:, i]`` is the
    frame that follows ``features[:, i]``.

    Args:
        data: Array with at least four dimensions, sliceable NumPy-style.

    Returns:
        A ``(features, targets)`` tuple, each with one frame fewer than
        ``data`` along axis 1.
    """
    last_frame = data.shape[1] - 1
    features = data[:, :last_frame, :, :]
    targets = data[:, 1:last_frame + 1, :, :]
    return features, targets

# X and y are module-level names: build_model() reads X.shape to size its
# input layer, and the search cell passes both to search.fit().
X, y = split_x_y(dataset)

In [30]:
# Define the CNN-LSTM architecture

def build_model(lr=0.1):
    """Build and compile the ConvLSTM next-frame prediction network.

    The input layer is sized from the module-level ``X`` array (its shape
    after the sample and frame axes), with the frame axis left as ``None`` so
    sequences of any length are accepted.

    Args:
        lr: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``keras.models.Model``.
    """
    num_filters = 64

    # Construct the input layer with no definite frame size
    inp = layers.Input(shape=(None, *X.shape[2:]))

    # Three ConvLSTM2D layers with shrinking kernels (5x5, 3x3, 1x1). Batch
    # normalization follows the first two only. "same" padding and
    # return_sequences=True keep the spatial size and the frame axis intact.
    x = layers.ConvLSTM2D(filters=num_filters,
                          kernel_size=(5, 5),
                          padding="same",
                          return_sequences=True,
                          activation="relu",)(inp)
    x = layers.BatchNormalization()(x)
    x = layers.ConvLSTM2D(filters=num_filters,
                          kernel_size=(3, 3),
                          padding="same",
                          return_sequences=True,
                          activation="relu",)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ConvLSTM2D(filters=num_filters,
                          kernel_size=(1, 1),
                          padding="same",
                          return_sequences=True,
                          activation="relu",)(x)

    # Conv3D with 3 filters maps the features back to three channels, the
    # same channel count as the one-hot encoded rasters.
    x = layers.Conv3D(filters=3,
                      kernel_size=(3, 3, 3),
                      activation="softmax",
                      padding="same")(x)

    # Build and compile the model
    # NOTE(review): a softmax output is paired with binary cross-entropy here;
    # for mutually exclusive one-hot classes categorical cross-entropy is the
    # usual choice -- confirm which is intended. Left unchanged.
    model = keras.models.Model(inp, x)
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=lr))

    # Return the compiled model; without this the function yields None and
    # the KerasClassifier wrapper has nothing to train.
    return model

In [34]:
# Cross-validate the model
#
# NOTE(review): the recorded run of this cell ended in a ValueError whose
# traceback was not captured in the notebook output -- root cause still to be
# investigated.
# NOTE(review): `momentum` is not a parameter of build_model as defined in
# this notebook, and max_iter evaluates to len(params) == 1 (the number of
# hyper-parameter names) -- confirm both are intended.
niceties = {'verbose': False}
model = KerasClassifier(
    build_fn=build_model,
    lr=None,
    momentum=None,
    **niceties,
)

# Search space: a single candidate learning rate
params = {'lr': [0.1]}

# The fraction of the data held out for validation is 1/chunk_size
search = HyperbandSearchCV(
    model,
    params,
    max_iter=len(params),
    test_size=1/chunk_size,
)
search.fit(X, y)

ValueError: ignored