Here is a proposed model to classify images of rail into 2 classes: with or without an insulation joint. This is my first CNN model and I am interested in all kinds of feedback.

Insulation joints separate track circuits which aim at detecting and then locating the trains on the network. They play a key role to ensure safety when trains operate to prevent collision.

# Packages

In [None]:
import os
import random
import tqdm
from typing import Tuple

import numpy as np
import pandas as pd

import cv2
import matplotlib.pyplot as plot

from sklearn.metrics import confusion_matrix
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split

import tensorflow as tf

from IPython.display import SVG

# Parameters

Not all images have the same size: there are 2 different sizes. Since both sizes are quite big, a new image shape has been chosen, smaller than the 2 initial shapes, but not so small that details in the images (in particular the joints) are lost.

In [None]:
# Location of the image files and of the CSV file holding their labels.
image_dir_path = "/kaggle/input/insulation-joint-training-set-prorail/trainset_insulation_joint/images/"
label_csv_path = "/kaggle/input/insulation-joint-training-set-prorail/trainset_insulation_joint/labels.csv"

# Common shape every image is resized to before being fed to the model.
image_shape = (200,200)

# Get data

## Get dataframe of all images with their corresponding label

In [None]:
# Load the label table; the CSV file uses ";" as field separator.
data_df = pd.read_csv(label_csv_path, sep=";")

In [None]:
# Preview the first two rows of the label table.
data_df.head(2)

## Get height and width
Not all images have the same height and width. In order to keep every size represented in all datasets, the height and width of each image are computed first.

In [None]:
# Read every image once (as grayscale) and record its dimensions.
# The sizes are collected in lists and assigned in one go, which avoids
# scanning the whole dataframe with a boolean mask for each image.
image_heights = []
image_widths = []
for image_file in tqdm.tqdm(data_df["filepath"]):
    image_path = os.path.join(image_dir_path, image_file)
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread does not raise on a missing or unreadable file, it returns None
        raise FileNotFoundError(f"Unable to read image: {image_path}")
    height, width = image.shape
    image_heights.append(height)
    image_widths.append(width)
data_df["height"] = image_heights
data_df["width"] = image_widths

In [None]:
# Count the images for each (height, width) combination.
data_df[["height", "width"]].value_counts()

## Few examples
Positive and negative examples are displayed with the two different sizes. For every image, a new image with the new shape is also displayed in order to check if the joint is still visible after resizing.

In [None]:
# Draw 2 random images for every (label, height) combination, in the order
# negative/851, negative/1066, positive/851, positive/1066.
example_filepath_list = []
for example_label in ("n", "p"):
    for example_height in (851, 1066):
        is_candidate = (data_df["label"] == example_label) & (
            data_df["height"] == example_height
        )
        candidate_filepaths = list(data_df.loc[is_candidate, "filepath"].values)
        example_filepath_list.extend(random.sample(candidate_filepaths, 2))

In [None]:
# One row per example: original image on the left, resized image on the right.
f, ax = plot.subplots(len(example_filepath_list), 2)
f.subplots_adjust(0, 0, 1.5 * 2, 1.5 * len(example_filepath_list))
for index, example_filepath in enumerate(example_filepath_list):
    label = data_df.loc[data_df["filepath"] == example_filepath, "label"].values[0]
    image = cv2.imread(os.path.join(image_dir_path, example_filepath), 0)
    # The image is read as grayscale, so display it with a gray colormap
    ax[index, 0].imshow(image, cmap="gray")
    ax[index, 0].set_title(label)
    # image_shape is (height, width) whereas cv2.resize expects (width, height)
    image_resized = cv2.resize(
        image, (image_shape[1], image_shape[0]), interpolation=cv2.INTER_AREA
    )
    ax[index, 1].imshow(image_resized, cmap="gray")
    ax[index, 1].set_title(label)

## Split data into 3 datasets: train, dev and test

In [None]:
# Stratification key: label and image size, so that every dataset keeps the
# same proportion of labels and sizes
height_as_text = data_df["height"].astype(str)
width_as_text = data_df["width"].astype(str)
data_df["stratify"] = data_df["label"] + "_" + height_as_text + "_" + width_as_text
print(data_df["stratify"].value_counts())

# First split: test set versus train+dev
indices_train_dev, indices_test = train_test_split(
    data_df.index,
    stratify=data_df["stratify"],
    test_size=0.1,
    shuffle=True,
)

# Second split: dev set versus train
train_dev_df = data_df.loc[indices_train_dev, :]
indices_train, indices_dev = train_test_split(
    train_dev_df.index,
    stratify=train_dev_df["stratify"],
    test_size=0.1,
    shuffle=True,
)

# Record the dataset each image belongs to
data_df["dataset"] = ""
for dataset_name, dataset_indices in (
    ("test", indices_test),
    ("dev", indices_dev),
    ("train", indices_train),
):
    data_df.loc[dataset_indices, "dataset"] = dataset_name
print(data_df["dataset"].value_counts())

## Oversample the positive label of the train dataset

In [None]:
# First multiply all positive images equally
nb_class_n = (data_df.loc[indices_train, "label"] == "n").sum()
nb_class_p = (data_df.loc[indices_train, "label"] == "p").sum()
if nb_class_p == 0:
    raise ValueError("The train dataset has no positive image to oversample.")

# Number of extra whole copies of the positive images (the originals count once)
oversampling_of_p = int(np.floor(nb_class_n / nb_class_p) - 1)

# pd.concat raises on an empty list, so only replicate when at least one copy is needed
if oversampling_of_p > 0:
    data_df_oversampling = pd.concat(
        [
            data_df.loc[
                (data_df["dataset"] == "train") & (data_df["label"] == "p")
            ].copy()
        ]
        * oversampling_of_p
    )
    data_df_oversampling["dataset"] = "train_oversampling"
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    data_df = pd.concat([data_df, data_df_oversampling], ignore_index=True)

# Then complete to have the same number of positive and negative images in the
# training set by randomly choosing the positive images
is_training_row = data_df["dataset"].isin(["train", "train_oversampling"])
nb_class_n = (data_df.loc[is_training_row, "label"] == "n").sum()
nb_class_p = (data_df.loc[is_training_row, "label"] == "p").sum()

# Never ask random.sample for a negative number of images
nb_missing_p = max(int(nb_class_n - nb_class_p), 0)
additionnal_indices_for_oversampling = random.sample(
    data_df.loc[
        (data_df["dataset"] == "train") & (data_df["label"] == "p")
    ].index.to_list(),
    nb_missing_p,
)

data_df_oversampling = data_df.loc[additionnal_indices_for_oversampling].copy()
data_df_oversampling["dataset"] = "train_oversampling"
data_df = pd.concat([data_df, data_df_oversampling], ignore_index=True)
print(data_df[["dataset", "label"]].value_counts())

# Generate datasets of images with data augmentation for the training dataset

In [None]:
# Augmentation applied to training images only.
# Only a width shift is used: the joint is usually horizontally centred in the
# image, but it may be close to the top or the bottom, hence no height_shift_range.
augmentation_options = dict(
    rescale=1.0 / 255,
    width_shift_range=0.3,
    fill_mode="nearest",
    horizontal_flip=True,
    vertical_flip=True,
)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)

# Keyword arguments shared by all the generators
kwargs = dict(
    directory=image_dir_path,
    x_col="filepath",
    y_col="label",
    weight_col=None,
    target_size=image_shape,
    color_mode="grayscale",
    class_mode="binary",
    batch_size=256,
)

# Training rows: the original train images plus their oversampled copies
is_training_row = data_df["dataset"].isin(["train", "train_oversampling"])
train_generator = train_datagen.flow_from_dataframe(data_df.loc[is_training_row], **kwargs)

is_dev_row = data_df["dataset"] == "dev"
validation_generator = val_datagen.flow_from_dataframe(data_df.loc[is_dev_row], **kwargs)

# CNN model

## Architecture

I had trouble obtaining a model that is both parsimonious and reproducible.

Since all the images look quite similar (the rail is always vertical, the joint always horizontal) and there are only 2 classes, I first thought that a small model would be more appropriate. I managed to get quite good results. However, I had trouble getting similar results when retraining the model. Sometimes, the new performance was very bad compared to the best one. This made me think that the model was overfitting, but I was not able to get better results with smaller or more regularized models.

For this reason I decided to build a bigger model. Although there seems to be a bit of overfitting during training, this is the only way I found to get more reproducible models: two different training runs result in similar performance. I do not have a lot of experience in ML, but to me, a good architecture should be robust to the seed, hence the following proposal of architecture.

In [None]:
regu_coef = 0.004
dropout_cnn = 0
dropout_fc = 0.4


def _regularized_layer_options():
    """Return fresh L2 regularizers and He-normal initializer for one layer."""
    return {
        "kernel_regularizer": tf.keras.regularizers.l2(regu_coef),
        "bias_regularizer": tf.keras.regularizers.l2(regu_coef),
        "kernel_initializer": tf.keras.initializers.he_normal(),
    }


model = tf.keras.models.Sequential()

# First convolutional block, which also declares the input shape
model.add(
    tf.keras.layers.Conv2D(
        8,
        kernel_size=(3, 3),
        activation="relu",
        input_shape=(*image_shape, 1),
        **_regularized_layer_options(),
    )
)
model.add(tf.keras.layers.Dropout(dropout_cnn))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
model.add(tf.keras.layers.BatchNormalization())

# Two further identical blocks: Conv2D(8) -> Dropout -> MaxPool -> BatchNormalization
for _block_index in range(2):
    model.add(
        tf.keras.layers.Conv2D(
            8,
            kernel_size=(3, 3),
            activation="relu",
            **_regularized_layer_options(),
        )
    )
    model.add(tf.keras.layers.Dropout(dropout_cnn))
    model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(tf.keras.layers.BatchNormalization())

# Last convolutional block: only 2 filters, followed by the stronger dropout
model.add(
    tf.keras.layers.Conv2D(
        2,
        kernel_size=(3, 3),
        activation="relu",
        **_regularized_layer_options(),
    )
)
model.add(tf.keras.layers.Dropout(dropout_fc))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))

# Classification head
model.add(tf.keras.layers.Flatten())
model.add(
    tf.keras.layers.Dense(10, activation="relu", **_regularized_layer_options())
)
model.add(tf.keras.layers.Dropout(dropout_cnn))
model.add(
    tf.keras.layers.Dense(1, activation="sigmoid", **_regularized_layer_options())
)

# Adam optimizer with a smoothly (non-staircase) exponentially decaying learning rate
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=2000, decay_rate=0.96, staircase=False
)
tracked_metrics = [
    "accuracy",
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=tracked_metrics,
)

model.summary()

## Training

I only did a few epochs with a quite big learning rate (which partially explains the high variability on the dev set). More epochs with a smaller learning rate help to get a better model.

In [None]:
# Stop when the validation loss has not improved for 10 epochs and restore
# the weights of the best epoch.
early_stopping_options = {
    "monitor": "val_loss",
    "patience": 10,
    "mode": "min",
    "restore_best_weights": True,
}
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Train on the augmented generator, validating on the dev generator each epoch.
fit_options = {
    "epochs": 60,
    "validation_data": validation_generator,
    "callbacks": [callback],
    "workers": 12,
    "max_queue_size": 16,
}
model.fit(train_generator, **fit_options)

# Metrics

In [None]:
# Test images are only rescaled; shuffle is disabled so that the prediction
# order matches the order of test_generator.classes.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)
test_flow_options = {
    "directory": image_dir_path,
    "x_col": "filepath",
    "y_col": "label",
    "weight_col": None,
    "target_size": image_shape,
    "color_mode": "grayscale",
    "class_mode": "binary",
    "batch_size": 64,
    "shuffle": False,
}
is_test_row = data_df["dataset"] == "test"
test_generator = test_datagen.flow_from_dataframe(
    data_df.loc[is_test_row], **test_flow_options
)

In [None]:
# Compute the loss and the compiled metrics on the test generator.
model.evaluate(test_generator)

In [None]:
# Predicted probabilities, then hard labels obtained with a 0.5 threshold
y_prod_pred = model.predict(test_generator)
y_pred = np.copy(y_prod_pred)
is_predicted_positive = y_pred > 0.5
is_predicted_negative = y_pred <= 0.5
y_pred[is_predicted_positive] = 1
y_pred[is_predicted_negative] = 0
y_true = test_generator.classes

In [None]:
# Confusion matrix comparing y_true with the thresholded predictions y_pred.
confusion_matrix(y_true, y_pred)

In [None]:
# Calibration curve of the model (20 quantile bins) against the diagonal
# that a perfectly calibrated model would follow.
prob_true, prob_pred = calibration_curve(
    y_true, y_prod_pred, n_bins=20, strategy="quantile"
)
model_curve_label = "model calibration curve"
reference_curve_label = "calibrated curve"

f, ax = plot.subplots(1, 1)
ax.plot(prob_pred, prob_true, linestyle=":", marker="+", label=model_curve_label)
ax.plot([0, 1], [0, 1], label=reference_curve_label)
ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Mean true value")
ax.legend([model_curve_label, reference_curve_label])

# Explainability

## Activation maximization from the training set
Get the first 9 image parts of the training set that maximize the activation of each filter. This gives insights of the patterns detected by the model in order to predict the final class.

In [None]:
def locs_of_previous_layer(i, j, padding, kernel_size, strides):
    """
    Map an output location (i, j) of a sliding-window layer to the window it
    covers in the layer's input.

    Args:
        i: index along the first spatial axis of the layer output.
        j: index along the second spatial axis of the layer output.
        padding: "valid" or "same".
        kernel_size: window size along both spatial axes.
        strides: window strides along both spatial axes.

    Returns:
        Tuple (i_min, i_max, j_min, j_max) with inclusive bounds in the
        layer input. With "same" padding the lower bounds may be negative.

    Raises:
        ValueError: if padding is neither "valid" nor "same".
    """
    if padding == "valid":
        shift = (0, 0)
    elif padding == "same":
        # The window is centred on the location, hence the shift by half a kernel
        shift = tuple(-(size // 2) for size in kernel_size)
    else:
        raise ValueError(f"Unsupported padding {padding!r}.")
    i_start = i * strides[0] + shift[0]
    j_start = j * strides[1] + shift[1]
    return (
        i_start,
        i_start + kernel_size[0] - 1,
        j_start,
        j_start + kernel_size[1] - 1,
    )


def feature_map_loc_to_input_loc(model, layer_name, i, j):
    """
    Project the location (i, j) of the output of the layer named layer_name
    back onto the model input.

    The layers are walked backwards, starting at the named layer. Every Conv2D
    and MaxPooling2D layer met on the way widens the region; other layers leave
    it unchanged.

    Returns:
        Tuple (x_min, x_max, y_min, y_max) with inclusive bounds, x following
        the axis of i and y the axis of j.
    """
    row_first = row_last = i
    col_first = col_last = j
    reached_target = False
    for layer in reversed(model.layers):
        reached_target = reached_target or layer.name == layer_name
        if not reached_target:
            continue
        if isinstance(layer, tf.keras.layers.Conv2D):
            window = layer.kernel_size
        elif isinstance(layer, tf.keras.layers.MaxPooling2D):
            window = layer.pool_size
        else:
            continue
        # Lower bounds come from the first corner, upper bounds from the last one
        row_first, _, col_first, _ = locs_of_previous_layer(
            row_first, col_first, layer.padding, window, layer.strides
        )
        _, row_last, _, col_last = locs_of_previous_layer(
            row_last, col_last, layer.padding, window, layer.strides
        )
    return row_first, row_last, col_first, col_last

def get_filter_highest_activations(model, dev_generator, nb_images_per_filter=9):
    """
    Plot, for every filter of every Conv2D layer, the input patches that
    activate it the most.

    One pass is made over dev_generator. For each filter the highest
    activations are tracked together with the input patch they depend on.
    One figure per filter is then created.

    Args:
        model: model whose Conv2D layers are inspected.
        dev_generator: iterator with a length, yielding (image_batch, labels)
            where image_batch is indexed as (image, row, column, channel).
        nb_images_per_filter: number of patches kept and displayed per filter.

    Raises:
        ValueError: if nb_images_per_filter is lower than 1.
    """
    if nb_images_per_filter < 1:
        raise ValueError("nb_images_per_filter must be at least 1.")

    # Sub-model returning the output of every convolutional layer
    conv_layers_outputs = [
        (layer.name, layer.output)
        for layer in model.layers
        if isinstance(layer, tf.keras.layers.Conv2D)
    ]
    activations_model = tf.keras.models.Model(
        model.inputs,
        outputs=[layer_output for _, layer_output in conv_layers_outputs],
    )

    # max_activations[layer][channel] holds two parallel lists: the best
    # activations seen so far and the input patch behind each of them
    max_activations = {
        layer_id: {
            channel_id: {"activation": [], "input": []}
            for channel_id in range(layer_output.shape[-1])
        }
        for layer_id, (_, layer_output) in enumerate(conv_layers_outputs)
    }

    # Get max activations
    for _ in range(len(dev_generator)):
        image_batch, _labels = next(dev_generator)
        predictions = activations_model.predict(image_batch)
        if not isinstance(predictions, (list, tuple)):
            # A single output may be returned bare rather than in a list
            predictions = [predictions]
        for layer_id, (layer_name, _) in enumerate(conv_layers_outputs):
            for channel_id, best in max_activations[layer_id].items():
                channel_map = predictions[layer_id][:, :, :, channel_id]
                highest_values = tf.sort(tf.reshape(channel_map, [-1]))[
                    -nb_images_per_filter:
                ]
                for activation in highest_values:
                    value = activation.numpy()
                    is_full = len(best["activation"]) >= nb_images_per_filter
                    if is_full and value <= min(best["activation"]):
                        continue

                    # Take the first location if several share this value
                    loc = tf.where(channel_map == activation)[0]
                    x_min, x_max, y_min, y_max = feature_map_loc_to_input_loc(
                        activations_model,
                        layer_name,
                        loc[1].numpy(),
                        loc[2].numpy(),
                    )
                    if x_max - x_min != y_max - y_min:
                        # Checked before touching the lists so that activations
                        # and patches always stay aligned
                        break

                    if is_full:
                        index_to_remove = best["activation"].index(
                            min(best["activation"])
                        )
                        del best["activation"][index_to_remove]
                        del best["input"][index_to_remove]
                    best["activation"].append(value)
                    best["input"].append(
                        image_batch[
                            loc[0].numpy(), x_min : x_max + 1, y_min : y_max + 1, 0
                        ]
                    )

    # One figure per filter; add_subplot requires integer grid dimensions
    grid_size = int(np.ceil(np.sqrt(nb_images_per_filter)))
    for channels in max_activations.values():
        for best in channels.values():
            patches = best["input"]
            if not patches:
                continue
            size = patches[0].shape[0]
            fig = plot.figure(figsize=(size / 5, size / 5))
            for position, patch in enumerate(patches, start=1):
                ax = fig.add_subplot(grid_size, grid_size, position)
                ax.imshow(patch, cmap="gray")
                ax.get_xaxis().set_visible(False)
                ax.get_yaxis().set_visible(False)

In [None]:
# Generator over every original image (the oversampled duplicates are left
# out), only rescaled, in batches of 64.
dev_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)
kwargs = dict(
    directory=image_dir_path,
    x_col="filepath",
    y_col="label",
    weight_col=None,
    target_size=image_shape,
    color_mode="grayscale",
    class_mode="binary",
    batch_size=64,
)
is_original_row = data_df["dataset"] != "train_oversampling"
dev_generator = dev_datagen.flow_from_dataframe(data_df.loc[is_original_row], **kwargs)

get_filter_highest_activations(model, dev_generator)

## Grad CAM and guided grad CAM

Grad CAM and Guided Grad CAM are used on a single (positive) example to highlight their potential to explain the prediction of the model on a single instance.

In [None]:
def get_last_conv2d_name(model: tf.keras.models.Sequential) -> str:
    """
    Return the name of the Conv2D layer closest to the output of the model.

    Raises a ValueError when the model has no Conv2D layer.
    """
    conv_layer_names = [
        layer.name
        for layer in model.layers
        if isinstance(layer, tf.keras.layers.Conv2D)
    ]
    if not conv_layer_names:
        raise ValueError(
            "There is no Conv2D layer in the model. "
            "This is then not possible to retrieve the name of a Conv2D layer."
        )
    return conv_layer_names[-1]

    
def compute_grad_cam(
    model: tf.keras.models.Sequential, image: np.ndarray, image_shape: Tuple[int, int]
) -> np.ndarray:
    """
    Compute the Grad-CAM of a given image and model.

    Args:
        model: model containing at least one Conv2D layer.
        image: batch holding a single preprocessed image.
        image_shape: (height, width) the map is resized to.

    Returns:
        Non-negative Grad-CAM map of shape (height, width).
    """
    last_cnn_layer = get_last_conv2d_name(model)

    # Model returning both the last convolution output and the prediction
    grad_model = tf.keras.models.Model(
        model.inputs, [model.get_layer(last_cnn_layer).output, model.output]
    )

    # Gradient of the predicted probability w.r.t. the last convolution output
    with tf.GradientTape() as tape:
        last_conv_outputs, predictions = grad_model(image)
        y_pred_prob = predictions[:, 0]  # prediction for the target class
    grads = tape.gradient(y_pred_prob, last_conv_outputs)[
        0
    ]  # only one image to predict

    # Average the gradients spatially: one weight per filter
    weights = tf.reduce_mean(grads, axis=(0, 1))

    # Weighted sum of the filter maps according to the gradient importance
    last_conv_output = last_conv_outputs[0]  # only one image to predict
    grad_cam = tf.reduce_sum(tf.multiply(weights, last_conv_output), axis=-1)

    # Final Grad-CAM computation: resize and ReLU.
    # cv2.resize expects (width, height) whereas image_shape is (height, width).
    grad_cam = cv2.resize(grad_cam.numpy(), (image_shape[1], image_shape[0]))
    grad_cam = np.maximum(grad_cam, 0)
    return grad_cam

@tf.custom_gradient
def guided_relu(x):
    """
    ReLU whose gradient only flows where both the input and the incoming
    gradient are strictly positive.
    """

    def grad(dy):
        positive_gradient_mask = tf.where(dy > 0, 1.0, 0.0)
        positive_input_mask = tf.where(x > 0, 1.0, 0.0)
        combined_mask = tf.multiply(positive_gradient_mask, positive_input_mask)
        return tf.multiply(combined_mask, dy)

    return tf.nn.relu(x), grad

def compute_guided_backpropagation(
    model: tf.keras.models.Sequential, image: np.ndarray
) -> tf.Tensor:
    """
    Compute the guided backpropagation with the modified ReLU activation.

    The model is cloned (the original one is left untouched), every ReLU
    activation of the clone is replaced by guided_relu, and the gradient of
    the prediction with respect to the input image is returned.

    Args:
        model: trained model.
        image: batch holding a single preprocessed image.

    Returns:
        Gradient with respect to the first (and only) image of the batch.
    """
    model_with_guided_relu = tf.keras.models.clone_model(model)
    model_with_guided_relu.set_weights(model.get_weights())

    for layer in model_with_guided_relu.layers:
        if (
            hasattr(layer, "activation")
            and layer.activation == tf.keras.activations.relu
        ):
            layer.activation = guided_relu

    # A Variable is watched automatically by the gradient tape
    input_data = tf.Variable(tf.cast(image, tf.float32))
    with tf.GradientTape() as tape:
        predictions = model_with_guided_relu(input_data)
        loss = predictions[:, 0]
    guided_backpropagation = tape.gradient(loss, input_data)[
        0
    ]  # only one image to predict
    return guided_backpropagation

In [None]:
# Get image of test set and preprocess it
image_example_path = os.path.join(
    image_dir_path,
    data_df.loc[
        (data_df["dataset"] == "test")
        & (data_df["label"] == "p"),
    "filepath"
    ].values[0],
)
image_example = tf.keras.preprocessing.image.load_img(
    image_example_path, target_size=image_shape, color_mode="grayscale"
)
image_example = tf.keras.preprocessing.image.img_to_array(image_example)
image_example = np.expand_dims(image_example, axis=0)
image_example = image_example / 255

In [None]:
# Model output for the single preprocessed example image.
model.predict(image_example)

In [None]:
grad_cam = compute_grad_cam(model, image_example, image_shape)

# Normalize the Grad-CAM map to [0, 1]. A constant map (for instance all
# zeros after the ReLU) would otherwise produce a division by zero.
grad_cam_range = grad_cam.max() - grad_cam.min()
if grad_cam_range > 0:
    heatmap = (grad_cam - grad_cam.min()) / grad_cam_range
else:
    heatmap = np.zeros_like(grad_cam)
guided_backpropagation = compute_guided_backpropagation(model, image_example)

# Left: input image; middle: image weighted by the heatmap (Grad-CAM);
# right: guided backpropagation weighted by the heatmap (guided Grad-CAM)
fig = plot.figure(figsize=(8, 16))
ax = fig.add_subplot(1, 3, 1)
ax.imshow(image_example[0, :, :, 0], cmap="gray")
ax = fig.add_subplot(1, 3, 2)
ax.imshow((image_example[0, :, :, 0] * heatmap[:, :]), cmap="gray")
ax = fig.add_subplot(1, 3, 3)
ax.imshow((guided_backpropagation[:, :, 0] * heatmap[:, :]), cmap="gray")

## Limit of the trained model
The model has been trained on a dataset of images. Using this model on an image that differs from this dataset may be irrelevant. Below, I constructed an image that maximizes the activation of one of the two filters of the last convolutional layer. The prediction is 1 although it seems clear that the image does not include a joint.

In [None]:
def build_submodel(
    model: tf.keras.models.Sequential,
    conv_layer_name: str,
    add_input_over_all_reals: bool = True,
    change_relu: bool = True,
) -> tf.keras.models.Sequential:
    """
    Build a submodel whose output is the output of the named Conv2D layer.

    The original model is cloned first, so it is never modified.

    When change_relu is set, every ReLU activation of the submodel is replaced
    by a Leaky ReLU: the derivative of the ReLU is 0 for negative values, which
    may prevent a gradient-based optimization of the input from working.

    When add_input_over_all_reals is set, a sigmoid is placed in front of the
    submodel. The image pixel values lie between 0 and 1 after normalization,
    and the sigmoid maps any real input onto that interval.
    """
    # Work on a copy because activations may be changed
    cloned_model = tf.keras.models.clone_model(model)
    cloned_model.set_weights(model.get_weights())

    submodel = tf.keras.models.Model(
        cloned_model.inputs, cloned_model.get_layer(conv_layer_name).output
    )

    if change_relu:
        for layer in submodel.layers:
            if (
                hasattr(layer, "activation")
                and layer.activation == tf.keras.activations.relu
            ):
                layer.activation = tf.nn.leaky_relu

    if not add_input_over_all_reals:
        return submodel

    # New input defined over all reals, squashed into ]0,1[ by a sigmoid
    unbounded_input = tf.keras.Input(shape=(*image_shape, 1))
    squashed_input = tf.keras.layers.Activation(tf.keras.activations.sigmoid)(
        unbounded_input
    )
    return tf.keras.models.Model(unbounded_input, submodel(squashed_input))


def maximize_ouput(
    submodel: tf.keras.models.Sequential,
    channel_id: int,
    epochs: int,
    nb_activation_to_max: str = "single",
    learning_rate: float = 1.0,
) -> (np.array, list, list):
    """
    Run a gradient ascent on the input in order to maximize the output of the
    provided model for one channel.

    With nb_activation_to_max="single" only the activation at location (0, 0)
    is maximized; with "all" the mean over all locations is maximized. Any
    other value raises a ValueError.

    Returns the optimized input (cropped to the region of interest), the
    history of the maximized activation and the history of the gradient norm.
    """
    # Random normal noise on R, as a float32 Variable so that it can be differentiated
    noise = np.random.normal(size=(1, *image_shape, 1))
    input_data = tf.Variable(tf.cast(noise, tf.float32))

    if nb_activation_to_max not in ("single", "all"):
        raise ValueError(f"Unknown input argument {nb_activation_to_max}.")
    single_activation = nb_activation_to_max == "single"

    # Largest index to keep in the returned input
    max_size = 0 if single_activation else input_data.shape[1]

    mean_activation_history = []
    grad_history = []
    for _ in range(epochs):
        with tf.GradientTape() as tape:
            output = submodel(input_data)
            if single_activation:
                targeted_output = output[:, 0, 0, channel_id]
            else:
                targeted_output = output[:, :, :, channel_id]
            mean_activation = tf.reduce_mean(targeted_output)
            mean_activation_history.append(mean_activation)
        grads = tape.gradient(mean_activation, input_data)
        grad_norm = tf.norm(grads)
        if grad_norm < 1e-5:
            print("Optimization stopped since gradient is almost flat.")
            break
        grad_history.append(grad_norm)

        # Gradient ascent step
        input_data.assign_add(grads * learning_rate)

        if single_activation:
            # Largest index with a non-zero gradient seen so far
            max_size_tmp = tf.math.reduce_max(tf.where(grads[0, :, :, 0]))
            max_size = max(max_size_tmp, max_size)

        if len(mean_activation_history) >= 5 and mean_activation_history[-1] < min(
            mean_activation_history[-5:-2]
        ):
            print("Optimization stopped since activation did not increase.")
            break

    cropped_input = input_data[0, : max_size + 1, : max_size + 1, 0]
    return cropped_input, mean_activation_history, grad_history

In [None]:
# Target the last convolutional layer. Its name is looked up instead of being
# hard-coded because Keras increments the auto-generated layer names
# ("conv2d_3", "conv2d_7", ...) every time the architecture cell is re-run.
submodel = build_submodel(
    model,
    get_last_conv2d_name(model),
    add_input_over_all_reals=True,
    change_relu=True,
)

# Gradient ascent on the input to maximize the activation of filter 1
input_value_on_real, activation_history, grad_history = maximize_ouput(
    submodel,
    1,
    1000,
    nb_activation_to_max="single",
    learning_rate=0.01,
)

In [None]:
# Black image on which the optimized 38x38 pattern is pasted three times:
# top-left corner, just below it, and just to its right.
optimized_patch = tf.math.sigmoid(input_value_on_real)
input_data = np.zeros(shape=(1, *image_shape, 1))
for row_start, col_start in ((0, 0), (38, 0), (0, 38)):
    input_data[
        0, row_start : row_start + 38, col_start : col_start + 38, 0
    ] = optimized_patch


# Display the image with the prediction of the model as title
kwargs = {"vmin": 0, "vmax": 1}
fig = plot.figure(figsize=(6, 6))
ax = fig.add_subplot(1, 1, 1)
ax.imshow(input_data[0, :, :, 0], cmap="gray", **kwargs)
fig.suptitle(f"{model.predict(input_data)[0][0]:.2%}")
plot.show()