In this notebook, I followed the blog post linked below up to the point where the model is implemented:

I had to add an extra 'batch' dimension to the images because the SegFormer model requires it. As a result, the matplotlib functions raise an error (just skip them for now). The error that matters is the one raised by the last cell: the proj layer expects its input tensor to have a last dimension (axis -1) of size 3, but the input it actually receives has shape (1, 134, 9, 128).

The issue seems to be related to the data preprocessing or input configuration for your model. To fix this, you should check the following:

Verify that the images in your dataset have the correct shape (height, width, channels). In this case, the channels should be 3, as it represents the RGB color channels.
Check the data preprocessing pipeline for your images. Ensure that the images are being loaded and resized correctly to the expected input shape of the model.
Verify the configuration of the proj layer or any other layer in your segformer model that might expect a specific input shape. Make sure the input shape is compatible with the model architecture.
Ensure that the input images are correctly passed to the model during training and validation.



https://www.analyticsvidhya.com/blog/2023/04/deep-learning-for-image-segmentation-with-tensorflow/

Some cells used: 
https://keras.io/examples/vision/segformer/

### This notebook is an earlier version of the one4all and stepwise notebooks.
### It uses 3-channel images and single-channel PNG masks (PNG masks with 3 channels are converted to 1 channel), and it works with the post-disaster images and the classification masks. Everything else is similar to the one4all and stepwise notebooks.

In [None]:
import os
#import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
import os

def collect_png_paths(directory, name_contains=None):
    """Return the sorted paths of all PNG files found under *directory*.

    Args:
        directory: Root folder; it is walked recursively with ``os.walk``.
            A folder that does not exist simply yields an empty list.
        name_contains: Optional substring a file name must contain to be
            kept (e.g. ``'_post'``). ``None`` keeps every PNG file.

    Returns:
        A list of file paths in ascending order.
    """
    paths = []
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            # The extension check is case-insensitive ('.png' and '.PNG').
            if not file_name.lower().endswith('.png'):
                continue
            if name_contains is not None and name_contains not in file_name:
                continue
            paths.append(os.path.join(root, file_name))
    # os.walk lists files in arbitrary order. Sorting makes the result
    # deterministic, so that image i and mask i can be paired by position.
    # NOTE(review): this assumes images and masks use parallel file names --
    # confirm against the dataset layout.
    return sorted(paths)


# Paths of the post-disaster images (file names containing '_post').
image_path = collect_png_paths(
    '/Users/gmeneses/DScourse/00_capstone/xView2_baseline_fork/xBD_last_subset_test_mask/guatemala-volcano/images',
    name_contains='_post',
)
print(len(image_path))

# Paths of all masks.
mask_path = collect_png_paths(
    '/Users/gmeneses/DScourse/00_capstone/xView2_baseline_fork/xBD_last_subset_test_mask/guatemala-volcano/masks'
)
print(len(mask_path))

     

12
12


In [None]:
def convert_mask_to_single_channel(mask_3_channels, unknown_value=255):
    """Convert a colour-coded mask into a single-channel class-id mask.

    Args:
        mask_3_channels: Array-like of shape (height, width, 3) holding RGB
            colours. A fourth (alpha) channel is tolerated and ignored.
        unknown_value: Label written for pixels whose colour is not one of
            the known category colours. It must fit into ``uint8``. The
            default of 255 is the value the previous ``-1`` wrapped to in a
            ``uint8`` array on older NumPy versions.

    Returns:
        A ``uint8`` array of shape (height, width, 1) with one class id per
        pixel.

    Raises:
        ValueError: If the input is not 3-dimensional or has fewer than
            three channels.
    """
    mask = np.asarray(mask_3_channels)
    if mask.ndim != 3 or mask.shape[-1] < 3:
        raise ValueError(
            "expected a mask of shape (height, width, 3), got shape "
            f"{mask.shape}"
        )
    # Only the RGB channels carry the class colour; drop alpha if present.
    rgb = mask[..., :3]
    height, width = rgb.shape[:2]

    # Colours representing each category (RGB values)
    category_colors = {
        (0, 0, 0): 0,        # Class 0 - Black (no building) or un-classified
        (255, 255, 255): 1,  # Class 1 - White (no-damage)
        (255, 0, 0): 2,      # Class 2 - Red (minor damage)
        (0, 255, 0): 3,      # Class 3 - Green (major damage)
        (0, 0, 255): 4,      # Class 4 - Blue (destroyed)
    }

    # Start with every pixel marked as unknown, then overwrite the pixels
    # matching each known colour. One vectorised comparison per category
    # replaces the former per-pixel Python loop.
    single_channel_mask = np.full(
        (height, width, 1), unknown_value, dtype=np.uint8
    )
    for color, category in category_colors.items():
        matches = np.all(rgb == np.asarray(color), axis=-1)
        single_channel_mask[matches, 0] = category

    return single_channel_mask

In [None]:
# Convert the PNG images and masks into tensors.
from PIL import Image

# Images: decoded as 3-channel uint8 tensors. No batch dimension is added
# here; batching is done later by the tf.data pipeline.
images = []
for path in tqdm(image_path):
    # read the raw file contents
    file = tf.io.read_file(path)
    # decode the PNG bytes into a (height, width, 3) tensor
    image = tf.image.decode_png(file, channels=3, dtype=tf.uint8)
    images.append(image)


# Masks: read with PIL and reduced from colour-coded RGB to one class-id
# channel.
masks = []
for path in tqdm(mask_path):
    # The context manager closes the file handle as soon as the pixels have
    # been copied into the array.
    with Image.open(path) as file:
        # Force three channels (mirrors `channels=3` used for the images) so
        # palette or RGBA PNGs also reach the converter as (height, width, 3).
        mask_3_channels = np.array(file.convert("RGB"))
    mask = convert_mask_to_single_channel(mask_3_channels)
    masks.append(tf.convert_to_tensor(mask))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 37.19it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:35<00:00,  2.95s/it]


In [None]:
# Print the shape of every loaded image
for im in images:
    print(im.shape)
#print(type(images[0]))

(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)
(1024, 1024, 3)


In [None]:
# plt.figure(figsize=(25,13))

# # Iterate over the images in the range 4-6
# for i in range(4,7):
#     # Create a subplot for each image
#     plt.subplot(4,6,i)
#     # Get the i-th image from the list
#     img = images[i]
#     # Show the image with a colorbar
#     plt.imshow(img)
#     plt.colorbar()
#     # Turn off the axis labels
#     plt.axis('off')

# # Display the figure
# plt.show()

In [None]:
# # Define a normalizer that can be applied while visualizing masks to have a consistency
# NORM = mpl.colors.Normalize(vmin=0, vmax=58)

# # plot masks
# plt.figure(figsize=(25,13))
# for i in range(4,7):
#     plt.subplot(4,6,i)
#     img = masks[i]
#     plt.imshow(img, cmap='jet', norm=NORM)
#     plt.colorbar()
#     plt.axis('off')
# plt.show()

In [None]:
# #functions to resize the images and masks 
# def resize_image(image):
#     # scale the image
#     image = tf.cast(image, tf.float32)
#     image = image/255.0
#     # resize image
#     image = tf.image.resize(image, (128,128))
#     return image

# def resize_mask(mask):
#     # resize the mask
#     mask = tf.image.resize(mask, (128,128))
#     mask = tf.cast(mask, tf.uint8)
#     return mask    



#X = [resize_image(i) for i in images]
#y = [resize_mask(m) for m in masks]

# Shallow copies of the loaded samples; no resizing is applied at this point.
X = list(images)
y = list(masks)
len(X), len(y)

(12, 12)

In [None]:
# Shape of the first image, then of the first mask.
for first_sample in (X[0], y[0]):
    print(first_sample.shape)

(1024, 1024, 3)
(1024, 1024, 1)


In [None]:
# #visualizing a resized image and respective mask
# # plot an image
# plt.imshow(X[11])
# plt.colorbar()
# plt.show()

# #plot a mask
# plt.imshow(y[11], cmap='jet')
# plt.colorbar()
# plt.show()

In [None]:
# Hold out 20 % of the samples for validation; the fixed random_state keeps
# the split reproducible between runs.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Build one tf.data dataset of (image, mask) pairs per split.
train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in ((train_X, train_y), (val_X, val_y))
)

#train_y = tf.data.Dataset.from_tensor_slices(train_y)
#val_y = tf.data.Dataset.from_tensor_slices(val_y)
#print(len(train_X) ,len(val_X),len(train_y),len(val_y))
# verify the shapes and data types
#train_X.element_spec, train_y.element_spec, val_X.element_spec, val_y.element_spec

In [None]:
# sample = next(iter(train_dataset))
# image, mask = sample[0], sample[1]

# # Inspect the shape and data type of the image and mask
# print("Image shape:", image.shape)  # E.g., (height, width, channels)
# print("Image data type:", image.dtype)  # E.g., float32

# print("Mask shape:", mask.shape)  # E.g., (height, width, 1)
# print("Mask data type:", mask.dtype)  # E.g., int64

Image shape: (1024, 1024, 3)
Image data type: <dtype: 'uint8'>
Mask shape: (1024, 1024, 1)
Mask data type: <dtype: 'uint8'>


In [None]:
#train_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(1024, 1024, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(1024, 1024, 1), dtype=tf.uint8, name=None))>

In [None]:
# # Augmentation functions ARE NOT implemented in this notebook

# # adjust brightness of image
# # don't alter in mask
# def brightness(img, mask):
#     img = tf.image.adjust_brightness(img, 0.1)
#     return img, mask

# # adjust gamma of image
# # don't alter in mask
# def gamma(img, mask):
#     img = tf.image.adjust_gamma(img, 0.1)
#     return img, mask

# # adjust hue of image
# # don't alter in mask
# def hue(img, mask):
#     img = tf.image.adjust_hue(img, -0.1)
#     return img, mask

# def crop(img, mask):
#     # crop both image and mask identically
#     img = tf.image.central_crop(img, 0.7)
#     # resize after cropping
#     img = tf.image.resize(img, (128,128))
#     mask = tf.image.central_crop(mask, 0.7)
#     # resize afer cropping
#     mask = tf.image.resize(mask, (128,128))
#     # cast to integers as they are class numbers
#     mask = tf.cast(mask, tf.uint8)
#     return img, mask
# # flip both image and mask identically
# def flip_hori(img, mask):
#     img = tf.image.flip_left_right(img)
#     mask = tf.image.flip_left_right(mask)
#     return img, mask

# # flip both image and mask identically
# def flip_vert(img, mask):
#     img = tf.image.flip_up_down(img)
#     mask = tf.image.flip_up_down(mask)
#     return img, mask

# # rotate both image and mask identically
# def rotate(img, mask):
#     img = tf.image.rot90(img)
#     mask = tf.image.rot90(mask)
#     return img, mask

In [None]:
# # zip images and masks
# #train = tf.data.Dataset.zip((train_X, train_y))
# #val = tf.data.Dataset.zip((val_X, val_y))

# # perform augmentation on train data only

# a = train_dataset.map(brightness)
# b = train_dataset.map(gamma)
# c = train_dataset.map(hue)
# d = train_dataset.map(crop)
# e = train_dataset.map(flip_hori)
# f = train_dataset.map(flip_vert)
# g = train_dataset.map(rotate)

# # concatenate every new augmented sets
# train_dataset = train_dataset.concatenate(a)
# train_dataset = train_dataset.concatenate(b)
# train_dataset = train_dataset.concatenate(c)
# train_dataset = train_dataset.concatenate(d)
# train_dataset = train_dataset.concatenate(e)
# train_dataset = train_dataset.concatenate(f)

In [None]:
def map_fn(image, mask):
    """Wrap an (image, mask) pair in a dict with named entries.

    Returns:
        A dict holding *image* under "image" and *mask* under
        "segmentation_mask".
    """
    keys = ("image", "segmentation_mask")
    return dict(zip(keys, (image, mask)))
# Turn the (image, mask) tuples of both splits into named dict elements.
named_dataset_train, named_dataset_val = (
    split.map(map_fn) for split in (train_dataset, val_dataset)
)



In [None]:
# Pull a single element to sanity-check what the named dataset yields.
sample = next(iter(named_dataset_train))
image = sample["image"]
mask = sample["segmentation_mask"]

# Report shape and dtype of the image, then of the mask.
print("Image shape:", image.shape)
print("Image data type:", image.dtype)

print("Mask shape:", mask.shape)
print("Mask data type:", mask.dtype)

Image shape: (1024, 1024, 3)
Image data type: <dtype: 'uint8'>
Mask shape: (1024, 1024, 1)
Mask data type: <dtype: 'uint8'>


In [None]:
from tensorflow.keras import backend

# Side length (in pixels) used when resizing images and masks.
image_size = 512
# Per-channel statistics subtracted from / divided into the image.
# NOTE(review): these look like the usual ImageNet mean/std for inputs
# scaled to [0, 1] -- confirm.
mean = tf.constant([0.485, 0.456, 0.406])
std = tf.constant([0.229, 0.224, 0.225])


def normalize(input_image, input_mask):
    """Standardise the image channel-wise; pass the mask through unchanged.

    Args:
        input_image: Image tensor whose last axis has three channels.
        input_mask: Mask tensor; returned as received (no label shift).

    Returns:
        Tuple ``(image, mask)`` where the image is float32 and equals
        ``(image - mean) / std``.
    """
    as_float = tf.image.convert_image_dtype(input_image, tf.float32)
    # Guard against a division by zero should `std` ever contain zeros.
    safe_std = tf.maximum(std, backend.epsilon())
    standardized = (as_float - mean) / safe_std
    return standardized, input_mask


def load_image(datapoint):
    """Preprocess one dataset element for the SegFormer model.

    Args:
        datapoint: Dict with an "image" tensor of shape (height, width, 3)
            and a "segmentation_mask" tensor of shape (height, width, 1).

    Returns:
        Dict with
        "pixel_values": float32 tensor of shape (3, image_size, image_size),
            scaled to [0, 1] and then standardised by `normalize`;
        "labels": int32 tensor of shape (image_size, image_size) holding
            class ids.
    """
    # Scale to [0, 1] BEFORE resizing. tf.image.resize returns float32, and
    # convert_image_dtype does not rescale float inputs, so converting after
    # the resize leaves the pixels in the 0-255 range and the standardised
    # output reaches values in the hundreds.
    input_image = tf.image.convert_image_dtype(datapoint["image"], tf.float32)
    input_image = tf.image.resize(input_image, (image_size, image_size))

    # Class ids are categorical: nearest-neighbour keeps them intact, whereas
    # bilinear interpolation blends neighbouring ids into meaningless
    # fractional labels.
    input_mask = tf.image.resize(
        datapoint["segmentation_mask"],
        (image_size, image_size),
        method="nearest",
    )

    input_image, input_mask = normalize(input_image, input_mask)
    # The model is fed channels-first images: (H, W, C) -> (C, H, W).
    input_image = tf.transpose(input_image, (2, 0, 1))
    # Drop only the trailing channel axis and store the labels as integers.
    labels = tf.cast(tf.squeeze(input_mask, axis=-1), tf.int32)
    return {"pixel_values": input_image, "labels": labels}

In [None]:
#setting the batch size
# BATCH = 4

# AT = tf.data.AUTOTUNE
# #buffersize
# BUFFER = 6

# STEPS_PER_EPOCH = 9//BATCH # total number of training samples / batch
# VALIDATION_STEPS = 3//BATCH # total number of validation samples / batch


# #caches the data in memory to speed up data loading - shuffles - batches
# train = (named_dataset_train
#          .cache()
#          .shuffle(BUFFER)
#          .map(load_image, num_parallel_calls=AT)
#          .batch(BATCH)
#          .prefetch(buffer_size=AT)
#          )

# val = (named_dataset_val
#        .map(load_image, num_parallel_calls=AT)
#        .batch(BATCH)
#        .prefetch(buffer_size=AT)
# )

In [None]:
auto = tf.data.AUTOTUNE
batch_size = 4

# Training pipeline: cache the raw samples, shuffle them, preprocess each
# one with `load_image`, then batch and prefetch.
train = named_dataset_train.cache().shuffle(batch_size * 10)
train = (
    train
    .map(load_image, num_parallel_calls=auto)
    .batch(batch_size)
    .prefetch(auto)
)

# Validation pipeline: same preprocessing, but no caching or shuffling.
val = named_dataset_val.map(load_image, num_parallel_calls=auto)
val = val.batch(batch_size).prefetch(auto)

In [None]:
# Show the training pipeline object (the notebook prints its element spec).
train

<_PrefetchDataset element_spec={'pixel_values': TensorSpec(shape=(None, 3, 512, 512), dtype=tf.float32, name=None), 'labels': TensorSpec(shape=(None, 512, 512), dtype=tf.float32, name=None)}>

From here on, I added my own code.


In [None]:
# load the model
# this part was taken from: https://keras.io/examples/vision/segformer/

from transformers import TFSegformerForSemanticSegmentation

model_checkpoint = "nvidia/mit-b0"

# Class names, listed in class-id order (id 0 first).
id2label = dict(
    enumerate(
        ["background", "no-damage", "minor-damage", "major-damage", "destroyed"]
    )
)
# Reverse lookup: class name -> class id.
label2id = {name: class_id for class_id, name in id2label.items()}
num_labels = len(id2label)

# Load the checkpoint with a segmentation head sized for `num_labels`
# classes; size mismatches against the checkpoint are ignored.
model = TFSegformerForSemanticSegmentation.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-02 11:18:00.890987: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x28d91a7c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-08-02 11:18:00.891128: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2023-08-02 11:18:01.013470: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




Some layers from the model checkpoint at nvidia/mit-b0 were not used when initializing TFSegformerForSemanticSegmentation: ['classifier']
- This IS expected if you are initializing TFSegformerForSemanticSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFSegformerForSemanticSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFSegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# this part was taken from: https://keras.io/examples/vision/segformer/
# lr = 0.00006
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# model.compile(optimizer=optimizer, loss=loss_function)
#

# Adam with a small learning rate. No `loss` is passed to compile();
# NOTE(review): this presumably relies on the loss the Hugging Face model
# computes internally from the "labels" entry -- confirm.
lr = 6e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(optimizer=optimizer)



In [None]:
# Print the layer overview and the parameter counts of the loaded model.
model.summary()

Model: "tf_segformer_for_semantic_segmentation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 segformer (TFSegformerMain  multiple                  3319392   
 Layer)                                                          
                                                                 
 decode_head (TFSegformerDe  multiple                  396549    
 codeHead)                                                       
                                                                 
Total params: 3715941 (14.18 MB)
Trainable params: 3715429 (14.17 MB)
Non-trainable params: 512 (2.00 KB)
_________________________________________________________________


In [None]:

import matplotlib.pyplot as plt



def display(display_list):
    """Show the given images side by side in a single row.

    Args:
        display_list: Up to three array-likes accepted by
            ``tf.keras.utils.array_to_img``. They are titled, in order,
            "Input Image", "True Mask" and "Predicted Mask".
    """
    titles = ("Input Image", "True Mask", "Predicted Mask")
    n_panels = len(display_list)

    plt.figure(figsize=(15, 15))
    for position, item in enumerate(display_list, start=1):
        # Subplot positions are 1-based, the titles 0-based.
        plt.subplot(1, n_panels, position)
        plt.title(titles[position - 1])
        plt.imshow(tf.keras.utils.array_to_img(item))
        plt.axis("off")
    plt.show()


# Preview the first sample of up to two validation batches.
for samples in val.take(2):
    # Images are stored channels-first; move the channels to the last axis
    # for plotting.
    sample_image = tf.transpose(samples["pixel_values"][0], (1, 2, 0))
    # Give the mask a trailing channel axis so it can be rendered as an image.
    sample_mask = tf.expand_dims(samples["labels"][0], -1)
    display([sample_image, sample_mask])

In [None]:
# this part was taken from: https://keras.io/examples/vision/segformer/

from IPython.display import clear_output


def create_mask(pred_mask):
    """Return the predicted class map of the first sample in a batch.

    Args:
        pred_mask: Batch of per-class scores; the arg-max is taken over
            axis 1 (assumes a (batch, classes, height, width) layout --
            TODO confirm against the model output).

    Returns:
        The arg-max class ids of the first batch entry with a trailing axis
        of size 1 appended.
    """
    class_ids = tf.math.argmax(pred_mask, axis=1)
    first_sample = class_ids[0]
    return first_sample[..., tf.newaxis]


def show_predictions(dataset=None, num=1):
    """Plot input image, true mask and predicted mask.

    Args:
        dataset: Batched dataset yielding dicts with "pixel_values"
            (channels-first images) and "labels". When ``None``, the
            module-level ``sample_image`` / ``sample_mask`` are used instead.
        num: Number of batches taken from *dataset*; only the first sample
            of each batch is plotted.
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of a dataset object.
    if dataset is not None:
        for sample in dataset.take(num):
            images, masks = sample["pixel_values"], sample["labels"]
            masks = tf.expand_dims(masks, -1)
            pred_masks = model.predict(images).logits
            # Channels-first -> channels-last for plotting.
            images = tf.transpose(images, (0, 2, 3, 1))
            display([images[0], masks[0], create_mask(pred_masks)])
    else:
        # `sample_image` is kept channels-last for plotting, while the model
        # is fed channels-first batches everywhere else: move the channels
        # back to the front and add the batch axis before predicting.
        pixel_values = tf.expand_dims(
            tf.transpose(sample_image, (2, 0, 1)), 0
        )
        # create_mask() needs the logits tensor, not the whole model output.
        pred_masks = model.predict(pixel_values).logits
        display([sample_image, sample_mask, create_mask(pred_masks)])


class DisplayCallback(tf.keras.callbacks.Callback):
    """Keras callback that redraws sample predictions after every epoch."""

    def __init__(self, dataset, **kwargs):
        """Store *dataset*; other keyword arguments go to the base class."""
        super().__init__(**kwargs)
        self.dataset = dataset

    def on_epoch_end(self, epoch, logs=None):
        """Clear the cell output, plot predictions and print the epoch."""
        clear_output(wait=True)
        show_predictions(self.dataset)
        # `epoch` is shifted by one for the printed message.
        finished_epoch = epoch + 1
        print("\nSample Prediction after epoch {}\n".format(finished_epoch))

# Train the Model

In [None]:
# `train` and `val` are the batched tf.data pipelines whose elements were
# preprocessed by `load_image`, so they are what fit() consumes; the raw
# train_X / val_X lists are neither resized nor normalized.
history = model.fit(
    train,
    validation_data=val,
    epochs=10,
    callbacks=[DisplayCallback(val)],
)


Epoch 1/10


In [None]:
#!pip install ipython
from IPython.display import clear_output
# `num` counts batches: the first sample of up to 5 validation batches is
# plotted (fewer if the dataset holds fewer batches).
show_predictions(val, 5)



[<tf.Tensor: shape=(512, 512, 3), dtype=float32, numpy=
 array([[[355.9607 , 420.95535, 333.75113],
         [329.75983, 394.16962, 307.08447],
         [326.4847 , 391.93747, 302.64   ],
         ...,
         [208.58078, 367.3839 , 175.97334],
         [193.29694, 342.83035, 161.5289 ],
         [202.03056, 358.45535, 164.86223]],
 
        [[361.41922, 432.11606, 339.30667],
         [357.0524 , 428.76785, 334.86224],
         [354.869  , 428.76785, 331.5289 ],
         ...,
         [199.84715, 347.29462, 168.19557],
         [212.94759, 377.42856, 179.30669],
         [207.48907, 370.73212, 172.64001]],
 
        [[371.24454, 447.74106, 348.1956 ],
         [382.16156, 461.1339 , 358.1956 ],
         [382.16156, 463.36606, 358.1956 ],
         ...,
         [208.58078, 359.5714 , 173.75113],
         [212.94759, 377.42856, 177.08446],
         [217.3144 , 389.70535, 179.30669]],
 
        ...,
 
        [[194.38864, 263.58926, 147.08446],
         [238.05676, 318.27676, 183.75113]