In [2]:
import os
import pathlib
from glob import glob

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
import pickle

import tensorflow as tf
import tensorflow_datasets as tfdata
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
from tensorflow.python.saved_model import signature_constants

from tensorflow.keras import Model
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import *
from tensorflow.keras.losses import SparseCategoricalCrossentropy

import cv2


import mlflow
from mlflow.tracking import MlflowClient
from mlflow.exceptions import MlflowException
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, TensorSpec

from prefect import flow, task
from prefect.task_runners import SequentialTaskRunner

# Print the library versions and the GPUs TensorFlow can see, so the
# environment a run used is recorded alongside its output.
print(f'Pandas:{pd.__version__}, Numpy:{np.__version__}, Tensorflow:{tf.__version__}')
# Disabled switch: presumably hides all CUDA devices so TensorFlow runs on CPU only.
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
print('Device:', tf.config.list_physical_devices('GPU'))
print("----Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Pandas:1.3.4, Numpy:1.21.4, Tensorflow:2.11.0
Device: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
----Num GPUs Available:  1


In [3]:
def mlflow_setup(tracking_uri="sqlite:////home/pmspraju/tracking-server/mlflow.db"):
    """Point MLflow at a tracking store and return a client bound to it.

    Args:
        tracking_uri: URI of the MLflow tracking backend. The default is the
            SQLite database this notebook was originally written against;
            pass another URI to run the notebook on a different machine.

    Returns:
        An ``MlflowClient`` constructed with ``tracking_uri``.
    """
    # Set the process-wide URI too, so module-level ``mlflow.*`` calls and the
    # returned client talk to the same store.
    mlflow.set_tracking_uri(tracking_uri)
    return MlflowClient(tracking_uri=tracking_uri)

In [8]:
@task
def create_mlflow_experiment(experiment_name):
    """Create the named MLflow experiment, or select it if it already exists.

    Args:
        experiment_name: Name of the experiment to create or reuse.

    Returns:
        The experiment id: the value returned by ``mlflow.create_experiment``
        for a new experiment, otherwise the id of the existing experiment
        selected with ``mlflow.set_experiment``.
    """
    try:
        # NOTE(review): a freshly created experiment is not made the active
        # one here; only the "already exists" path calls set_experiment.
        # Confirm whether callers rely on the returned id instead.
        experiment_id = mlflow.create_experiment(
            experiment_name,
            #artifact_location=Path.cwd().joinpath("mlruns").as_uri(),
            artifact_location='//home/pmspraju/tracking-server/mlruns/',
            tags={"version": "v1", "priority": "P1"},
        )
    except MlflowException:
        # Catch only MLflow's own error. The previous
        # ``except Exception as MlflowException`` swallowed every exception
        # (it merely re-bound the name) and reported it as "exists".
        print("Experiment exists")
        experiment = mlflow.set_experiment(experiment_name)
        # Examine the experiment details.
        print("Experiment_id: {}".format(experiment.experiment_id))
        print("Name: {}".format(experiment.name))
        print("Artifact Location: {}".format(experiment.artifact_location))
        print("Tags: {}".format(experiment.tags))
        print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
        print("Last Updated timestamp: {}".format(experiment.last_update_time))
        experiment_id = experiment.experiment_id

    return experiment_id

In [9]:
# Normalize one (image, mask) pair: divide the image by 255 and shift the mask
# labels down by one, so classes numbered 1 through 3 become 0 through 2.
def normalize(input_image, input_mask):
    """Scale an image to float32 and make mask labels zero-based.

    Args:
        input_image: Image values; cast to ``tf.float32`` and divided by 255.0
            (assumes 8-bit pixel values, giving a 0..1 range).
        input_mask: Mask whose labels start at 1.

    Returns:
        Tuple ``(image, mask)`` with the scaled image and ``input_mask - 1``.
        Neither argument is modified.
    """
    input_image = tf.cast(input_image, tf.float32) / 255.0
    # Subtract out of place: ``input_mask -= 1`` would silently modify the
    # caller's buffer when a NumPy array is passed instead of a tensor.
    input_mask = input_mask - 1
    return input_image, input_mask

In [10]:
# Load one example from a TensorFlow dataset element. Both the image and the
# mask are resized to 256x256. When train is True, the pair is mirrored
# left-to-right with probability 0.5 as augmentation. Finally the pair is
# passed through normalize().
@tf.function
def load_image(dataset_element, train=True):
    """Turn a dataset element into a normalized ``(image, mask)`` pair.

    Args:
        dataset_element: Mapping with ``'image'`` and ``'segmentation_mask'``
            entries.
        train: When true, randomly flip the image and mask together.

    Returns:
        Tuple ``(input_image, input_mask)`` as returned by ``normalize``;
        both are float32 and 256x256 in their spatial dimensions.
    """
    resized_image = tf.image.resize(dataset_element['image'], (256, 256))
    # Masks hold class ids, not intensities: the default bilinear resize
    # blends neighbouring ids into fractional, meaningless labels, so use
    # nearest-neighbour sampling instead.
    resized_mask = tf.image.resize(dataset_element['segmentation_mask'],
                                   (256, 256),
                                   method='nearest')
    # Nearest-neighbour keeps the input dtype; cast so the mask is still
    # float32, as it was with the previous bilinear resize.
    resized_mask = tf.cast(resized_mask, tf.float32)

    if train:
        # Draw the coin with a TensorFlow op. ``np.random.uniform()`` is plain
        # Python, so inside a tf.function it runs once at trace time and the
        # same flip decision would then be baked in for every element.
        do_flip = tf.random.uniform(()) > 0.5
        resized_image = tf.cond(do_flip,
                                lambda: tf.image.flip_left_right(resized_image),
                                lambda: resized_image)
        resized_mask = tf.cond(do_flip,
                               lambda: tf.image.flip_left_right(resized_mask),
                               lambda: resized_mask)

    input_image, input_mask = normalize(resized_image, resized_mask)

    return input_image, input_mask

In [36]:
# UNet bundles everything needed to build, train and evaluate the U-Net.
class UNet(object):
    """Keras U-Net for per-pixel classification, with training and evaluation helpers.

    Attributes are set in ``__init__``: ``input_size``, ``output_channels`` and
    ``model`` (the compiled Keras model).
    """

    # output_channels is, by default, 3, because each pixel can be categorized into one of three classes.
    def __init__(self, input_size=(256, 256, 3), output_channels=3):
        """Build and compile the network.

        Args:
            input_size: Shape of one input image, passed to ``Input``.
            output_channels: Number of channels (classes) in the output map.
        """
        self.input_size = input_size
        self.output_channels = output_channels
        self.model = self._create_model()
        # The last layer has no activation, so the loss consumes raw logits.
        loss = SparseCategoricalCrossentropy(from_logits=True)
        self.model.compile(optimizer=RMSprop(), loss=loss, metrics=['accuracy'])

    # A stride-2 convolution that can be (optionally) batch normalized and that's activated with LeakyReLU.
    @staticmethod
    def _downsample(filters, size, batch_norm=True):
        """Return a ``Sequential`` block: Conv2D(stride 2) -> [BatchNorm] -> LeakyReLU.

        Args:
            filters: Number of convolution filters.
            size: Convolution kernel size.
            batch_norm: Whether to insert ``BatchNormalization`` after the convolution.
        """
        initializer = tf.random_normal_initializer(0.0, 0.02)
        layers = Sequential()

        layers.add(Conv2D(filters=filters,
                          kernel_size=size,
                          strides=2,
                          padding='same',
                          kernel_initializer=initializer,
                          use_bias=False))

        if batch_norm:
            layers.add(BatchNormalization())

        layers.add(LeakyReLU())

        return layers

    # Expands its input through a stride-2 transposed convolution, which is batch normalized
    # and ReLU activated (optionally with a dropout layer in between to prevent overfitting).
    # Static, like _downsample, because it never touches instance state.
    @staticmethod
    def _upsample(filters, size, drop_out=False):
        """Return a ``Sequential`` block: Conv2DTranspose(stride 2) -> BatchNorm -> [Dropout] -> ReLU.

        Args:
            filters: Number of transposed-convolution filters.
            size: Kernel size.
            drop_out: Whether to insert ``Dropout(rate=0.5)`` before the activation.
        """
        init = tf.random_normal_initializer(0.0, 0.02)

        layers = Sequential()

        layers.add(Conv2DTranspose(filters=filters,
                                   kernel_size=size,
                                   strides=2,
                                   padding='same',
                                   kernel_initializer=init,
                                   use_bias=False))

        layers.add(BatchNormalization())

        if drop_out:
            layers.add(Dropout(rate=0.5))

        layers.add(ReLU())

        return layers

    # The encoding part of the network is a stack of downsampling blocks;
    # the decoding part is a series of upsampling blocks joined to the encoder by skip connections.
    def _create_model(self):
        """Assemble the encoder/decoder graph and return it as a Keras ``Model``.

        Eight downsampling blocks feed seven upsampling blocks; each
        upsampling output is concatenated with the matching encoder output.
        A final stride-2 transposed convolution emits ``output_channels``
        channels without an activation.
        """
        down_stack = [self._downsample(64, 4, batch_norm=False)]

        for filters in (128, 256, 512, 512, 512, 512, 512):
            down_block = self._downsample(filters, 4)
            down_stack.append(down_block)

        up_stack = []
        for _ in range(3):
            up_block = self._upsample(512, 4, drop_out=True)
            up_stack.append(up_block)

        for filters in (512, 256, 128, 64):
            up_block = self._upsample(filters, 4)
            up_stack.append(up_block)

        inputs = Input(shape=self.input_size)

        x = inputs

        skip_layers = []

        for down in down_stack:
            x = down(x)
            skip_layers.append(x)

        # The deepest encoder output is the decoder's input, not a skip
        # connection; the remaining ones are consumed deepest-first.
        skip_layers = reversed(skip_layers[:-1])

        for up, skip_connection in zip(up_stack, skip_layers):
            x = up(x)
            x = Concatenate()([x, skip_connection])

        init = tf.random_normal_initializer(0.0, 0.02)
        output = Conv2DTranspose(filters=self.output_channels,
                                 kernel_size=3,
                                 strides=2,
                                 padding='same',
                                 kernel_initializer=init)(x)

        return Model(inputs, outputs=output)

    @staticmethod
    def _plot_model_history(model_history, metric, ylim=None):
        """Plot one metric of a training history and save it as ``<metric>.png``.

        Args:
            model_history: History object handed to ``HistoryPlotter.plot``.
            metric: Metric name; also used for the title and the file name.
            ylim: Y-axis limits; ``[0, 1]`` when omitted.
        """
        # The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
        # Matplotlib 3.6 and the old names were removed later, so use whichever
        # name this installation provides (the plot is still produced if neither is).
        for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        plotter = tfdocs.plots.HistoryPlotter()
        plotter.plot({'Model': model_history}, metric=metric)
        plt.title(f'{metric.upper()}')

        if ylim is None:
            plt.ylim([0, 1])
        else:
            plt.ylim(ylim)

        plt.savefig(f'{metric}.png')
        plt.close()

    def train(self, train_dataset, epochs, steps_per_epoch, validation_dataset, validation_steps):
        """Fit the model, save loss/accuracy plots, and return the fit history.

        Args:
            train_dataset: Training data passed to ``model.fit``.
            epochs: Number of epochs.
            steps_per_epoch: Training steps per epoch.
            validation_dataset: Validation data passed to ``model.fit``.
            validation_steps: Validation steps per epoch.

        Returns:
            The object returned by ``model.fit`` (previously discarded).
        """
        hist = self.model.fit(train_dataset,
                              epochs=epochs,
                              steps_per_epoch=steps_per_epoch,
                              validation_steps=validation_steps,
                              validation_data=validation_dataset)

        self._plot_model_history(hist, 'loss', [0., 2.0])
        self._plot_model_history(hist, 'accuracy')

        return hist

    @staticmethod
    def _process_mask(mask):
        """Convert a mask tensor into a uint8 RGB image for saving.

        Values are multiplied by 127.5 before the uint8 cast, so labels
        0/1/2 would render as 0/127/255.
        """
        mask = (mask.numpy() * 127.5).astype('uint8')
        mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB)

        return mask

    def _save_image_and_masks(self, image, ground_truth_mask, prediction_mask, image_id):
        """Write ``mosaic_<image_id>.jpg``: image, ground truth and prediction side by side."""
        image = (image.numpy() * 255.0).astype('uint8')
        gt_mask = self._process_mask(ground_truth_mask)
        pred_mask = self._process_mask(prediction_mask)

        mosaic = np.hstack([image, gt_mask, pred_mask])
        # OpenCV writes BGR; the mosaic was assembled as RGB.
        mosaic = cv2.cvtColor(mosaic, cv2.COLOR_RGB2BGR)

        cv2.imwrite(f'mosaic_{image_id}.jpg', mosaic)

    @staticmethod
    def _create_mask(prediction_mask):
        """Reduce batched per-class scores to a label map for the first batch item.

        Takes the argmax over the last axis, re-adds a trailing channel axis,
        and returns element 0 of the batch.
        """
        prediction_mask = tf.argmax(prediction_mask, axis=-1)
        prediction_mask = prediction_mask[..., tf.newaxis]

        return prediction_mask[0]

    def _save_predictions(self, dataset, sample_size=1):
        """Save a mosaic for the first item of each of the first ``sample_size`` dataset elements."""
        # ``image_id`` rather than ``id`` so the builtin is not shadowed.
        for image_id, (image, mask) in enumerate(dataset.take(sample_size), start=1):
            pred_mask = self.model.predict(image)
            pred_mask = self._create_mask(pred_mask)

            image = image[0]
            ground_truth_mask = mask[0]
            self._save_image_and_masks(image, ground_truth_mask, pred_mask, image_id=image_id)

    def evaluate(self, test_dataset, sample_size=5):
        """Evaluate the model, print the accuracy, and save sample predictions.

        Args:
            test_dataset: Dataset passed to ``model.evaluate``.
            sample_size: Number of dataset elements to save mosaics for.

        Returns:
            The value returned by ``model.evaluate`` (previously discarded);
            index 1 is the metric printed as accuracy.
        """
        result = self.model.evaluate(test_dataset)
        print(f'Accuracy: {result[1] * 100:.2f}%')

        self._save_predictions(test_dataset, sample_size)

        return result

In [75]:
def load_data(AUTOTUNE, BUFFER_SIZE, BATCH_SIZE):
    """Load Oxford-IIIT Pet and build the training and test input pipelines.

    Only the first 10% of each split is used: the sizes are read from the
    dataset info for ``'train[:10%]'`` / ``'test[:10%]'`` and applied with
    ``take``.

    Args:
        AUTOTUNE: Parallelism / prefetch setting forwarded to ``map`` and ``prefetch``.
        BUFFER_SIZE: Shuffle buffer size for the training pipeline.
        BATCH_SIZE: Batch size for both pipelines.

    Returns:
        Tuple ``(train_dataset, test_dataset, TRAIN_SIZE, VALIDATION_SIZE)``.
        ``train_dataset`` is shuffled, batched and repeated indefinitely;
        ``test_dataset`` is batched only.
    """
    dataset, info = tfdata.load('oxford_iiit_pet', with_info=True)
    print(info)

    TRAIN_SIZE = info.splits['train[:10%]'].num_examples
    VALIDATION_SIZE = info.splits['test[:10%]'].num_examples
    print(TRAIN_SIZE, VALIDATION_SIZE)

    # A leftover debug loop used to iterate the whole training subset here
    # just to count it; the count is already known from the split info, so
    # that extra pass over the data is gone.

    # Cache the raw elements *before* the augmenting map: caching after it
    # would replay the first epoch's flip decisions in every later epoch.
    train_dataset = (dataset['train'].take(TRAIN_SIZE) #dataset['train']
                        .cache()
                        .map(load_image, num_parallel_calls=AUTOTUNE)
                        .shuffle(BUFFER_SIZE)
                        .batch(BATCH_SIZE)
                        .repeat()
                        .prefetch(buffer_size=AUTOTUNE))

    test_dataset = (dataset['test'].take(VALIDATION_SIZE) #dataset['test']
                    .map(lambda d: load_image(d, train=False), num_parallel_calls=AUTOTUNE)
                    .batch(BATCH_SIZE))

    return train_dataset, test_dataset, TRAIN_SIZE, VALIDATION_SIZE

In [13]:
def train_model(train_dataset, test_dataset, EPOCHS, STEPS_PER_EPOCH, VALIDATION_STEPS):
    """Build a ``UNet``, train it, evaluate it on the test data, and return it.

    ``test_dataset`` is used both as validation data during training and for
    the final evaluation.

    Args:
        train_dataset: Training data.
        test_dataset: Validation / evaluation data.
        EPOCHS: Number of training epochs.
        STEPS_PER_EPOCH: Training steps per epoch.
        VALIDATION_STEPS: Validation steps per epoch.

    Returns:
        The trained ``UNet`` instance (previously discarded), so callers can
        save, log or reuse the model.
    """
    unet = UNet()
    unet.train(train_dataset,
               epochs=EPOCHS,
               steps_per_epoch=STEPS_PER_EPOCH,
               validation_steps=VALIDATION_STEPS,
               validation_dataset=test_dataset)
    unet.evaluate(test_dataset)

    return unet

In [79]:
@flow(task_runner=SequentialTaskRunner())
def main():
    """Prefect flow: load the data once, derive the step counts, then train and evaluate."""

    # MLflow tracking is currently switched off:
    # client = mlflow_setup()
    # experiment_name = 'IMAGE-SEGMENTATION'
    # create_mlflow_experiment(experiment_name)

    BATCH_SIZE = 32
    BUFFER_SIZE = 1000
    AUTOTUNE = tf.data.experimental.AUTOTUNE

    # Load exactly once. The flow used to call load_data a second time with
    # identical arguments right before training, repeating the whole dataset
    # load for no benefit.
    train_dataset, test_dataset, TRAIN_SIZE, VALIDATION_SIZE = load_data(AUTOTUNE, BUFFER_SIZE, BATCH_SIZE)

    EPOCHS = 1 #50
    # Clamp to at least one step so a subset smaller than one batch (or than
    # BATCH_SIZE * VALIDATION_SUBSPLITS) cannot produce a zero step count.
    STEPS_PER_EPOCH = max(1, TRAIN_SIZE // BATCH_SIZE)
    VALIDATION_SUBSPLITS = 5
    VALIDATION_STEPS = VALIDATION_SIZE // BATCH_SIZE
    VALIDATION_STEPS = max(1, VALIDATION_STEPS // VALIDATION_SUBSPLITS)

    train_model(train_dataset, test_dataset, EPOCHS, STEPS_PER_EPOCH, VALIDATION_STEPS)

In [80]:
# Run the flow when this cell/script is executed directly (not when imported).
if __name__ == "__main__":
    main()

15:28:10.116 | INFO    | prefect.engine - Created flow run 'wondrous-hog' for flow 'main'
15:28:10.116 | INFO    | Flow run 'wondrous-hog' - Using task runner 'SequentialTaskRunner'


tfds.core.DatasetInfo(
    name='oxford_iiit_pet',
    full_name='oxford_iiit_pet/3.2.0',
    description="""
    The Oxford-IIIT pet dataset is a 37 category pet image dataset with roughly 200
    images for each class. The images have large variations in scale, pose and
    lighting. All images have an associated ground truth annotation of breed.
    """,
    homepage='http://www.robots.ox.ac.uk/~vgg/data/pets/',
    data_path='/home/pmspraju/tensorflow_datasets/oxford_iiit_pet/3.2.0',
    download_size=773.52 MiB,
    dataset_size=774.69 MiB,
    features=FeaturesDict({
        'file_name': Text(shape=(), dtype=tf.string),
        'image': Image(shape=(None, None, 3), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=37),
        'segmentation_mask': Image(shape=(None, None, 1), dtype=tf.uint8),
        'species': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    spli

2023-03-26 15:28:13.009936: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/sequential_8/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2023-03-26 15:28:15.250291: W tensorflow/core/kernels/gpu_utils.cc:50] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.
2023-03-26 15:28:25.489274: W tensorflow/tsl/framework/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 128.00MiB (rounded to 134217728)requested by op model/sequential_14/conv2d_transpose_6/conv2d_transpose
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follo