In [1]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

In [2]:
import tensorflow as tf
import tensorflow_models as tfm

from official.vision.configs import maskrcnn as exp_cfg
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.tasks import maskrcnn
from official.vision.dataloaders.tf_example_decoder import TfExampleDecoder
from official.vision.serving import export_saved_model_lib
from official.vision.configs import backbones as backbones_cfg
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import os
import numpy as np
from official.vision.utils.object_detection import visualization_utils
import time
from custom_utils import send_pushover_notification, intercept_stdout
from functools import partial
from tfm_configs import *

In [3]:
# --- Notebook configuration -------------------------------------------------

# Image size; also used to derive the input size of the exported inference graph.
IMAGE_SIZE = (640, 640)

# NOTE(review): BATCH_SIZE and TFDS_NAME are not referenced in the visible
# cells — presumably consumed by the experiment config; confirm.
BATCH_SIZE = 4
TFDS_NAME = 'leaf_instance_dataset'

# Dataset location handed to the experiment-config factory.
INPUT_PATH = "/home/stefan.steinheber/tensorflow_datasets/leaf_instance_dataset/1.0.0/"

# MODEL names the output directory used for logs, checkpoints and the export.
# NOTE(review): the experiment config is created with maskrcnn_vit_fpn(...)
# while MODEL is "retina_net" — confirm the directory name is intended.
MODEL = "retina_net"
MODEL_DIR = f"out/{MODEL}"

In [13]:
# Build the experiment configuration for the dataset located at INPUT_PATH.
# NOTE(review): maskrcnn_vit_fpn is not imported by name in this notebook;
# presumably it is provided by `from tfm_configs import *` — confirm.
exp_config = maskrcnn_vit_fpn(INPUT_PATH)

In [6]:
# Prefer GPU devices; when no GPU is listed, fall back to every logical
# device TensorFlow reports.
logical_device_names = (
    [device.name for device in tf.config.list_logical_devices('GPU')]
    or [device.name for device in tf.config.list_logical_devices()]
)

# Everything runs on the first device of the chosen list.
distribution_strategy = tf.distribute.OneDeviceStrategy(
    logical_device_names[0]
)

print(
    "Created distribution Strategy on Device",
    logical_device_names[0],
)

Created distribution Strategy on Device /device:GPU:0


2024-07-14 14:52:50.171848: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46866 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:06:00.0, compute capability: 8.6


In [7]:
def show_batch(raw_records, output_path="out/fig.png"):
    """Decode serialized examples, draw their ground truth and save the figure.

    Every record is decoded with ``TfExampleDecoder(include_mask=True)``. The
    ground-truth boxes, classes and instance masks are drawn onto the decoded
    image, and all images are placed side by side in a single-row figure that
    is written to ``output_path``.

    Args:
        raw_records: Iterable of serialized examples (for instance a
            ``tf.data.TFRecordDataset``). It is iterated exactly once.
        output_path: Destination file for the figure. Missing parent
            directories are created. Defaults to ``"out/fig.png"``.
    """
    tf_ex_decoder = TfExampleDecoder(include_mask=True)
    # Only class id 1 ('leaf') has a display name.
    category_index = {
        1: {
            'id': 1,
            'name': 'leaf',
        },
    }
    use_normalized_coordinates = True
    min_score_thresh = 0.30

    # Materialize once so the grid width matches the real number of records
    # (a fixed 3-column grid raises as soon as a 4th record is plotted).
    records = list(raw_records)
    num_columns = max(len(records), 1)

    plt.figure(figsize=(20, 20))
    for i, serialized_example in enumerate(records):
        plt.subplot(1, num_columns, i + 1)
        decoded_tensors = tf_ex_decoder.decode(serialized_example)
        # astype() returns a fresh array that the drawing helper is given to
        # annotate.
        image = decoded_tensors['image'].numpy().astype('uint8')
        # Ground truth carries no confidence: give every box a score of 1.0,
        # which is above min_score_thresh.
        scores = np.ones(shape=(len(decoded_tensors['groundtruth_boxes'])))
        visualization_utils.visualize_boxes_and_labels_on_image_array(
            image,
            decoded_tensors['groundtruth_boxes'].numpy(),
            decoded_tensors['groundtruth_classes'].numpy().astype('int'),
            scores,
            category_index=category_index,
            use_normalized_coordinates=use_normalized_coordinates,
            min_score_thresh=min_score_thresh,
            instance_masks=decoded_tensors['groundtruth_instance_masks'].numpy().astype('uint8'),
            line_thickness=4)

        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Image-{i+1}")

    # savefig does not create directories; make sure the target folder exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_path)

In [8]:
# Create the task from the experiment's task config inside the distribution
# strategy scope; MODEL_DIR is passed as the task's logging directory.
with distribution_strategy.scope():
    task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=MODEL_DIR)

In [9]:
# Shuffle-buffer size and number of examples drawn for the visual sanity check.
buffer_size, num_of_examples = 100, 3

# Resolve the training input pattern to concrete TFRecord files.
train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)

# Take a small shuffled sample of raw (still serialized) records.
raw_records = (
    tf.data.TFRecordDataset(train_tfrecords)
    .shuffle(buffer_size=buffer_size)
    .take(num_of_examples)
)

show_batch(raw_records)

2024-07-14 14:52:51.614671: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
# Load the TensorBoard notebook extension and open it on the model directory;
# {MODEL_DIR} is expanded from the Python variable of the same name.
%load_ext tensorboard
%tensorboard --logdir {MODEL_DIR}

In [11]:
# Announce the start of training through the project's pushover helper.
send_pushover_notification("Starting Training", "Tensorflow Models")

# NOTE(review): intercept_stdout comes from custom_utils; presumably it forwards
# captured stdout to the given callback (here a low-priority notification) —
# confirm against its implementation.
with intercept_stdout(partial(send_pushover_notification, title="Tensorflow Models Training", priority=-1)):
    # Train-only run: no evaluation after training (run_post_eval=False).
    # Checkpoints go to MODEL_DIR; the recorded output of this cell shows the
    # run restoring from an existing checkpoint there (ckpt-2000).
    model, eval_logs = tfm.core.train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train',
        params=exp_config,
        model_dir=MODEL_DIR,
        run_post_eval=False)

restoring or initializing model...
restored model from out/retina_net/ckpt-2000.
restored from checkpoint: out/retina_net/ckpt-2000
train | step:   2000 | training until step 224000...


2024-07-14 14:53:48.202537: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


train | step:   2200 | steps/sec:    3.7 | output: 
    {'box_loss': 0.009864903,
     'cls_loss': 0.58861494,
     'learning_rate': 0.06998334,
     'model_loss': 1.0818603,
     'total_loss': 5.6780415,
     'training_loss': 5.6780415}
saved checkpoint to out/retina_net/ckpt-2200.
train | step:   2400 | steps/sec:    7.8 | output: 
    {'box_loss': 0.009791267,
     'cls_loss': 0.5803076,
     'learning_rate': 0.069980174,
     'model_loss': 1.0698707,
     'total_loss': 5.539984,
     'training_loss': 5.539984}
saved checkpoint to out/retina_net/ckpt-2400.
train | step:   2600 | steps/sec:    7.8 | output: 
    {'box_loss': 0.009657195,
     'cls_loss': 0.57970667,
     'learning_rate': 0.06997673,
     'model_loss': 1.0625662,
     'total_loss': 5.410213,
     'training_loss': 5.410213}
saved checkpoint to out/retina_net/ckpt-2600.
train | step:   2800 | steps/sec:    7.5 | output: 
    {'box_loss': 0.009532521,
     'cls_loss': 0.5581615,
     'learning_rate': 0.069973014,
     'm

KeyboardInterrupt: 

In [16]:
# Tell the operator that training is over before starting the export.
send_pushover_notification("Finished Training", "Tensorflow Models")

# Export an inference graph from the most recent checkpoint in MODEL_DIR.
# IMAGE_SIZE is passed with its two entries swapped.
# NOTE(review): presumably IMAGE_SIZE is (width, height) and the exporter
# expects [height, width] — confirm.
export_saved_model_lib.export_inference_graph(
    params=exp_config,
    input_type='image_tensor',
    batch_size=1,
    input_image_size=list(reversed(IMAGE_SIZE)),
    checkpoint_path=tf.train.latest_checkpoint(MODEL_DIR),
    export_dir=MODEL_DIR + '/final',
)









INFO:tensorflow:Assets written to: out/retina_net/final/retina_net1720966054.093223/assets


INFO:tensorflow:Assets written to: out/retina_net/final/retina_net1720966054.093223/assets
