In [1]:
%matplotlib inline
#%pdb on

In [2]:
import tensorflow as tf
import tensorflow_models as tfm

from official.vision.configs import maskrcnn as exp_cfg
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.tasks import maskrcnn
from official.vision.dataloaders.tf_example_decoder import TfExampleDecoder
from official.vision.serving import export_saved_model_lib
from official.vision.configs import backbones as backbones_cfg
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import os
import numpy as np
from official.vision.utils.object_detection import visualization_utils
import time
from custom_utils import send_pushover_notification, intercept_stdout
from functools import partial
from tfm_configs import *
import re
import tensorflow_datasets as tfds
import pandas as pd

2024-07-23 20:37:42.738156: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-23 20:37:42.752454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-23 20:37:42.767909: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-23 20:37:42.767957: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-23 20:37:42.779924: I tensorflow/core/platform/cpu_feature_gua

In [3]:
IMAGE_SIZE = (512, 512)
BATCH_SIZE = 4
TFDS_NAME = 'leaf_instance_dataset'
INPUT_PATH = "/home/stefan.steinheber/tensorflow_datasets/leaf_instance_dataset/1.0.0/"
MODEL = "maskrcnn_mobilenet_fpn"
MODEL_DIR = "out/" + MODEL
START = time.time()
RESTORE_METRICS = False

In [4]:
exp_config = maskrcnn_mobilenet_fpn(INPUT_PATH, image_size=IMAGE_SIZE)
exp_config.trainer.steps_per_loop = 100
exp_config.trainer.validation_interval = 100

In [5]:
logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices('GPU')]

if len(logical_device_names) == 0:
    logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()]

distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0])

print("Created distribution Strategy on Device", logical_device_names[0])

Created distribution Strategy on Device /device:GPU:0


2024-07-23 20:37:45.503360: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46866 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:07:00.0, compute capability: 8.6


In [6]:
def show_batch(raw_records):
    tf_ex_decoder = TfExampleDecoder(include_mask=True)
    plt.figure(figsize=(20, 20))
    use_normalized_coordinates=True
    min_score_thresh = 0.30
    for i, serialized_example in enumerate(raw_records):
        plt.subplot(1, 3, i + 1)
        decoded_tensors = tf_ex_decoder.decode(serialized_example)
        image = decoded_tensors['image'].numpy().astype('uint8')
        scores = np.ones(shape=(len(decoded_tensors['groundtruth_boxes'])))
        visualization_utils.visualize_boxes_and_labels_on_image_array(
            image,
            decoded_tensors['groundtruth_boxes'].numpy(),
            decoded_tensors['groundtruth_classes'].numpy().astype('int'),
            scores,
            category_index={
                1: {
                    'id': 1,
                    'name': 'leaf',
                },
            },
            use_normalized_coordinates=use_normalized_coordinates,
            min_score_thresh=min_score_thresh,
            instance_masks=decoded_tensors['groundtruth_instance_masks'].numpy().astype('uint8'),
            line_thickness=4)

        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Image-{i+1}")
    plt.show()
    #plt.savefig("out/fig.png")

In [7]:
buffer_size = 100
num_of_examples = 2
tf_ex_decoder = TfExampleDecoder(include_mask=True)

train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)
raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)
#show_batch(raw_records)

val_tfrecords = tf.io.gfile.glob(exp_config.task.validation_data.input_path)
val_raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)
#show_batch(val_raw_records)
show = True
for train, val in zip(raw_records, val_raw_records):
    train_decoded = tf_ex_decoder.decode(train)
    val_decoded = tf_ex_decoder.decode(val)
    
    for key in train_decoded.keys():
        hor_ok = train_decoded[key].shape[1:] == val_decoded[key].shape[1:]
        if not hor_ok:
            print("Horizontal Integrity not given", key, train_decoded[key].shape[1:], val_decoded[key].shape[1:])

    sizes_train = [train_decoded[key].shape[0] for key in train_decoded.keys() if len(train_decoded[key].shape) > 0]
    train_ver_ok = len(set(sizes_train)) == 1
    if not train_ver_ok:
        print("Train vertical integrity not given", sizes_train,  [(key, value.shape) for key, value in train_decoded.items()])

    sizes_val = [val_decoded[key].shape[0] for key in val_decoded.keys() if len(val_decoded[key].shape) > 0]
    val_ver_ok = len(set(sizes_val)) == 1
    if not val_ver_ok:
        print("Val vertical integrity not given", sizes_val, [(key, value.shape) for key, value in val_decoded.items()])
print("integrity given")

Train vertical integrity not given [512, 4, 4, 4, 4, 4, 4] [('source_id', TensorShape([])), ('image', TensorShape([512, 512, 3])), ('height', TensorShape([])), ('width', TensorShape([])), ('groundtruth_classes', TensorShape([4])), ('groundtruth_is_crowd', TensorShape([4])), ('groundtruth_area', TensorShape([4])), ('groundtruth_boxes', TensorShape([4, 4])), ('groundtruth_instance_masks', TensorShape([4, 512, 512])), ('groundtruth_instance_masks_png', TensorShape([4]))]
Val vertical integrity not given [512, 2, 2, 2, 2, 2, 2] [('source_id', TensorShape([])), ('image', TensorShape([512, 512, 3])), ('height', TensorShape([])), ('width', TensorShape([])), ('groundtruth_classes', TensorShape([2])), ('groundtruth_is_crowd', TensorShape([2])), ('groundtruth_area', TensorShape([2])), ('groundtruth_boxes', TensorShape([2, 4])), ('groundtruth_instance_masks', TensorShape([2, 512, 512])), ('groundtruth_instance_masks_png', TensorShape([2]))]
Train vertical integrity not given [512, 6, 6, 6, 6, 6, 

2024-07-23 20:37:46.868145: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


integrity given


In [8]:
with distribution_strategy.scope():
    task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=MODEL_DIR)

In [9]:
def send_notification(text):
    if "loss" not in text or 'eval' not in text:
        return
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    losses = re.findall(r"'(.*)_loss':.*?(\d+\.\d+)", text)
    losses.reverse()
    msg = f"Step #{step}:" + ' - '.join([f"{name} Loss: {value}" for name, value in losses])
    send_pushover_notification(msg, title="Training Losses", priority=-1)

In [10]:
data_train = []
data_val = []
os.makedirs(f"metrics/{MODEL}", exist_ok=True)
    
def log_eval(text):
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    metrics_dict = re.findall(r"\s+{?'(.*?)': (.*?)(?:,|}),", text)
    print(metrics_dict)
    metrics = {name: value for name, value in metrics_dict}
    metrics.update({'step': step, 'mode': 'train' if 'train' in text else 'eval'})
    
    data_val.append(metrics)

    df = pd.DataFrame(data_val)
    df.to_csv(f"metrics/{MODEL}/metrics_val_{START}.csv", index=False)
    
def log_train(text):
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    losses = re.findall(r"'(.*)_loss':.*?(\d+\.\d+)", text)
    metrics = {name: value for name, value in losses}
    metrics.update({'step': step, 'mode': 'train' if 'train' in text else 'eval'})

    data_train.append(metrics)

    df = pd.DataFrame(data_train)
    os.makedirs(f"metrics/{MODEL}", exist_ok=True)
    df.to_csv(f"metrics/{MODEL}/metrics_train_{START}.csv", index=False)
    
    
def tfm_log(text):
    if "output" not in text:
        return
    if "eval" in text:
        log_eval(text)
        return
    if "train" in text:
        log_train(text)
        return

In [None]:
send_pushover_notification("Starting Training", "Tensorflow Models")

with intercept_stdout(tfm_log):
    model, eval_logs = tfm.core.train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train_and_eval',
        params=exp_config,
        model_dir=MODEL_DIR,
        run_post_eval=True)







Instructions for updating:
Use fn_output_signature instead


Instructions for updating:
Use fn_output_signature instead


loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
restoring or initializing model...
restored model from out/maskrcnn_mobilenet_fpn/ckpt-10000.
restored from checkpoint: out/maskrcnn_mobilenet_fpn/ckpt-10000
train | step:  10000 | training until step 10100...


W0000 00:00:1721759895.956551 2681704 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -58 } dim { size: -43 } dim { size: -44 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -4 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -4 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } value { dtype: DT_INT32 tensor_shape { dim { size: 2 } } int_val: 112 } } device { type: "CPU" vendor: "AuthenticAMD" model: "241" frequency: 2700 num_cores: 2 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 134217728 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -4 } dim { s







2024-07-23 20:38:52.663669: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


train | step:  10100 | steps/sec:    1.5 | output: 
    {'frcnn_box_loss': 0.20523308,
     'frcnn_cls_loss': 0.07739122,
     'learning_rate': 0.12,
     'mask_loss': 0.18824834,
     'model_loss': 0.5338817,
     'rpn_box_loss': 0.055377863,
     'rpn_score_loss': 0.0076312465,
     'total_loss': 0.71559393,
     'training_loss': 0.71559393}
 eval | step:  10100 | running 278 steps of evaluation...


W0000 00:00:1721759963.200437 2681704 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -5 } dim { size: -6 } dim { size: -7 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } value { dtype: DT_INT32 tensor_shape { dim { size: 2 } } int_val: 112 } } device { type: "CPU" vendor: "AuthenticAMD" model: "241" frequency: 2700 num_cores: 2 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 134217728 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size

creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=5.17s).
Accumulating evaluation results...
DONE (t=0.22s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.464
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.745
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.524
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.473
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.144
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.527
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.541
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.541
 Average Recall     (AR) @[ IoU=0.

W0000 00:00:1721760066.365365 2681704 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -5 } dim { size: -6 } dim { size: -7 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } value { dtype: DT_INT32 tensor_shape { dim { size: 2 } } int_val: 112 } } device { type: "CPU" vendor: "AuthenticAMD" model: "241" frequency: 2700 num_cores: 2 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 134217728 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size

In [None]:
send_pushover_notification("Finished Training", "Tensorflow Models")

export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=1,
    input_image_size=[IMAGE_SIZE[1], IMAGE_SIZE[0]],
    params=exp_config,
    checkpoint_path=tf.train.latest_checkpoint(MODEL_DIR),
    export_dir=f'{MODEL_DIR}/final')