In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import tensorflow as tf
import tensorflow_models as tfm

from official.vision.configs import maskrcnn as exp_cfg
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.tasks import maskrcnn
from official.vision.dataloaders.tf_example_decoder import TfExampleDecoder
from official.vision.serving import export_saved_model_lib
from official.vision.configs import backbones as backbones_cfg
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import os
import numpy as np
from official.vision.utils.object_detection import visualization_utils
import time
from custom_utils import send_pushover_notification, intercept_stdout
from functools import partial
from tfm_configs import *
import re
import tensorflow_datasets as tfds

In [3]:
# --- Notebook configuration -------------------------------------------------
IMAGE_SIZE = (640, 640)
BATCH_SIZE = 4
TFDS_NAME = 'leaf_instance_dataset'

# Directory that is handed to the experiment-config factory.
# The default is the original hard-coded location (built from TFDS_NAME so the
# dataset name is spelled only once); set the LEAF_DATASET_DIR environment
# variable to run the notebook on another machine without editing this cell.
# NOTE(review): the trailing slash is kept from the original value - confirm
# whether the config factory relies on it.
INPUT_PATH = os.environ.get(
    "LEAF_DATASET_DIR",
    f"/home/stefan.steinheber/tensorflow_datasets/{TFDS_NAME}/1.0.0/",
)

# Name of the model/experiment; also used as the checkpoint sub-directory.
MODEL = "retinanet_resnet_fpn"
MODEL_DIR = "out/" + MODEL

In [4]:
# Build the experiment configuration for the dataset directory INPUT_PATH.
# `retinanet_resnet_fpn` is not imported by name in this notebook; presumably
# it comes from the star import of `tfm_configs` - confirm.
# NOTE(review): MODEL holds the same name as a string and determines MODEL_DIR;
# keep both in sync when switching to a different config factory.
exp_config = retinanet_resnet_fpn(INPUT_PATH)

In [5]:
# Prefer GPU logical devices; `or` only queries the full device list when the
# GPU list comes back empty.
logical_device_names = [
    device.name
    for device in (tf.config.list_logical_devices('GPU') or tf.config.list_logical_devices())
]

# Everything runs on the first device that was found.
distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0])

print("Created distribution Strategy on Device", logical_device_names[0])

Created distribution Strategy on Device /device:GPU:0


2024-07-16 13:42:03.010026: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 45507 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:06:00.0, compute capability: 8.6


In [6]:
def show_batch(raw_records):
    """Plot serialized examples together with their ground-truth annotations.

    Every record is decoded with ``TfExampleDecoder(include_mask=True)``; its
    ground-truth boxes, class ids and instance masks are drawn onto the image,
    and the annotated images are shown side by side in a single-row figure.

    Args:
        raw_records: Iterable of serialized examples accepted by
            ``TfExampleDecoder.decode`` (the notebook passes a small
            ``tf.data.TFRecordDataset``). It is iterated exactly once.
    """
    tf_ex_decoder = TfExampleDecoder(include_mask=True)
    # Loop-invariant settings for the drawing helper.
    category_index = {1: {'id': 1, 'name': 'leaf'}}
    # NOTE(review): assumes the decoder returns boxes in normalized
    # coordinates - confirm against TfExampleDecoder.
    use_normalized_coordinates = True
    min_score_thresh = 0.30

    # Materialize the records first so the grid can be sized to the real number
    # of images. The previous fixed 1x3 grid made plt.subplot raise for a 4th
    # record; up to three records still use the original 3-column layout.
    records = list(raw_records)
    num_columns = max(3, len(records))

    plt.figure(figsize=(20, 20))
    for i, serialized_example in enumerate(records):
        plt.subplot(1, num_columns, i + 1)
        decoded_tensors = tf_ex_decoder.decode(serialized_example)
        image = decoded_tensors['image'].numpy().astype('uint8')
        # Ground truth carries no confidence; use 1.0 for every box so nothing
        # is dropped by min_score_thresh.
        scores = np.ones(shape=(len(decoded_tensors['groundtruth_boxes'])))
        # The returned array is ignored, as before; the image passed in is the
        # one displayed below.
        visualization_utils.visualize_boxes_and_labels_on_image_array(
            image,
            decoded_tensors['groundtruth_boxes'].numpy(),
            decoded_tensors['groundtruth_classes'].numpy().astype('int'),
            scores,
            category_index=category_index,
            use_normalized_coordinates=use_normalized_coordinates,
            min_score_thresh=min_score_thresh,
            instance_masks=decoded_tensors['groundtruth_instance_masks'].numpy().astype('uint8'),
            line_thickness=4)

        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Image-{i+1}")
    plt.show()

In [7]:
# Create the task described by the experiment config inside the distribution
# strategy's scope; MODEL_DIR is passed as the task's logging directory.
with distribution_strategy.scope():
    task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=MODEL_DIR)

In [8]:
# Preview settings: how many examples to draw and how large the shuffle buffer is.
num_of_examples = 3
buffer_size = 100

# Resolve the training TFRecord files from the experiment config.
train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)

# Shuffle within the buffer and keep only a handful of serialized examples.
raw_records = (
    tf.data.TFRecordDataset(train_tfrecords)
    .shuffle(buffer_size=buffer_size)
    .take(num_of_examples)
)
show_batch(raw_records)

2024-07-16 13:42:07.208258: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
def send_notification(text):
    """Forward the losses found in a captured stdout chunk as a push message.

    Only chunks that mention both ``loss`` and ``eval`` are considered. The
    step number and every ``'<name>_loss': <value>`` pair are extracted and
    sent through ``send_pushover_notification``. Chunks without a parsable
    step or without any loss entry are ignored instead of raising, so a
    malformed log line cannot break the stdout interception.

    Args:
        text: Text chunk captured from stdout.

    Returns:
        None.
    """
    if "loss" not in text or 'eval' not in text:
        return

    step_match = re.search(r"step:.*?(\d+).*?\|", text)
    if step_match is None:
        # Nothing to report without a step number (previously AttributeError).
        return

    # Name: everything between the opening quote and "_loss'"; it may not cross
    # another quote, so several keys on one line stay separate.
    # Value: a decimal or exponent-form float (a bare integer is skipped, as
    # before), or nan/inf so that a diverged run is still reported.
    losses = re.findall(
        r"'([^']*)_loss':.*?(-?(?:\d+\.\d+(?:[eE][-+]?\d+)?|\d+[eE][-+]?\d+|nan\b|inf\b))",
        text)
    if not losses:
        return

    # Same ordering as before: last printed loss first.
    losses.reverse()
    msg = f"Step #{step_match.group(1)}: " + ' - '.join(
        f"{name} Loss: {value}" for name, value in losses)
    send_pushover_notification(msg, title="Training Losses", priority=-1)

In [None]:
# Announce the start of the run (message text first, then "Tensorflow Models"
# as the second positional argument - presumably the title; confirm against
# custom_utils.send_pushover_notification).
send_pushover_notification("Starting Training", "Tensorflow Models")

# While the experiment runs, captured stdout is handed to send_notification.
# NOTE(review): how intercept_stdout chunks the output is defined in
# custom_utils - confirm that a complete eval summary arrives in one call.
with intercept_stdout(send_notification):
    model, eval_logs = tfm.core.train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train_and_eval',
        params=exp_config,
        model_dir=MODEL_DIR,
        run_post_eval=True)

restoring or initializing model...
restored model from out/retinanet_resnet_fpn/ckpt-184200.
restored from checkpoint: out/retinanet_resnet_fpn/ckpt-184200
train | step:  184200 | training until step 185200...


2024-07-16 09:40:32.478372: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


train | step:  185200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0015649764,
     'cls_loss': 0.13380967,
     'learning_rate': 0.0012,
     'model_loss': 0.21205865,
     'total_loss': 0.33398414,
     'training_loss': 0.33398414}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-185200.
 eval | step:  185200 | running 278 steps of evaluation...


2024-07-16 09:45:05.960424: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 09:45:05.960546: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 09:45:05.961395: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  185200 | steps/sec:    5.5 | eval time:   50.7 sec | output: 
    {'box_loss': 0.0033559632,
     'cls_loss': 0.22008419,
     'model_loss': 0.38788235,
     'steps_per_second': 5.485444635236317,
     'total_loss': 0.50968564,
     'validation_loss': 0.50968564}
train | step:  185200 | training until step 186200...
train | step:  186200 | steps/sec:    3.8 | output: 
    {'box_loss': 0.0015796728,
     'cls_loss': 0.13456315,
     'learning_rate': 0.0012,
     'model_loss': 0.21354692,
     'total_loss': 0.33522737,
     'training_loss': 0.33522737}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-186200.
 eval | step:  186200 | running 278 steps of evaluation...


2024-07-16 09:49:24.045424: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 09:49:24.045523: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 09:49:24.046301: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  186200 | steps/sec:    6.3 | eval time:   43.9 sec | output: 
    {'box_loss': 0.0033696536,
     'cls_loss': 0.21950579,
     'model_loss': 0.38798845,
     'steps_per_second': 6.338785292834686,
     'total_loss': 0.5095458,
     'validation_loss': 0.5095458}
train | step:  186200 | training until step 187200...
train | step:  187200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.001587319,
     'cls_loss': 0.13449213,
     'learning_rate': 0.0012,
     'model_loss': 0.2138582,
     'total_loss': 0.33529434,
     'training_loss': 0.33529434}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-187200.
 eval | step:  187200 | running 278 steps of evaluation...


2024-07-16 09:53:39.778483: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 09:53:39.778584: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 09:53:39.779359: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  187200 | steps/sec:    6.4 | eval time:   43.2 sec | output: 
    {'box_loss': 0.0033728804,
     'cls_loss': 0.22014911,
     'model_loss': 0.38879323,
     'steps_per_second': 6.432878175580375,
     'total_loss': 0.51010746,
     'validation_loss': 0.51010746}
train | step:  187200 | training until step 188200...
train | step:  188200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015480496,
     'cls_loss': 0.13294694,
     'learning_rate': 0.0012,
     'model_loss': 0.21034922,
     'total_loss': 0.33154207,
     'training_loss': 0.33154207}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-188200.
 eval | step:  188200 | running 278 steps of evaluation...


2024-07-16 09:57:54.276722: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 09:57:54.276838: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 09:57:54.277660: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  188200 | steps/sec:    6.7 | eval time:   41.8 sec | output: 
    {'box_loss': 0.0033605928,
     'cls_loss': 0.21976084,
     'model_loss': 0.38779053,
     'steps_per_second': 6.656945182762414,
     'total_loss': 0.5088612,
     'validation_loss': 0.5088612}
train | step:  188200 | training until step 189200...
train | step:  189200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015509591,
     'cls_loss': 0.13318229,
     'learning_rate': 0.0012,
     'model_loss': 0.21073005,
     'total_loss': 0.3316796,
     'training_loss': 0.3316796}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-189200.
 eval | step:  189200 | running 278 steps of evaluation...


2024-07-16 10:02:12.670412: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:02:12.670523: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:02:12.671429: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  189200 | steps/sec:    6.1 | eval time:   45.9 sec | output: 
    {'box_loss': 0.0033584666,
     'cls_loss': 0.2198563,
     'model_loss': 0.38777968,
     'steps_per_second': 6.059614233063131,
     'total_loss': 0.5086083,
     'validation_loss': 0.5086083}
train | step:  189200 | training until step 190200...
train | step:  190200 | steps/sec:    3.8 | output: 
    {'box_loss': 0.0015430672,
     'cls_loss': 0.13236006,
     'learning_rate': 0.0012,
     'model_loss': 0.20951329,
     'total_loss': 0.3302214,
     'training_loss': 0.3302214}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-190200.
 eval | step:  190200 | running 278 steps of evaluation...


2024-07-16 10:06:29.054200: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:06:29.055142: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:06:29.055944: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  190200 | steps/sec:    6.6 | eval time:   42.0 sec | output: 
    {'box_loss': 0.003351191,
     'cls_loss': 0.22059475,
     'model_loss': 0.38815427,
     'steps_per_second': 6.612267671965124,
     'total_loss': 0.5087421,
     'validation_loss': 0.5087421}
train | step:  190200 | training until step 191200...
train | step:  191200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015489463,
     'cls_loss': 0.1320232,
     'learning_rate': 0.0012,
     'model_loss': 0.20947039,
     'total_loss': 0.3299373,
     'training_loss': 0.3299373}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-191200.
 eval | step:  191200 | running 278 steps of evaluation...


2024-07-16 10:10:45.543941: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:10:45.544056: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:10:45.544920: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  191200 | steps/sec:    6.4 | eval time:   43.2 sec | output: 
    {'box_loss': 0.0033199128,
     'cls_loss': 0.21977563,
     'model_loss': 0.38577127,
     'steps_per_second': 6.435260507246321,
     'total_loss': 0.50611776,
     'validation_loss': 0.50611776}
train | step:  191200 | training until step 192200...
train | step:  192200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015550184,
     'cls_loss': 0.13206476,
     'learning_rate': 0.0012,
     'model_loss': 0.20981562,
     'total_loss': 0.33004192,
     'training_loss': 0.33004192}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-192200.
 eval | step:  192200 | running 278 steps of evaluation...


2024-07-16 10:14:58.724267: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:14:58.724376: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:14:58.725170: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  192200 | steps/sec:    6.7 | eval time:   41.2 sec | output: 
    {'box_loss': 0.0033454727,
     'cls_loss': 0.22134066,
     'model_loss': 0.38861433,
     'steps_per_second': 6.749962916252046,
     'total_loss': 0.5087209,
     'validation_loss': 0.5087209}
train | step:  192200 | training until step 193200...
train | step:  193200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0015361613,
     'cls_loss': 0.13181394,
     'learning_rate': 0.0012,
     'model_loss': 0.20862196,
     'total_loss': 0.32861042,
     'training_loss': 0.32861042}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-193200.
 eval | step:  193200 | running 278 steps of evaluation...


2024-07-16 10:19:11.845073: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:19:11.845192: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:19:11.846095: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  193200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.0033520649,
     'cls_loss': 0.2211119,
     'model_loss': 0.38871527,
     'steps_per_second': 6.775949436977424,
     'total_loss': 0.5085854,
     'validation_loss': 0.5085854}
train | step:  193200 | training until step 194200...
train | step:  194200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0015490807,
     'cls_loss': 0.13200934,
     'learning_rate': 0.0012,
     'model_loss': 0.20946328,
     'total_loss': 0.32921576,
     'training_loss': 0.32921576}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-194200.
 eval | step:  194200 | running 278 steps of evaluation...


2024-07-16 10:23:24.966845: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:23:24.966961: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:23:24.967650: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  194200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033821203,
     'cls_loss': 0.22168721,
     'model_loss': 0.39079323,
     'steps_per_second': 6.769775577047412,
     'total_loss': 0.51042765,
     'validation_loss': 0.51042765}
train | step:  194200 | training until step 195200...
train | step:  195200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015450403,
     'cls_loss': 0.13174312,
     'learning_rate': 0.0012,
     'model_loss': 0.20899506,
     'total_loss': 0.32851237,
     'training_loss': 0.32851237}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-195200.
 eval | step:  195200 | running 278 steps of evaluation...


2024-07-16 10:27:39.129245: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:27:39.129365: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:27:39.130177: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  195200 | steps/sec:    6.6 | eval time:   41.8 sec | output: 
    {'box_loss': 0.003391085,
     'cls_loss': 0.22385646,
     'model_loss': 0.3934108,
     'steps_per_second': 6.649140434563416,
     'total_loss': 0.51281095,
     'validation_loss': 0.51281095}
train | step:  195200 | training until step 196200...
train | step:  196200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015079918,
     'cls_loss': 0.1297393,
     'learning_rate': 0.0012,
     'model_loss': 0.2051387,
     'total_loss': 0.32441998,
     'training_loss': 0.32441998}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-196200.
 eval | step:  196200 | running 278 steps of evaluation...


2024-07-16 10:31:53.105351: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:31:53.105470: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:31:53.106434: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  196200 | steps/sec:    6.7 | eval time:   41.2 sec | output: 
    {'box_loss': 0.0033945262,
     'cls_loss': 0.22284023,
     'model_loss': 0.39256647,
     'steps_per_second': 6.7498327602106025,
     'total_loss': 0.5117294,
     'validation_loss': 0.5117294}
train | step:  196200 | training until step 197200...
train | step:  197200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015194779,
     'cls_loss': 0.1290709,
     'learning_rate': 0.0012,
     'model_loss': 0.20504487,
     'total_loss': 0.32409003,
     'training_loss': 0.32409003}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-197200.
 eval | step:  197200 | running 278 steps of evaluation...


2024-07-16 10:36:06.341952: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:36:06.342058: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:36:06.342885: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  197200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033513291,
     'cls_loss': 0.22279775,
     'model_loss': 0.39036426,
     'steps_per_second': 6.758181182660438,
     'total_loss': 0.50929195,
     'validation_loss': 0.50929195}
train | step:  197200 | training until step 198200...
train | step:  198200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.001520841,
     'cls_loss': 0.13053337,
     'learning_rate': 0.0012,
     'model_loss': 0.20657548,
     'total_loss': 0.32538596,
     'training_loss': 0.32538596}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-198200.
 eval | step:  198200 | running 278 steps of evaluation...
train | step:  199200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015091663,
     'cls_loss': 0.12928377,
     'learning_rate': 0.0012,
     'model_loss': 0.20474215,
     'total_loss': 0.32331964,
     'training_loss': 0.32331964}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-199200.
 eval | step:  

2024-07-16 10:44:33.933162: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:44:33.933284: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:44:33.934055: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  199200 | steps/sec:    6.7 | eval time:   41.7 sec | output: 
    {'box_loss': 0.0033511494,
     'cls_loss': 0.2205162,
     'model_loss': 0.38807368,
     'steps_per_second': 6.671459017927765,
     'total_loss': 0.50653535,
     'validation_loss': 0.50653535}
train | step:  199200 | training until step 200200...
train | step:  200200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014710492,
     'cls_loss': 0.1276689,
     'learning_rate': 0.0012,
     'model_loss': 0.2012215,
     'total_loss': 0.3195673,
     'training_loss': 0.3195673}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-200200.
 eval | step:  200200 | running 278 steps of evaluation...


2024-07-16 10:48:47.194226: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:48:47.194339: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:48:47.195167: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  200200 | steps/sec:    6.7 | eval time:   41.6 sec | output: 
    {'box_loss': 0.0043968777,
     'cls_loss': 0.29598036,
     'model_loss': 0.51582414,
     'steps_per_second': 6.681245893756401,
     'total_loss': 0.6340604,
     'validation_loss': 0.6340604}
train | step:  200200 | training until step 201200...
train | step:  201200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015643962,
     'cls_loss': 0.13516353,
     'learning_rate': 0.0012,
     'model_loss': 0.21338333,
     'total_loss': 0.331511,
     'training_loss': 0.331511}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-201200.
 eval | step:  201200 | running 278 steps of evaluation...


2024-07-16 10:53:00.651698: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:53:00.651829: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:53:00.652595: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  201200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033571236,
     'cls_loss': 0.22044964,
     'model_loss': 0.3883059,
     'steps_per_second': 6.769686828329647,
     'total_loss': 0.5063189,
     'validation_loss': 0.5063189}
train | step:  201200 | training until step 202200...
train | step:  202200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0015339686,
     'cls_loss': 0.13261463,
     'learning_rate': 0.0012,
     'model_loss': 0.20931277,
     'total_loss': 0.32721415,
     'training_loss': 0.32721415}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-202200.
 eval | step:  202200 | running 278 steps of evaluation...


2024-07-16 10:57:13.369514: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 10:57:13.369617: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 10:57:13.370343: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  202200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.0033560984,
     'cls_loss': 0.22390784,
     'model_loss': 0.3917127,
     'steps_per_second': 6.777720694341262,
     'total_loss': 0.50949985,
     'validation_loss': 0.50949985}
train | step:  202200 | training until step 203200...
train | step:  203200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0015350365,
     'cls_loss': 0.13171323,
     'learning_rate': 0.0012,
     'model_loss': 0.2084651,
     'total_loss': 0.32613784,
     'training_loss': 0.32613784}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-203200.
 eval | step:  203200 | running 278 steps of evaluation...


2024-07-16 11:01:27.598532: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:01:27.598647: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:01:27.599422: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  203200 | steps/sec:    6.7 | eval time:   41.5 sec | output: 
    {'box_loss': 0.0033902312,
     'cls_loss': 0.22618036,
     'model_loss': 0.39569196,
     'steps_per_second': 6.691865518478943,
     'total_loss': 0.51325047,
     'validation_loss': 0.51325047}
train | step:  203200 | training until step 204200...
train | step:  204200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014803212,
     'cls_loss': 0.12747212,
     'learning_rate': 0.0012,
     'model_loss': 0.20148809,
     'total_loss': 0.31893194,
     'training_loss': 0.31893194}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-204200.
 eval | step:  204200 | running 278 steps of evaluation...


2024-07-16 11:05:40.719100: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:05:40.719198: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:05:40.719942: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  204200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033589366,
     'cls_loss': 0.22421534,
     'model_loss': 0.392162,
     'steps_per_second': 6.7674570044907245,
     'total_loss': 0.5094912,
     'validation_loss': 0.5094912}
train | step:  204200 | training until step 205200...
train | step:  205200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014874515,
     'cls_loss': 0.1278383,
     'learning_rate': 0.0012,
     'model_loss': 0.20221095,
     'total_loss': 0.31942466,
     'training_loss': 0.31942466}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-205200.
 eval | step:  205200 | running 278 steps of evaluation...


2024-07-16 11:09:53.961547: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:09:53.961658: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:09:53.962431: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  205200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.0033787086,
     'cls_loss': 0.22210407,
     'model_loss': 0.3910395,
     'steps_per_second': 6.774313822708527,
     'total_loss': 0.50813967,
     'validation_loss': 0.50813967}
train | step:  205200 | training until step 206200...
train | step:  206200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014814076,
     'cls_loss': 0.12677519,
     'learning_rate': 0.0012,
     'model_loss': 0.20084567,
     'total_loss': 0.3178321,
     'training_loss': 0.3178321}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-206200.
 eval | step:  206200 | running 278 steps of evaluation...


2024-07-16 11:14:07.640433: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:14:07.640565: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:14:07.641336: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  206200 | steps/sec:    6.8 | eval time:   41.2 sec | output: 
    {'box_loss': 0.0033588742,
     'cls_loss': 0.22311515,
     'model_loss': 0.3910588,
     'steps_per_second': 6.751133061182377,
     'total_loss': 0.5079315,
     'validation_loss': 0.5079315}
train | step:  206200 | training until step 207200...
train | step:  207200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014441541,
     'cls_loss': 0.12537178,
     'learning_rate': 0.0012,
     'model_loss': 0.1975795,
     'total_loss': 0.31433773,
     'training_loss': 0.31433773}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-207200.
 eval | step:  207200 | running 278 steps of evaluation...


2024-07-16 11:18:20.770035: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:18:20.770146: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:18:20.770899: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  207200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033445442,
     'cls_loss': 0.22286026,
     'model_loss': 0.3900875,
     'steps_per_second': 6.7574082878091275,
     'total_loss': 0.5067315,
     'validation_loss': 0.5067315}
train | step:  207200 | training until step 208200...
train | step:  208200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014427562,
     'cls_loss': 0.12530966,
     'learning_rate': 0.0012,
     'model_loss': 0.19744746,
     'total_loss': 0.31397876,
     'training_loss': 0.31397876}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-208200.
 eval | step:  208200 | running 278 steps of evaluation...
train | step:  209200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014617781,
     'cls_loss': 0.12573089,
     'learning_rate': 0.0012,
     'model_loss': 0.19881988,
     'total_loss': 0.315126,
     'training_loss': 0.315126}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-209200.
 eval | step:  20920

2024-07-16 11:26:46.615419: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:26:46.615513: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:26:46.616430: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  209200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033639695,
     'cls_loss': 0.22156172,
     'model_loss': 0.38976023,
     'steps_per_second': 6.758612159861357,
     'total_loss': 0.5059553,
     'validation_loss': 0.5059553}
train | step:  209200 | training until step 210200...
train | step:  210200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014581535,
     'cls_loss': 0.12573843,
     'learning_rate': 0.0012,
     'model_loss': 0.19864626,
     'total_loss': 0.31473044,
     'training_loss': 0.31473044}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-210200.
 eval | step:  210200 | running 278 steps of evaluation...


2024-07-16 11:30:59.537810: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:30:59.537930: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:30:59.538901: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  210200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.0033621618,
     'cls_loss': 0.22330499,
     'model_loss': 0.39141312,
     'steps_per_second': 6.787051176580651,
     'total_loss': 0.50738555,
     'validation_loss': 0.50738555}
train | step:  210200 | training until step 211200...
train | step:  211200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014306734,
     'cls_loss': 0.124149665,
     'learning_rate': 0.0012,
     'model_loss': 0.19568312,
     'total_loss': 0.31154418,
     'training_loss': 0.31154418}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-211200.
 eval | step:  211200 | running 278 steps of evaluation...


2024-07-16 11:35:13.470629: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:35:13.470746: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:35:13.471717: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  211200 | steps/sec:    6.7 | eval time:   41.7 sec | output: 
    {'box_loss': 0.0033519121,
     'cls_loss': 0.22155322,
     'model_loss': 0.38914907,
     'steps_per_second': 6.660933957340898,
     'total_loss': 0.50489694,
     'validation_loss': 0.50489694}
train | step:  211200 | training until step 212200...
train | step:  212200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0014283055,
     'cls_loss': 0.12389809,
     'learning_rate': 0.0012,
     'model_loss': 0.19531341,
     'total_loss': 0.31095067,
     'training_loss': 0.31095067}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-212200.
 eval | step:  212200 | running 278 steps of evaluation...


2024-07-16 11:39:26.442168: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:39:26.442267: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:39:26.442997: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  212200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033771743,
     'cls_loss': 0.22601448,
     'model_loss': 0.39487326,
     'steps_per_second': 6.763477527414129,
     'total_loss': 0.51039964,
     'validation_loss': 0.51039964}
train | step:  212200 | training until step 213200...
train | step:  213200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014191993,
     'cls_loss': 0.12340513,
     'learning_rate': 0.0012,
     'model_loss': 0.19436516,
     'total_loss': 0.30977908,
     'training_loss': 0.30977908}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-213200.
 eval | step:  213200 | running 278 steps of evaluation...


2024-07-16 11:43:39.665898: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:43:39.666012: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:43:39.666781: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  213200 | steps/sec:    6.7 | eval time:   41.6 sec | output: 
    {'box_loss': 0.0033866381,
     'cls_loss': 0.22579055,
     'model_loss': 0.39512232,
     'steps_per_second': 6.6813350186408895,
     'total_loss': 0.5104251,
     'validation_loss': 0.5104251}
train | step:  213200 | training until step 214200...
train | step:  214200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014353403,
     'cls_loss': 0.12517144,
     'learning_rate': 0.0012,
     'model_loss': 0.19693844,
     'total_loss': 0.31213352,
     'training_loss': 0.31213352}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-214200.
 eval | step:  214200 | running 278 steps of evaluation...


2024-07-16 11:47:52.659963: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:47:52.660069: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:47:52.660953: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  214200 | steps/sec:    6.7 | eval time:   41.2 sec | output: 
    {'box_loss': 0.003347811,
     'cls_loss': 0.2285033,
     'model_loss': 0.3958938,
     'steps_per_second': 6.7407760679708035,
     'total_loss': 0.5109781,
     'validation_loss': 0.5109781}
train | step:  214200 | training until step 215200...
train | step:  215200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014044453,
     'cls_loss': 0.122147955,
     'learning_rate': 0.0012,
     'model_loss': 0.19236995,
     'total_loss': 0.30734512,
     'training_loss': 0.30734512}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-215200.
 eval | step:  215200 | running 278 steps of evaluation...


2024-07-16 11:52:04.822266: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:52:04.822384: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:52:04.823124: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  215200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033790402,
     'cls_loss': 0.22865951,
     'model_loss': 0.39761147,
     'steps_per_second': 6.769931030475394,
     'total_loss': 0.5124765,
     'validation_loss': 0.5124765}
train | step:  215200 | training until step 216200...
train | step:  216200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014255429,
     'cls_loss': 0.124547444,
     'learning_rate': 0.0012,
     'model_loss': 0.19582465,
     'total_loss': 0.3105823,
     'training_loss': 0.3105823}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-216200.
 eval | step:  216200 | running 278 steps of evaluation...


2024-07-16 11:56:17.001172: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 11:56:17.001291: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 11:56:17.001924: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  216200 | steps/sec:    6.7 | eval time:   41.3 sec | output: 
    {'box_loss': 0.0033874277,
     'cls_loss': 0.22517401,
     'model_loss': 0.3945454,
     'steps_per_second': 6.736591799753057,
     'total_loss': 0.5091941,
     'validation_loss': 0.5091941}
train | step:  216200 | training until step 217200...
train | step:  217200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014005172,
     'cls_loss': 0.1223157,
     'learning_rate': 0.0012,
     'model_loss': 0.19234186,
     'total_loss': 0.30687982,
     'training_loss': 0.30687982}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-217200.
 eval | step:  217200 | running 278 steps of evaluation...


2024-07-16 12:00:30.196289: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:00:30.196393: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:00:30.197289: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  217200 | steps/sec:    6.8 | eval time:   41.2 sec | output: 
    {'box_loss': 0.0034020531,
     'cls_loss': 0.2283623,
     'model_loss': 0.398465,
     'steps_per_second': 6.75169965696698,
     'total_loss': 0.5128935,
     'validation_loss': 0.5128935}
train | step:  217200 | training until step 218200...
train | step:  218200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014048388,
     'cls_loss': 0.1220641,
     'learning_rate': 0.0012,
     'model_loss': 0.19230615,
     'total_loss': 0.30662557,
     'training_loss': 0.30662557}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-218200.
 eval | step:  218200 | running 278 steps of evaluation...


2024-07-16 12:04:43.544027: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:04:43.544135: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:04:43.544913: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  218200 | steps/sec:    6.7 | eval time:   41.6 sec | output: 
    {'box_loss': 0.0033675777,
     'cls_loss': 0.22548407,
     'model_loss': 0.39386284,
     'steps_per_second': 6.680877819390068,
     'total_loss': 0.5080728,
     'validation_loss': 0.5080728}
train | step:  218200 | training until step 219200...
train | step:  219200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0013812702,
     'cls_loss': 0.121416144,
     'learning_rate': 0.0012,
     'model_loss': 0.19047965,
     'total_loss': 0.3045818,
     'training_loss': 0.3045818}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-219200.
 eval | step:  219200 | running 278 steps of evaluation...


2024-07-16 12:08:56.556452: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:08:56.556551: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:08:56.557384: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  219200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.003350944,
     'cls_loss': 0.22638078,
     'model_loss': 0.39392796,
     'steps_per_second': 6.775189519006938,
     'total_loss': 0.5079215,
     'validation_loss': 0.5079215}
train | step:  219200 | training until step 220200...
train | step:  220200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0014076651,
     'cls_loss': 0.1217664,
     'learning_rate': 0.0012,
     'model_loss': 0.19214958,
     'total_loss': 0.3060352,
     'training_loss': 0.3060352}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-220200.
 eval | step:  220200 | running 278 steps of evaluation...


2024-07-16 12:13:09.395591: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:13:09.395705: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:13:09.396433: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  220200 | steps/sec:    6.8 | eval time:   40.9 sec | output: 
    {'box_loss': 0.0033832535,
     'cls_loss': 0.2312575,
     'model_loss': 0.40042022,
     'steps_per_second': 6.7958527864050176,
     'total_loss': 0.51419777,
     'validation_loss': 0.51419777}
train | step:  220200 | training until step 221200...
train | step:  221200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013766022,
     'cls_loss': 0.12051289,
     'learning_rate': 0.0012,
     'model_loss': 0.18934308,
     'total_loss': 0.3030135,
     'training_loss': 0.3030135}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-221200.
 eval | step:  221200 | running 278 steps of evaluation...


2024-07-16 12:17:21.839070: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:17:21.839174: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:17:21.839867: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  221200 | steps/sec:    6.8 | eval time:   41.2 sec | output: 
    {'box_loss': 0.0033542614,
     'cls_loss': 0.22565025,
     'model_loss': 0.39336327,
     'steps_per_second': 6.7504291122177875,
     'total_loss': 0.506926,
     'validation_loss': 0.506926}
train | step:  221200 | training until step 222200...
train | step:  222200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013907147,
     'cls_loss': 0.12150329,
     'learning_rate': 0.0012,
     'model_loss': 0.19103907,
     'total_loss': 0.30449435,
     'training_loss': 0.30449435}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-222200.
 eval | step:  222200 | running 278 steps of evaluation...


2024-07-16 12:21:35.415571: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:21:35.415686: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:21:35.416418: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  222200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.0034601297,
     'cls_loss': 0.23639257,
     'model_loss': 0.40939906,
     'steps_per_second': 6.77721637207679,
     'total_loss': 0.52274686,
     'validation_loss': 0.52274686}
train | step:  222200 | training until step 223200...


2024-07-16 12:25:48.988444: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:25:48.988569: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:25:48.989303: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  223200 | steps/sec:    3.4 | eval time:   82.2 sec | output: 
    {'box_loss': 0.0033652668,
     'cls_loss': 0.22710608,
     'model_loss': 0.39536956,
     'steps_per_second': 3.3806201815301806,
     'total_loss': 0.50850046,
     'validation_loss': 0.50850046}
train | step:  223200 | training until step 224200...
train | step:  224200 | steps/sec:    3.4 | output: 
    {'box_loss': 0.0013897954,
     'cls_loss': 0.12042271,
     'learning_rate': 0.0012,
     'model_loss': 0.1899123,
     'total_loss': 0.30293795,
     'training_loss': 0.30293795}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-224200.
 eval | step:  224200 | running 278 steps of evaluation...


2024-07-16 12:30:41.988198: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:30:41.988302: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:30:41.989054: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  224200 | steps/sec:    6.7 | eval time:   41.3 sec | output: 
    {'box_loss': 0.003384053,
     'cls_loss': 0.22990195,
     'model_loss': 0.39910448,
     'steps_per_second': 6.7365680584764185,
     'total_loss': 0.51202387,
     'validation_loss': 0.51202387}
train | step:  224200 | training until step 225200...
train | step:  225200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013686484,
     'cls_loss': 0.119903974,
     'learning_rate': 0.0012,
     'model_loss': 0.18833642,
     'total_loss': 0.3011494,
     'training_loss': 0.3011494}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-225200.
 eval | step:  225200 | running 278 steps of evaluation...


2024-07-16 12:34:55.280067: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:34:55.280172: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:34:55.280966: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  225200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033862733,
     'cls_loss': 0.22732176,
     'model_loss': 0.3966354,
     'steps_per_second': 6.757906376236056,
     'total_loss': 0.50934213,
     'validation_loss': 0.50934213}
train | step:  225200 | training until step 226200...
train | step:  226200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0013754714,
     'cls_loss': 0.11934859,
     'learning_rate': 0.0012,
     'model_loss': 0.18812208,
     'total_loss': 0.30072224,
     'training_loss': 0.30072224}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-226200.
 eval | step:  226200 | running 278 steps of evaluation...


2024-07-16 12:39:08.585299: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:39:08.585400: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:39:08.586247: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  226200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0033718555,
     'cls_loss': 0.22725557,
     'model_loss': 0.3958483,
     'steps_per_second': 6.765948722633663,
     'total_loss': 0.5083412,
     'validation_loss': 0.5083412}
train | step:  226200 | training until step 227200...
train | step:  227200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013381433,
     'cls_loss': 0.11779544,
     'learning_rate': 0.0012,
     'model_loss': 0.18470263,
     'total_loss': 0.29708955,
     'training_loss': 0.29708955}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-227200.
 eval | step:  227200 | running 278 steps of evaluation...


2024-07-16 12:43:21.605753: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:43:21.605862: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:43:21.606654: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  227200 | steps/sec:    6.8 | eval time:   41.0 sec | output: 
    {'box_loss': 0.003401585,
     'cls_loss': 0.23290111,
     'model_loss': 0.40298036,
     'steps_per_second': 6.77810452119328,
     'total_loss': 0.51526153,
     'validation_loss': 0.51526153}
train | step:  227200 | training until step 228200...
train | step:  228200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013595847,
     'cls_loss': 0.11926267,
     'learning_rate': 0.0012,
     'model_loss': 0.18724187,
     'total_loss': 0.29941678,
     'training_loss': 0.29941678}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-228200.
 eval | step:  228200 | running 278 steps of evaluation...


2024-07-16 12:47:35.697000: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:47:35.697119: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:47:35.697885: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  228200 | steps/sec:    6.6 | eval time:   42.3 sec | output: 
    {'box_loss': 0.0034008278,
     'cls_loss': 0.22955768,
     'model_loss': 0.39959908,
     'steps_per_second': 6.5691786491398805,
     'total_loss': 0.51166886,
     'validation_loss': 0.51166886}
train | step:  228200 | training until step 229200...
train | step:  229200 | steps/sec:    3.9 | output: 
    {'box_loss': 0.0013595577,
     'cls_loss': 0.11950614,
     'learning_rate': 0.0012,
     'model_loss': 0.18748371,
     'total_loss': 0.29944828,
     'training_loss': 0.29944828}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-229200.
 eval | step:  229200 | running 278 steps of evaluation...


2024-07-16 12:51:48.610366: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
2024-07-16 12:51:48.610484: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
2024-07-16 12:51:48.611221: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


 eval | step:  229200 | steps/sec:    6.8 | eval time:   41.1 sec | output: 
    {'box_loss': 0.0034227644,
     'cls_loss': 0.23410904,
     'model_loss': 0.40524724,
     'steps_per_second': 6.762283608466516,
     'total_loss': 0.5171072,
     'validation_loss': 0.5171072}
train | step:  229200 | training until step 230200...
train | step:  230200 | steps/sec:    4.0 | output: 
    {'box_loss': 0.0013263325,
     'cls_loss': 0.117364846,
     'learning_rate': 0.0012,
     'model_loss': 0.18368137,
     'total_loss': 0.295437,
     'training_loss': 0.295437}
saved checkpoint to out/retinanet_resnet_fpn/ckpt-230200.
 eval | step:  230200 | running 278 steps of evaluation...


In [9]:
# Announce that the training run is over before starting the export.
send_pushover_notification("Finished Training", "Tensorflow Models")

# tf.train.latest_checkpoint returns None when MODEL_DIR holds no checkpoint
# state; fail with a clear message instead of handing None to the exporter.
latest_checkpoint = tf.train.latest_checkpoint(MODEL_DIR)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {MODEL_DIR!r}; nothing to export.")

# Export a SavedModel for single-image inference into <MODEL_DIR>/final.
export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=1,
    # NOTE(review): the IMAGE_SIZE indices are swapped here, presumably to turn
    # (width, height) into [height, width] -- confirm against the exporter.
    # IMAGE_SIZE is square (640, 640), so the order has no effect today.
    input_image_size=[IMAGE_SIZE[1], IMAGE_SIZE[0]],
    params=exp_config,
    checkpoint_path=latest_checkpoint,
    export_dir=f'{MODEL_DIR}/final')









INFO:tensorflow:Assets written to: out/retinanet_resnet_fpn/final/assets


INFO:tensorflow:Assets written to: out/retinanet_resnet_fpn/final/assets
