In [1]:
%matplotlib inline
#%pdb on

In [2]:
import tensorflow as tf
import tensorflow_models as tfm

from official.vision.configs import maskrcnn as exp_cfg
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.tasks import maskrcnn
from official.vision.dataloaders.tf_example_decoder import TfExampleDecoder
from official.vision.serving import export_saved_model_lib
from official.vision.configs import backbones as backbones_cfg
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import os
import numpy as np
from official.vision.utils.object_detection import visualization_utils
import time
from custom_utils import send_pushover_notification, intercept_stdout
from functools import partial
from tfm_configs import *
import re
import tensorflow_datasets as tfdsw
import pandas as pd

2024-09-16 12:13:30.517071: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 12:13:30.541254: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 12:13:30.541300: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 12:13:30.558811: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
IMAGE_SIZE = (512, 512)
BATCH_SIZE = 4
TFDS_NAME = 'leaf_instance_dataset'
INPUT_PATH = "/home/stefan.steinheber/tensorflow_datasets/leaf_instance_dataset/1.0.0/"
#MODEL = "maskrcnn_mobilenet_fpn"
#MODEL = "retinanet_resnet_fpn"
MODEL = "maskrcnn_resnet_fpn"
MODEL_DIR = "out/" + MODEL
START = time.time()
RESTORE_METRICS = True

In [4]:
os.makedirs(MODEL_DIR, exist_ok=True)

In [5]:
#exp_config = maskrcnn_mobilenet_fpn(INPUT_PATH, image_size=IMAGE_SIZE)
#exp_config = retinanet_resnet_fpn(INPUT_PATH, image_size=IMAGE_SIZE)
exp_config = maskrcnn_resnet_fpn(INPUT_PATH, image_size=IMAGE_SIZE)
#exp_config.trainer.validation_interval = 100

In [6]:
logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices('GPU')]

if len(logical_device_names) == 0:
    logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()]

distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0])

print("Created distribution Strategy on Device", logical_device_names[0])

Created distribution Strategy on Device /device:GPU:0


2024-09-16 12:13:39.691555: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38484 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0


In [7]:
def show_batch(raw_records):
    tf_ex_decoder = TfExampleDecoder(include_mask=True)
    plt.figure(figsize=(20, 20))
    use_normalized_coordinates=True
    min_score_thresh = 0.30
    for i, serialized_example in enumerate(raw_records):
        plt.subplot(1, 3, i + 1)
        decoded_tensors = tf_ex_decoder.decode(serialized_example)
        image = decoded_tensors['image'].numpy().astype('uint8')
        scores = np.ones(shape=(len(decoded_tensors['groundtruth_boxes'])))
        visualization_utils.visualize_boxes_and_labels_on_image_array(
            image,
            decoded_tensors['groundtruth_boxes'].numpy(),
            decoded_tensors['groundtruth_classes'].numpy().astype('int'),
            scores,
            category_index={
                1: {
                    'id': 1,
                    'name': 'leaf',
                },
            },
            use_normalized_coordinates=use_normalized_coordinates,
            min_score_thresh=min_score_thresh,
            instance_masks=decoded_tensors['groundtruth_instance_masks'].numpy().astype('uint8'),
            line_thickness=4)

        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Image-{i+1}")
    plt.show()
    #plt.savefig("out/fig.png")

In [8]:
buffer_size = 100
num_of_examples = 2
tf_ex_decoder = TfExampleDecoder(include_mask=True)

train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)
raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)
#show_batch(raw_records)

val_tfrecords = tf.io.gfile.glob(exp_config.task.validation_data.input_path)
val_raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)
#show_batch(val_raw_records)
show = True
for train, val in zip(raw_records, val_raw_records):
    train_decoded = tf_ex_decoder.decode(train)
    val_decoded = tf_ex_decoder.decode(val)
    
    for key in train_decoded.keys():
        hor_ok = train_decoded[key].shape[1:] == val_decoded[key].shape[1:]
        if not hor_ok:
            print("Horizontal Integrity not given", key, train_decoded[key].shape[1:], val_decoded[key].shape[1:])

    sizes_train = [train_decoded[key].shape[0] for key in train_decoded.keys() if len(train_decoded[key].shape) > 0]
    train_ver_ok = len(set(sizes_train)) == 1
    if not train_ver_ok:
        print("Train vertical integrity not given", sizes_train,  [(key, value.shape) for key, value in train_decoded.items()])

    sizes_val = [val_decoded[key].shape[0] for key in val_decoded.keys() if len(val_decoded[key].shape) > 0]
    val_ver_ok = len(set(sizes_val)) == 1
    if not val_ver_ok:
        print("Val vertical integrity not given", sizes_val, [(key, value.shape) for key, value in val_decoded.items()])
print("integrity given")

Train vertical integrity not given [512, 9, 9, 9, 9, 9, 9] [('source_id', TensorShape([])), ('image', TensorShape([512, 512, 3])), ('height', TensorShape([])), ('width', TensorShape([])), ('groundtruth_classes', TensorShape([9])), ('groundtruth_is_crowd', TensorShape([9])), ('groundtruth_area', TensorShape([9])), ('groundtruth_boxes', TensorShape([9, 4])), ('groundtruth_instance_masks', TensorShape([9, 512, 512])), ('groundtruth_instance_masks_png', TensorShape([9]))]
Val vertical integrity not given [512, 19, 19, 19, 19, 19, 19] [('source_id', TensorShape([])), ('image', TensorShape([512, 512, 3])), ('height', TensorShape([])), ('width', TensorShape([])), ('groundtruth_classes', TensorShape([19])), ('groundtruth_is_crowd', TensorShape([19])), ('groundtruth_area', TensorShape([19])), ('groundtruth_boxes', TensorShape([19, 4])), ('groundtruth_instance_masks', TensorShape([19, 512, 512])), ('groundtruth_instance_masks_png', TensorShape([19]))]
Train vertical integrity not given [512, 6, 

2024-09-16 12:13:41.673748: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
with distribution_strategy.scope():
    task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=MODEL_DIR)

In [10]:
def send_notification(text):
    if "loss" not in text or 'eval' not in text:
        return
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    losses = re.findall(r"'(.*)_loss':.*?(\d+\.\d+)", text)
    losses.reverse()
    msg = f"Step #{step}:" + ' - '.join([f"{name} Loss: {value}" for name, value in losses])
    send_pushover_notification(msg, title="Training Losses", priority=-1)

In [11]:
data_train = []
data_val = []
os.makedirs(f"metrics/{MODEL}", exist_ok=True)

if RESTORE_METRICS:
    files = os.listdir(f"metrics/{MODEL}")
    vals = [file for file in files if "val" in file]
    trains = [file for file in files if "train" in file]
    vals.sort()
    trains.sort()
    last_val = vals[-1]
    last_train = trains[-1]
    data_train = pd.read_csv(os.path.join(f"metrics/{MODEL}", last_train)).to_dict('records')
    data_val = pd.read_csv(os.path.join(f"metrics/{MODEL}", last_val)).to_dict('records')
    
def log_eval(text):
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    metrics_dict = re.findall(r"\s+.'(.*?)':\s(.*\d)", text)
    metrics = {name: value for name, value in metrics_dict}
    metrics.update({'step': step, 'mode': 'train' if 'train' in text else 'eval'})
    
    data_val.append(metrics)

    df = pd.DataFrame(data_val)
    df.to_csv(f"metrics/{MODEL}/metrics_val_{START}.csv", index=False)
    
def log_train(text):
    step = re.search(r"step:.*?(\d+).*?\|", text)
    step = step.group(1)
    losses = re.findall(r"'(.*)_loss':.*?(\d+\.\d+)", text)
    metrics = {name: value for name, value in losses}
    metrics.update({'step': step, 'mode': 'train' if 'train' in text else 'eval'})

    data_train.append(metrics)

    df = pd.DataFrame(data_train)
    os.makedirs(f"metrics/{MODEL}", exist_ok=True)
    df.to_csv(f"metrics/{MODEL}/metrics_train_{START}.csv", index=False)
    
    
def tfm_log(text):
    if "output" not in text:
        return
    if "eval" in text:
        log_eval(text)
        return
    if "train" in text:
        log_train(text)
        return

In [12]:
send_pushover_notification("Starting Training", "Tensorflow Models")

with intercept_stdout(tfm_log):
    model, eval_logs = tfm.core.train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train_and_eval',
        params=exp_config,
        model_dir=MODEL_DIR,
        run_post_eval=True)





Instructions for updating:
Use fn_output_signature instead


Instructions for updating:
Use fn_output_signature instead
2024-09-16 12:14:25.792992: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
restoring or initializing model...


2024-09-16 12:14:56.132440: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


ValueError: Unsuccessful TensorSliceReader constructor: Failed to get matching files on gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080: FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 77 meaning 'Problem with the SSL CA cert (path? access rights?)', error details: error setting certificate verify locations:  CAfile: /etc/ssl/certs/ca-certificates.crt CApath: none
	 when reading gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet

In [None]:
send_pushover_notification("Finished Training", "Tensorflow Models")

export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=1,
    input_image_size=[IMAGE_SIZE[1], IMAGE_SIZE[0]],
    params=exp_config,
    checkpoint_path=tf.train.latest_checkpoint(MODEL_DIR),
    export_dir=f'{MODEL_DIR}/final')