### Load --> Run --> Compute --> Cache

Run inference ONCE on the evaluation dataset and compute global metrics from the exported SavedModel.

In [1]:
# loading model
from keras.layers import TFSMLayer
import tensorflow as tf
import pathlib

# Absolute location of the exported SavedModel directory.
SAVED_MODEL_DIR = pathlib.Path(
    "/mnt/d/04_Food101-EfficientNetV2S-model/models/export/food101_savedmodel"
)

# TFSMLayer wraps the SavedModel's "serving_default" endpoint as a Keras layer;
# placing it in a Sequential gives a single callable `model` object.
serving_layer = TFSMLayer(
    SAVED_MODEL_DIR.as_posix(),
    call_endpoint="serving_default",
)
model = tf.keras.Sequential([serving_layer])


2026-01-16 14:14:11.114696: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-16 14:14:17.869795: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1768569263.935005   49097 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5518 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:64:00.0, compute capability: 8.9


In [2]:
# Building evaluation dataset
import tensorflow_datasets as tfds

# Request the "train" and "validation" splits as (image, label) pairs together
# with the dataset metadata; file shuffling is switched off.
splits, ds_info = tfds.load(
    "food101",
    split=["train", "validation"],
    shuffle_files=False,
    as_supervised=True,
    with_info=True,
)
ds_train, ds_val = splits

# Target size handed to tf.image.resize, and the number of images per batch.
IMG_SIZE = (384, 384)
BATCH_SIZE = 32

def preprocess_fn(image, label):
    """Prepare one (image, label) example for the model.

    The image is resized to IMG_SIZE, cast to float32 and passed through the
    EfficientNetV2 `preprocess_input` helper. The label is returned untouched.

    NOTE(review): Keras documents `efficientnet_v2.preprocess_input` as a
    pass-through kept for API consistency — confirm that the exported model
    performs its own input scaling.
    """
    resized = tf.image.resize(image, IMG_SIZE)
    as_float = tf.cast(resized, tf.float32)
    model_input = tf.keras.applications.efficientnet_v2.preprocess_input(as_float)
    return model_input, label

# Evaluation pipeline over the validation split: preprocess each example,
# group examples into batches of BATCH_SIZE, then prefetch batches.
val_preprocessed = ds_val.map(preprocess_fn, num_parallel_calls=tf.data.AUTOTUNE)
val_batched = val_preprocessed.batch(BATCH_SIZE)
test_ds = val_batched.prefetch(tf.data.AUTOTUNE)


In [3]:
import json

# Class label names taken from the dataset metadata.
class_names = ds_info.features["label"].names

# Store the names as indented JSON in the current working directory.
with open("class_names.json", "w") as names_file:
    names_file.write(json.dumps(class_names, indent=2))

In [4]:
# Run inference ONCE and cache the outputs
import numpy as np
from tqdm import tqdm

# Per-batch results are collected in lists and merged once after the loop.
label_batches = []
prob_batches = []

# test_ds yields (images, labels) batches; each model call processes a whole
# batch at once. tqdm wraps the dataset to show a progress bar.
for batch_images, batch_labels in tqdm(test_ds):
    batch_outputs = model(batch_images)
    # The call returns a mapping; the scores are read from its "output_layer" key.
    batch_probs = batch_outputs["output_layer"]

    label_batches.append(batch_labels.numpy())
    prob_batches.append(batch_probs.numpy())

# Join the per-batch arrays into one array covering every evaluated image.
y_true = np.concatenate(label_batches)
y_probs = np.concatenate(prob_batches)
# Predicted class = index of the largest score in each row.
y_pred = y_probs.argmax(axis=1)

print(y_true.shape, y_probs.shape)


  0%|          | 0/790 [00:00<?, ?it/s]2026-01-16 14:14:37.226900: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:396] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2026-01-16 14:14:39.100749: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91300
100%|██████████| 790/790 [03:13<00:00,  4.27it/s]2026-01-16 14:17:50.902098: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
100%|██████████| 790/790 [03:13<00:00,  4.08it/s]

(25250,) (25250, 101)





In [5]:
# Compute global metrics
# Top-1: fraction of samples whose predicted class equals the true label.
top1_acc = np.mean(y_pred == y_true)

# Top-5: argsort orders class indices from lowest to highest score, so the last
# five columns hold the five highest-scoring classes for each sample.
top5_preds = np.argsort(y_probs, axis=1)[:, -5:]
# Compare every true label against its row of five candidates in one broadcast
# operation instead of a per-sample Python loop; a row counts as a hit when any
# of its five candidates matches.
top5_hits = np.any(top5_preds == y_true[:, None], axis=1)
top5_acc = np.mean(top5_hits)

print(f"Top-1 Accuracy: {top1_acc:.4f}")
print(f"Top-5 Accuracy: {top5_acc:.4f}")


Top-1 Accuracy: 0.8427
Top-5 Accuracy: 0.9668


In [6]:
# Cache everything (CRITICAL)
# Write the labels, predictions, probabilities and summary metrics to disk so they can be reloaded without re-running inference.

# Write each cached array to its own .npy file in the current working directory.
for npy_name, cached_array in (
    ("y_true.npy", y_true),
    ("y_pred.npy", y_pred),
    ("y_probs.npy", y_probs),
):
    np.save(npy_name, cached_array)

import json
# Summary metrics, cast to built-in float/int before being serialized.
metrics_summary = {
    "top1_accuracy": float(top1_acc),
    "top5_accuracy": float(top5_acc),
    "num_samples": int(len(y_true)),
}
with open("metrics.json", "w") as metrics_file:
    json.dump(metrics_summary, metrics_file, indent=2)
