In [None]:
# Setup & Paths
import os, re, random, math, json
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve, average_precision_score

print("TF:", tf.__version__)
SEED = 42
tf.random.set_seed(SEED); np.random.seed(SEED); random.seed(SEED)

# data folders 
ROOT = Path(r"C:/Users/sad77/Desktop/Part-A.1-CalEnvAgency/shipdata_2025")
CROPS_DIR  = ROOT / "cropped_ship_dataset"   
SCENES_DIR = ROOT / "scenes"                 
assert CROPS_DIR.exists(), f"Missing: {CROPS_DIR}"

# Output folders 
OUT_ROOT = Path("outputs")
(OUT_ROOT / "models").mkdir(parents=True, exist_ok=True)
(OUT_ROOT / "figs").mkdir(parents=True, exist_ok=True)

# Training hyperparams 
IMG_SIZE   = 128      
BATCH_SIZE = 64     
EPOCHS     = 70
VAL_SPLIT  = 0.20    


os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
for g in tf.config.list_physical_devices("GPU"):
    try: tf.config.experimental.set_memory_growth(g, True)
    except Exception: pass


TF: 2.10.1


In [2]:
import re, pandas as pd
from pathlib import Path

CROPS_DIR = Path(r"C:/Users/sad77/Desktop/Part-A.1-CalEnvAgency/shipdata_2025/cropped_ship_dataset")

rows, bad = [], []
for p in sorted([q for q in CROPS_DIR.rglob("*") if q.suffix.lower() in {".png",".jpg",".jpeg"}]):
    name = p.name
    parts = name.rsplit(".", 1)[0].split("__")
    if len(parts) < 3:
        bad.append(name); continue
    raw_label, scene = parts[0].lower(), parts[1]
    # label normalization
    if raw_label in {"0","no","noship","negative","neg"}:
        label = 0
    elif raw_label in {"1","yes","ship","positive","pos"}:
        label = 1
    else:
        # try first char digit
        label = 1 if raw_label[:1] == "1" else (0 if raw_label[:1] == "0" else None)
        if label is None:
            bad.append(name); continue
    rows.append((str(p), label, scene))

df = pd.DataFrame(rows, columns=["path","label","scene"])
print("Parsed:", len(df), " | bad filenames:", len(bad))
if bad: print("Bad examples:", bad[:20])
print(df["label"].value_counts(dropna=False))
print("Unique scenes:", df["scene"].nunique())


Parsed: 4000  | bad filenames: 0
label
0    3000
1    1000
Name: count, dtype: int64
Unique scenes: 434


In [None]:
import numpy as np

scenes = df.groupby("scene")["label"].agg(["sum","count"]).reset_index()
scenes["has_pos"] = scenes["sum"] > 0


pos_scenes = scenes[scenes["has_pos"]]["scene"].tolist()
neg_scenes = scenes[~scenes["has_pos"]]["scene"].tolist()
rng = np.random.default_rng(42)
rng.shuffle(pos_scenes); rng.shuffle(neg_scenes)

n_val = max(1, int(len(scenes)*VAL_SPLIT))
val_sel = []
if pos_scenes:
    val_sel.append(pos_scenes.pop())  
while len(val_sel) < n_val and (pos_scenes or neg_scenes):
    pick_from = pos_scenes if (len(pos_scenes) > 0 and len(val_sel) < n_val*0.5) else neg_scenes
    if not pick_from: pick_from = pos_scenes or neg_scenes
    val_sel.append(pick_from.pop())

val_scenes = set(val_sel)
df["split"] = np.where(df["scene"].isin(val_scenes), "val", "train")
df["label_name"] = df["label"].map({0:"No-Ship", 1:"Ship"})

print(df["split"].value_counts())
print(df.groupby(["split","label_name"]).size().unstack(fill_value=0))


split
train    3148
val       852
Name: count, dtype: int64
label_name  No-Ship  Ship
split                    
train          2358   790
val             642   210


In [None]:
AUTOTUNE = tf.data.AUTOTUNE

def decode_resize(path, label):
    img = tf.io.decode_png(tf.io.read_file(path), channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE), antialias=True)
    img = tf.cast(img, tf.float32) / 255.0
    return img, tf.cast(label, tf.int32)

def augment(img, label):
    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_flip_up_down(img)
    k   = tf.random.uniform([], 0, 4, dtype=tf.int32)  
    img = tf.image.rot90(img, k)
    img = tf.image.random_brightness(img, 0.05)
    img = tf.image.random_contrast(img, 0.95, 1.05)
    return img, label

def make_ds(frame, training=True):
    ds = tf.data.Dataset.from_tensor_slices((frame["path"].values, frame["label"].values))
    if training:
        ds = ds.shuffle(8000, seed=SEED)
    ds = ds.map(decode_resize, num_parallel_calls=AUTOTUNE)
    if training:
        ds = ds.map(augment, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_df = df[df["split"]=="train"].reset_index(drop=True)
val_df   = df[df["split"]=="val"].reset_index(drop=True)
train_ds = make_ds(train_df, training=True)
val_ds   = make_ds(val_df,   training=False)

pos = int((train_df["label"]==1).sum()); neg = int((train_df["label"]==0).sum())
total = len(train_df)
w_pos = total / (2*max(1,pos))
w_neg = total / (2*max(1,neg))
class_weights = {0: 1.0, 1: float(min(w_pos, 1.4))}
print("Class weights used:", class_weights)


Class weights used: {0: 1.0, 1: 1.4}


In [13]:
# Build ViT 
from tensorflow.keras import layers, models, regularizers

class AddClassTokenAndPos(layers.Layer):
    def __init__(self, num_patches, dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.dim = dim
        self.cls_token = self.add_weight(
            "cls_token", shape=(1, 1, dim), initializer="zeros", trainable=True
        )
        self.pos_emb = self.add_weight(
            "pos_emb", shape=(1, num_patches + 1, dim), initializer="zeros", trainable=True
        )

    def call(self, tokens):
        B = tf.shape(tokens)[0]
        cls = tf.repeat(self.cls_token, repeats=B, axis=0)  
        x = tf.concat([cls, tokens], axis=1)                
        return x + self.pos_emb

    def get_config(self):
        return {"num_patches": self.num_patches, "dim": self.dim, **super().get_config()}

def build_vit(
    img_size=128, patch=16, dim=256, depth=6, heads=4, mlp_dim=512, drop=0.1, l2=1e-6
) -> tf.keras.Model:
    assert img_size % patch == 0, "img_size must be divisible by patch"
    n_h = img_size // patch
    n_w = img_size // patch
    num_patches = n_h * n_w

    inputs = layers.Input((img_size, img_size, 3))
    x = layers.Conv2D(32, 3, padding="same", activation="relu")(inputs)

    
    # Output: [B, n_h, n_w, dim]
    x = layers.Conv2D(dim, kernel_size=patch, strides=patch, padding="valid")(x)
    # Flatten to tokens: [B, N, dim]
    x = layers.Reshape((num_patches, dim))(x)

    # Add [CLS] and positional embedding
    x = AddClassTokenAndPos(num_patches=num_patches, dim=dim)(x)

    # Transformer encoder blocks
    for _ in range(depth):
        # Norm → MHA → skip
        h = layers.LayerNormalization(epsilon=1e-6)(x)
        h = layers.MultiHeadAttention(num_heads=heads, key_dim=dim, dropout=drop)(h, h)
        x = layers.Add()([x, layers.Dropout(drop)(h)])
        # Norm → MLP → skip
        h = layers.LayerNormalization(epsilon=1e-6)(x)
        h = layers.Dense(mlp_dim, activation="gelu")(h)
        h = layers.Dropout(drop)(h)
        h = layers.Dense(dim)(h)
        x = layers.Add()([x, layers.Dropout(drop)(h)])

    x = layers.LayerNormalization(epsilon=1e-6)(x)
    cls = x[:, 0]  # [CLS]
    cls = layers.Dropout(drop)(cls)

    logits = layers.Dense(
        1,
        kernel_regularizer=regularizers.l2(l2),
        bias_regularizer=regularizers.l2(l2)
    )(cls)  

    return models.Model(inputs, logits, name="vit_binary")

vit = build_vit(
    img_size=IMG_SIZE, patch=16, dim=256, depth=6, heads=4, mlp_dim=512, drop=0.1, l2=1e-6
)
vit.summary()


Model: "vit_binary"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 128, 128, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_8 (Conv2D)              (None, 128, 128, 32  896         ['input_5[0][0]']                
                                )                                                                 
                                                                                                  
 conv2d_9 (Conv2D)              (None, 8, 8, 256)    2097408     ['conv2d_8[0][0]']               
                                                                                         

In [None]:
# Train ViT
from tensorflow.keras.metrics import BinaryAccuracy, AUC
from tensorflow.keras import optimizers

vit.compile(
    optimizer=optimizers.Adam(2e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True, label_smoothing=0.03),
    metrics=[BinaryAccuracy(threshold=0.0, name="acc"), AUC(from_logits=True, name="auc")]
)

cb = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min",
                                     patience=10, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(
        OUT_ROOT/"models/vit_final.weights.h5",
        save_best_only=True, save_weights_only=True,
        monitor="val_loss", mode="min"
    )
]

print("ViT class weights:", class_weights)
_ = vit.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    class_weight=class_weights,
    callbacks=cb,
    verbose=2
)
print("Training done. Saved best weights →", OUT_ROOT/"models/vit_final.weights.h5")


ViT class weights: {0: 1.0, 1: 1.4}
Epoch 1/70
50/50 - 32s - loss: 0.8954 - acc: 0.6630 - auc: 0.5042 - val_loss: 0.5930 - val_acc: 0.7535 - val_auc: 0.5208 - 32s/epoch - 635ms/step
Epoch 2/70
50/50 - 5s - loss: 0.7290 - acc: 0.7084 - auc: 0.5068 - val_loss: 0.6285 - val_acc: 0.7535 - val_auc: 0.5193 - 5s/epoch - 109ms/step
Epoch 3/70
50/50 - 6s - loss: 0.7155 - acc: 0.7182 - auc: 0.5315 - val_loss: 0.6057 - val_acc: 0.7477 - val_auc: 0.5229 - 6s/epoch - 111ms/step
Epoch 4/70
50/50 - 6s - loss: 0.6996 - acc: 0.7141 - auc: 0.5710 - val_loss: 0.6375 - val_acc: 0.6714 - val_auc: 0.5448 - 6s/epoch - 112ms/step
Epoch 5/70
50/50 - 5s - loss: 0.6799 - acc: 0.7141 - auc: 0.6207 - val_loss: 0.5976 - val_acc: 0.6984 - val_auc: 0.5986 - 5s/epoch - 109ms/step
Epoch 6/70
50/50 - 5s - loss: 0.7034 - acc: 0.6874 - auc: 0.5968 - val_loss: 0.8008 - val_acc: 0.7535 - val_auc: 0.5838 - 5s/epoch - 107ms/step
Epoch 7/70
50/50 - 5s - loss: 0.6868 - acc: 0.7392 - auc: 0.5856 - val_loss: 0.5749 - val_acc: 0.7

In [None]:
# Evaluation 
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Rebuild & load weights 
eval_model = build_vit(
    img_size=IMG_SIZE, patch=16, dim=256, depth=6, heads=4, mlp_dim=512, drop=0.1, l2=1e-6
)
eval_model.load_weights(OUT_ROOT/"models/vit_final.weights.h5")

AUTOTUNE = tf.data.AUTOTUNE
def decode_resize_eval(path, label):
    img = tf.io.decode_png(tf.io.read_file(path), channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE), antialias=True)
    img = tf.cast(img, tf.float32) / 255.0
    return img, tf.cast(label, tf.int32)

val_df_eval = df[df["split"]=="val"].reset_index(drop=True)
val_ds_eval = (tf.data.Dataset
               .from_tensor_slices((val_df_eval["path"].values, val_df_eval["label"].values))
               .map(decode_resize_eval, num_parallel_calls=AUTOTUNE)
               .batch(64).prefetch(AUTOTUNE))

ys, yh = [], []
for x,y in val_ds_eval:
    p = eval_model.predict(x, verbose=0)
    ys.append(y.numpy()); yh.append(p)
y_true = np.concatenate(ys).astype(int)
logits = np.concatenate(yh).ravel().astype(np.float32)
probs  = 1/(1+np.exp(-logits))
probs  = np.clip(probs, 1e-7, 1-1e-7)


ap = average_precision_score(y_true, probs)
prec, rec, thr = precision_recall_curve(y_true, probs)

ts = np.linspace(0.05, 0.50, 46) 
best_f1, best_t, best_cm = -1.0, 0.5, None
for t in ts:
    yhat = (probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0,1]).ravel()
    P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9)
    F1 = 2*P*R/(P+R+1e-9)
    if F1 > best_f1:
        best_f1, best_t, best_cm = F1, t, (tn,fp,fn,tp)

print(f"AP={ap:.3f} | Best-F1={best_f1:.3f} @ thr={best_t:.2f}")
print("CM @ best-F1 [TN FP FN TP]:", best_cm)


for t in [0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50]:
    yhat = (probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0,1]).ravel()
    P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9); F1 = 2*P*R/(P+R+1e-9)
    print(f"t={t:.2f}  P={P:.3f}  R={R:.3f}  F1={F1:.3f}  TN={tn} FP={fp} FN={fn} TP={tp}")

# Save confusion matrix & PR curve
FIG_DIR = OUT_ROOT / "figs"
cm = np.array([[best_cm[0], best_cm[1]],[best_cm[2], best_cm[3]]], dtype=int)

plt.figure(figsize=(4,4))
plt.imshow(cm, cmap="Blues")
plt.xticks([0,1], ["No-Ship","Ship"]); plt.yticks([0,1], ["No-Ship","Ship"])
for (i,j),v in np.ndenumerate(cm): plt.text(j,i,str(v),ha="center",va="center")
plt.title(f"Confusion Matrix (val @ F1={best_f1:.3f}, thr={best_t:.2f})")
plt.xlabel("Predicted"); plt.ylabel("True"); plt.tight_layout()
plt.savefig(FIG_DIR/"vit_confusion.png", dpi=200); plt.close()

plt.figure()
plt.plot(rec, prec)
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title(f"Precision–Recall (AP={ap:.3f})"); plt.grid(True); plt.tight_layout()
plt.savefig(FIG_DIR/"vit_pr.png", dpi=200); plt.close()

with open(OUT_ROOT/"vit_val_metrics.json","w") as f:
    json.dump({
        "AP": float(ap), "F1_best": float(best_f1), "thr_best": float(best_t),
        "cm_best": {"tn": int(best_cm[0]), "fp": int(best_cm[1]),
                    "fn": int(best_cm[2]), "tp": int(best_cm[3])},
        "img_size": int(IMG_SIZE),
        "vit": {"patch": 16, "dim": 256, "depth": 6, "heads": 4, "mlp_dim": 512, "drop": 0.1}
    }, f, indent=2)

np.save(OUT_ROOT/"vit_val_probs.npy",  probs)
np.save(OUT_ROOT/"vit_val_labels.npy", y_true)

print("Saved → figs: vit_confusion.png, vit_pr.png | metrics: vit_val_metrics.json")


AP=0.992 | Best-F1=0.972 @ thr=0.40
CM @ best-F1 [TN FP FN TP]: (635, 7, 5, 205)
t=0.10  P=0.908  R=0.990  F1=0.948  TN=621 FP=21 FN=2 TP=208
t=0.15  P=0.920  R=0.981  F1=0.949  TN=624 FP=18 FN=4 TP=206
t=0.20  P=0.928  R=0.981  F1=0.954  TN=626 FP=16 FN=4 TP=206
t=0.25  P=0.945  R=0.981  F1=0.963  TN=630 FP=12 FN=4 TP=206
t=0.30  P=0.953  R=0.976  F1=0.965  TN=632 FP=10 FN=5 TP=205
t=0.40  P=0.967  R=0.976  F1=0.972  TN=635 FP=7 FN=5 TP=205
t=0.50  P=0.971  R=0.971  F1=0.971  TN=636 FP=6 FN=6 TP=204
Saved → figs: vit_confusion.png, vit_pr.png | metrics: vit_val_metrics.json


In [None]:
# FINAL TEST EVALUATION (scene-level, ViT)
import os, json, re
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, precision_recall_curve, average_precision_score

IMG_SIZE     = 128         
PATCH        = 16
DIM          = 256
DEPTH        = 6
HEADS        = 4
MLP_DIM      = 512
DROP         = 0.10
L2_REG       = 1e-6
WEIGHTS_PATH = Path("outputs/models/vit_final.weights.h5")
OUT_ROOT     = Path("outputs"); OUT_ROOT.mkdir(parents=True, exist_ok=True)
FIG_DIR      = OUT_ROOT / "figs"; FIG_DIR.mkdir(parents=True, exist_ok=True)
TEST_SCENES_TXT = OUT_ROOT / "test_scenes.txt"  

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
for g in tf.config.list_physical_devices("GPU"):
    try: tf.config.experimental.set_memory_growth(g, True)
    except Exception: pass

assert "df" in globals(), "`df` is missing. Run your parsing & split cells first."
assert {"path","label","scene"}.issubset(df.columns), "df must have: path, label, scene"

rng = np.random.default_rng(42)
if TEST_SCENES_TXT.exists():
    test_scenes = set(TEST_SCENES_TXT.read_text().splitlines())
else:
    all_scenes = sorted(df["scene"].unique())
    rng.shuffle(all_scenes)
    n_test = max(1, int(len(all_scenes) * 0.10))
    test_scenes = set(all_scenes[:n_test])
    TEST_SCENES_TXT.write_text("\n".join(sorted(test_scenes)))

df = df.copy()
df["split2"] = np.where(df["scene"].isin(test_scenes), "test", df.get("split", "train"))

print("Split counts:")
print(df["split2"].value_counts())
print(df.groupby(["split2", df["label"].map({0:"No-Ship",1:"Ship"})]).size().unstack(fill_value=0))


# Build test dataset

AUTOTUNE = tf.data.AUTOTUNE

def _decode_resize_fallback(path, label):
    img = tf.io.decode_png(tf.io.read_file(path), channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE), antialias=True)
    img = tf.cast(img, tf.float32) / 255.0           
    return img, tf.cast(label, tf.int32)

decode_fn = globals().get("decode_resize", _decode_resize_fallback)

test_df = df[df["split2"] == "test"].reset_index(drop=True)
assert len(test_df) > 0, "Empty test set — adjust your split size."

test_ds = (tf.data.Dataset
           .from_tensor_slices((test_df["path"].values, test_df["label"].values))
           .map(lambda p,l: decode_fn(p,l), num_parallel_calls=AUTOTUNE)
           .batch(64)
           .prefetch(AUTOTUNE))

from tensorflow.keras import layers, models, regularizers

def _build_vit_fallback(img_size=IMG_SIZE, patch=PATCH, dim=DIM, depth=DEPTH, heads=HEADS,
                        mlp_dim=MLP_DIM, drop=DROP, l2=L2_REG):
    assert img_size % patch == 0
    n_h = img_size // patch
    n_w = img_size // patch
    num_patches = n_h * n_w

    class AddClassTokenAndPos(layers.Layer):
        def __init__(self, num_patches, dim, **kwargs):
            super().__init__(**kwargs)
            self.num_patches = num_patches
            self.dim = dim
            self.cls_token = self.add_weight("cls_token", shape=(1,1,dim),
                                             initializer="zeros", trainable=True)
            self.pos_emb   = self.add_weight("pos_emb",   shape=(1,num_patches+1,dim),
                                             initializer="zeros", trainable=True)
        def call(self, tokens):
            B = tf.shape(tokens)[0]
            cls = tf.repeat(self.cls_token, repeats=B, axis=0)
            x = tf.concat([cls, tokens], axis=1)
            return x + self.pos_emb

    inputs = layers.Input((img_size, img_size, 3))
    x = layers.Conv2D(32, 3, padding="same", activation="relu")(inputs)
    x = layers.Conv2D(dim, kernel_size=patch, strides=patch, padding="valid")(x)  
    x = layers.Reshape((num_patches, dim))(x)
    x = AddClassTokenAndPos(num_patches=num_patches, dim=dim)(x)

    for _ in range(depth):
        h = layers.LayerNormalization(epsilon=1e-6)(x)
        h = layers.MultiHeadAttention(num_heads=heads, key_dim=dim, dropout=drop)(h, h)
        x = layers.Add()([x, layers.Dropout(drop)(h)])
        h = layers.LayerNormalization(epsilon=1e-6)(x)
        h = layers.Dense(mlp_dim, activation="gelu")(h)
        h = layers.Dropout(drop)(h)
        h = layers.Dense(dim)(h)
        x = layers.Add()([x, layers.Dropout(drop)(h)])

    x = layers.LayerNormalization(epsilon=1e-6)(x)
    cls = x[:, 0]
    cls = layers.Dropout(drop)(cls)
    logits = layers.Dense(1,
                          kernel_regularizer=regularizers.l2(l2),
                          bias_regularizer=regularizers.l2(l2))(cls)  
    return models.Model(inputs, logits, name="vit_binary")

build_vit = globals().get("build_vit", _build_vit_fallback)

assert WEIGHTS_PATH.exists(), f"Missing weights: {WEIGHTS_PATH}"
model = build_vit(IMG_SIZE, PATCH, DIM, DEPTH, HEADS, MLP_DIM, DROP, L2_REG)
model.load_weights(WEIGHTS_PATH)


# Predict on TEST and score

ys, yh = [], []
for x, y in test_ds:
    p = model.predict(x, verbose=0)   
    ys.append(y.numpy()); yh.append(p)

y_true = np.concatenate(ys).astype(int)
logits = np.concatenate(yh).ravel().astype(np.float32)
probs  = 1.0 / (1.0 + np.exp(-logits))
probs  = np.clip(probs, 1e-7, 1-1e-7)

ap = average_precision_score(y_true, probs)
prec, rec, thr = precision_recall_curve(y_true, probs)

ts = np.linspace(0.05, 0.50, 46)  
best_f1, best_t, best_cm = -1.0, 0.5, None
for t in ts:
    yhat = (probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0,1]).ravel()
    P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9)
    F1 = 2*P*R/(P+R+1e-9)
    if F1 > best_f1:
        best_f1, best_t, best_cm = F1, t, (tn, fp, fn, tp)

print(f"[TEST] AP={ap:.3f} | Best-F1={best_f1:.3f} @ thr={best_t:.2f}")
print("[TEST] CM @ best-F1 [TN FP FN TP]:", best_cm)
for t in [0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50]:
    yhat = (probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0,1]).ravel()
    P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9); F1 = 2*P*R/(P+R+1e-9)
    print(f"[TEST] t={t:.2f}  P={P:.3f}  R={R:.3f}  F1={F1:.3f}  TN={tn} FP={fp} FN={fn} TP={tp}")


# Save TEST figs & metrics
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

cm = np.array([[best_cm[0], best_cm[1]],
               [best_cm[2], best_cm[3]]], dtype=int)

plt.figure(figsize=(4,4))
plt.imshow(cm, cmap="Blues")
plt.xticks([0,1], ["No-Ship","Ship"]); plt.yticks([0,1], ["No-Ship","Ship"])
for (i,j),v in np.ndenumerate(cm): plt.text(j,i,str(v),ha="center",va="center")
plt.title(f"Confusion Matrix (TEST @ F1={best_f1:.3f}, thr={best_t:.2f})")
plt.xlabel("Predicted"); plt.ylabel("True"); plt.tight_layout()
plt.savefig(FIG_DIR/"vit_test_confusion.png", dpi=200); plt.close()

plt.figure()
plt.plot(rec, prec)
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title(f"Precision–Recall (TEST AP={ap:.3f})")
plt.grid(True); plt.tight_layout()
plt.savefig(FIG_DIR/"vit_test_pr.png", dpi=200); plt.close()

with open(OUT_ROOT/"vit_test_metrics.json","w") as f:
    json.dump({
        "AP": float(ap), "F1_best": float(best_f1), "thr_best": float(best_t),
        "cm_best": {"tn": int(best_cm[0]), "fp": int(best_cm[1]),
                    "fn": int(best_cm[2]), "tp": int(best_cm[3])},
        "img_size": int(IMG_SIZE),
        "vit": {"patch": PATCH, "dim": DIM, "depth": DEPTH, "heads": HEADS,
                "mlp_dim": MLP_DIM, "drop": DROP, "l2": L2_REG},
        "test_scenes_file": str(TEST_SCENES_TXT)
    }, f, indent=2)

np.save(OUT_ROOT/"vit_test_probs.npy",  probs)
np.save(OUT_ROOT/"vit_test_labels.npy", y_true)

print("Saved TEST → figs: vit_test_confusion.png, vit_test_pr.png | metrics: vit_test_metrics.json")
print("Test scenes stored in:", TEST_SCENES_TXT)


Split counts:
split2
train    2799
val       733
test      468
Name: count, dtype: int64
label   No-Ship  Ship
split2               
test        336   132
train      2111   688
val         553   180
[TEST] AP=0.999 | Best-F1=0.985 @ thr=0.21
[TEST] CM @ best-F1 [TN FP FN TP]: (332, 4, 0, 132)
[TEST] t=0.10  P=0.943  R=1.000  F1=0.971  TN=328 FP=8 FN=0 TP=132
[TEST] t=0.15  P=0.964  R=1.000  F1=0.981  TN=331 FP=5 FN=0 TP=132
[TEST] t=0.20  P=0.964  R=1.000  F1=0.981  TN=331 FP=5 FN=0 TP=132
[TEST] t=0.25  P=0.970  R=0.992  F1=0.981  TN=332 FP=4 FN=1 TP=131
[TEST] t=0.30  P=0.970  R=0.992  F1=0.981  TN=332 FP=4 FN=1 TP=131
[TEST] t=0.40  P=0.970  R=0.985  F1=0.977  TN=332 FP=4 FN=2 TP=130
[TEST] t=0.50  P=0.970  R=0.977  F1=0.974  TN=332 FP=4 FN=3 TP=129
Saved TEST → figs: vit_test_confusion.png, vit_test_pr.png | metrics: vit_test_metrics.json
Test scenes stored in: outputs\test_scenes.txt
