<a href="https://www.kaggle.com/code/magnusmakgasane/animetagger?scriptVersionId=289558985" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# =========================================================
# WD14 / WD1.4 ANIME TAGGER (pretrained) + input-gradient saliency heatmap (Keras 3 fix)
# =========================================================

!pip -q install huggingface_hub opencv-python

import os, csv
import numpy as np
import cv2
import tensorflow as tf
import keras
from huggingface_hub import hf_hub_download
from PIL import Image
import matplotlib.pyplot as plt

# -----------------------------
# 1) Model repository, local model folder, and the image to tag
# -----------------------------
REPO_ID = "SmilingWolf/wd-v1-4-convnext-tagger-v2"
MODEL_DIR = "/kaggle/working/wd14_model"
IMG_PATH = "/kaggle/input/sasuke-img/Sasuke_Part_1.webp"

# -----------------------------
# 2) Local layout for the download (SavedModel files + tags CSV)
# -----------------------------
var_dir = os.path.join(MODEL_DIR, "variables")
# Creating the nested folder also creates MODEL_DIR when it is missing.
os.makedirs(var_dir, exist_ok=True)

# Files fetched into MODEL_DIR itself ...
files_root = [
    "saved_model.pb",
    "keras_metadata.pb",
    "selected_tags.csv",
]
# ... and files fetched into its "variables" subfolder.
files_vars = [
    "variables.data-00000-of-00001",
    "variables.index",
]

def dl(repo_id, filename, local_path, subfolder=None):
    """Download one file from a Hugging Face repo and place it at ``local_path``.

    ``hf_hub_download`` is asked to download into the folder that contains
    ``local_path``. Whenever the path it reports differs from ``local_path``
    (e.g. if the hub client nests the file under ``subfolder``), the file is
    moved into place.

    Args:
        repo_id: Repository identifier passed to ``hf_hub_download``.
        filename: Name of the file inside the repository (or ``subfolder``).
        local_path: Final location of the file on disk.
        subfolder: Optional repository subfolder that holds ``filename``.

    Returns:
        ``local_path``, for convenience.
    """
    target_dir = os.path.dirname(local_path) or "."
    # Make the helper usable on its own: os.replace needs the destination
    # folder to exist.
    os.makedirs(target_dir, exist_ok=True)
    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=target_dir,
    )
    # Compare normalised paths so an equivalent spelling of the same location
    # (relative vs. absolute, redundant separators) does not trigger a move.
    if os.path.abspath(path) != os.path.abspath(local_path):
        os.replace(path, local_path)
    return local_path

# One download plan: (file name, destination folder, repository subfolder).
download_plan = [(name, MODEL_DIR, None) for name in files_root]
download_plan += [(name, var_dir, "variables") for name in files_vars]

for f, target_dir, repo_subfolder in download_plan:
    dl(REPO_ID, f, os.path.join(target_dir, f), subfolder=repo_subfolder)

print("Model files ready at:", MODEL_DIR)

# -----------------------------
# 3) Read tags
# -----------------------------
# Column layout used below: row[1] = tag name, row[2] = category
# (0 general, 4 character). Both lists follow the row order of the CSV and
# are later indexed with the model's output index.
tags = []
cats = []
# newline="" is what the csv module expects, so quoted fields are parsed intact.
with open(os.path.join(MODEL_DIR, "selected_tags.csv"), "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (tolerates an empty file)
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines
        tags.append(row[1])   # name
        cats.append(row[2])   # category

# -----------------------------
# 4) Keras 3: expose the SavedModel through TFSMLayer (endpoint auto-detected)
# -----------------------------
# List the signatures the SavedModel exports.
loaded_sm = tf.saved_model.load(MODEL_DIR)
endpoints = list(loaded_sm.signatures.keys())
print("Available endpoints:", endpoints)

# Prefer 'serving_default'; otherwise fall back to the first listed signature.
if "serving_default" in endpoints:
    endpoint = "serving_default"
else:
    endpoint = endpoints[0]
print("Using endpoint:", endpoint)

# Side length of the square input the model is built for (448x448x3).
IMAGE_SIZE = 448

# Build a functional Keras model around the chosen SavedModel endpoint.
layer = keras.layers.TFSMLayer(MODEL_DIR, call_endpoint=endpoint)
inp = keras.Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3), dtype=tf.float32)
out = layer(inp)
# The endpoint may hand back a dict of named outputs; keep the first value.
out = list(out.values())[0] if isinstance(out, dict) else out
model = keras.Model(inp, out)

# -----------------------------
# 5) Preprocess image (WD14 expects BGR, padded square, 448, float32 0..255)
# -----------------------------
def preprocess_wd14(pil_img, target_size=None):
    """Convert a PIL image into the array layout fed to the WD14 tagger.

    Steps: flatten transparency onto white, RGB -> BGR, pad to a centred
    square with white, resize to ``target_size`` and cast to float32
    (values stay in 0..255).

    Args:
        pil_img: ``PIL.Image.Image`` in any mode.
        target_size: Output side length in pixels. Defaults to the
            module-level ``IMAGE_SIZE``.

    Returns:
        ``numpy.ndarray`` of shape ``(target_size, target_size, 3)``, dtype
        float32, channels in BGR order.
    """
    if target_size is None:
        target_size = IMAGE_SIZE

    # A plain convert("RGB") drops the alpha channel and exposes whatever
    # colour sits under transparent pixels (typically black). Composite onto
    # white first so transparent areas match the white padding added below.
    # Fully opaque images come out unchanged.
    rgba = pil_img.convert("RGBA")
    canvas = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
    canvas.alpha_composite(rgba)
    img = np.array(canvas.convert("RGB"))
    img = img[:, :, ::-1]  # RGB -> BGR

    # Pad the shorter side equally on both ends (extra pixel goes last).
    height, width = img.shape[:2]
    size = max(height, width)
    pad_x = size - width
    pad_y = size - height
    pad_l = pad_x // 2
    pad_t = pad_y // 2
    img = np.pad(
        img,
        ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)),
        mode="constant",
        constant_values=255,
    )

    # INTER_AREA when shrinking, Lanczos otherwise.
    interp = cv2.INTER_AREA if size > target_size else cv2.INTER_LANCZOS4
    img = cv2.resize(img, (target_size, target_size), interpolation=interp)
    return img.astype(np.float32)

# Load the image and build the batched model input.
pil = Image.open(IMG_PATH)
x_bgr = preprocess_wd14(pil)   # BGR float32 array
x = x_bgr[np.newaxis, ...]     # prepend the batch axis

# Preview what the model receives (channels flipped back to RGB for display).
preview_rgb = x_bgr[..., ::-1].astype(np.uint8)
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(preview_rgb)
ax.axis("off")
ax.set_title("WD14 input (square padded)")
plt.show()

# -----------------------------
# 6) Run the tagger and report the strongest tags
# -----------------------------
probs = model(x, training=False).numpy()[0]

# Outputs before this index are skipped (WD14 models often put 4 rating
# outputs first).
start = 4

GENERAL_THRESH = 0.35
CHAR_THRESH = 0.50

# Collect (display name, probability) pairs per category.
general, chars = [], []
for i, raw_p in enumerate(probs[start:], start=start):
    p = float(raw_p)
    name = tags[i].replace("_", " ")
    cat = cats[i]
    if cat == "0":
        if p >= GENERAL_THRESH:
            general.append((name, p))
    elif cat == "4":
        if p >= CHAR_THRESH:
            chars.append((name, p))

# Highest probability first.
general = sorted(general, key=lambda pair: pair[1], reverse=True)
chars = sorted(chars, key=lambda pair: pair[1], reverse=True)

# (heading, ranked pairs, how many to print)
report_sections = (
    ("=== Character tags ===", chars, 15),
    ("\n=== General tags ===", general, 20),
)
for heading, ranked, limit in report_sections:
    print(heading)
    for name, p in ranked[:limit]:
        print(f"{name:30s} {p:.3f}")

# Strongest non-skipped output; used as the target of the heatmap below.
top_idx = start + int(np.argmax(probs[start:]))
top_tag = tags[top_idx].replace("_", " ")
top_p = float(probs[top_idx])
print(f"\nTop tag for Grad-CAM: {top_tag}  (p={top_p:.3f})")

# -----------------------------
# 7) Saliency heatmap for the top tag
#    TFSMLayer is opaque, so there is no inner conv layer to hook for a
#    classic Grad-CAM. Instead, take the gradient of the tag score with
#    respect to the input pixels and display its magnitude.
# -----------------------------
x_tf = tf.convert_to_tensor(x)

with tf.GradientTape() as tape:
    tape.watch(x_tf)  # the input is a plain tensor, so it must be watched explicitly
    y = model(x_tf, training=False)  # (1, num_labels)
    score = y[:, top_idx]

# Gradient magnitude per pixel, averaged over the colour channels.
grads = tape.gradient(score, x_tf)[0]
grads = tf.reduce_mean(tf.abs(grads), axis=-1)
heatmap = grads.numpy()
peak = heatmap.max()
heatmap = heatmap / (peak + 1e-8)  # scale so the maximum is ~1

# Blend a JET-coloured heatmap over the (RGB) model input.
base_rgb = x_bgr[:, :, ::-1].astype(np.uint8)
heat_u8 = (heatmap * 255).astype(np.uint8)
heat_color = cv2.cvtColor(
    cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET),
    cv2.COLOR_BGR2RGB,
)
overlay = cv2.addWeighted(base_rgb, 0.55, heat_color, 0.45, 0)

# (image, title, extra imshow keyword arguments) for each panel.
panels = (
    (base_rgb, "Input to tagger", {}),
    (heatmap, "Saliency heatmap", {"cmap": "jet"}),
    (overlay, f"Overlay: {top_tag}", {}),
)
fig, axes = plt.subplots(1, 3, figsize=(10, 4))
for ax, (picture, caption, imshow_kwargs) in zip(axes, panels):
    ax.imshow(picture, **imshow_kwargs)
    ax.axis("off")
    ax.set_title(caption)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Tidy table of per-tag probabilities (the first 4 rating outputs are left out)
start = 4
tag_names = [raw.replace("_"," ") for raw in tags[start:]]
df = pd.DataFrame({
    "tag": tag_names,
    "prob": probs[start:],
    "cat": list(cats[start:]),
})

# The 25 most probable tags overall
top = df.sort_values(by="prob", ascending=False).iloc[:25]

display(top)

# Horizontal bars, reversed so the strongest tag ends up at the top
bars = top.iloc[::-1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(bars["tag"], bars["prob"])
ax.set_title("Top 25 tags (WD14)")
ax.set_xlabel("probability")
plt.show()


In [None]:
# Pick the most probable character tag (cat == "4") and draw a smoothed
# input-gradient saliency map for it.
start = 4
char_indices = [i for i in range(start, len(probs)) if cats[i] == "4"]
if not char_indices:
    # No probability threshold is applied above, so an empty list means the
    # tag list simply contains no character tags.
    print("No character tags found in the tag list.")
else:
    target_index = max(char_indices, key=lambda i: probs[i])
    target_tag = tags[target_index].replace("_"," ")
    print("Heatmap target:", target_tag, "p=", float(probs[target_index]))

    x_tf = tf.convert_to_tensor(x)
    with tf.GradientTape() as tape:
        tape.watch(x_tf)  # plain tensors are not tracked unless watched
        y = model(x_tf, training=False)
        score = y[:, target_index]

    grads = tape.gradient(score, x_tf)
    if grads is None:
        # Fail with a readable message instead of "'NoneType' is not subscriptable".
        raise RuntimeError(
            "No gradient flows from the model output back to the input; "
            "cannot build a saliency map."
        )
    # Gradient magnitude per pixel, averaged over the colour channels.
    grads = tf.reduce_mean(tf.abs(grads[0]), axis=-1)
    heatmap = grads.numpy()
    heatmap = heatmap / (heatmap.max() + 1e-8)

    # smooth it (looks way nicer); kernel size (0,0) lets OpenCV derive it from sigma
    heatmap = cv2.GaussianBlur(heatmap, (0,0), sigmaX=2.0)
    heatmap = heatmap / (heatmap.max() + 1e-8)  # blurring lowers the peak, rescale

    # Blend a JET-coloured heatmap over the (RGB) model input.
    base_rgb = x_bgr[:, :, ::-1].astype(np.uint8)
    heat_u8 = (heatmap * 255).astype(np.uint8)
    heat_color = cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET)
    heat_color = cv2.cvtColor(heat_color, cv2.COLOR_BGR2RGB)
    overlay = cv2.addWeighted(base_rgb, 0.55, heat_color, 0.45, 0)

    plt.figure(figsize=(10,4))
    plt.subplot(1,3,1); plt.imshow(base_rgb); plt.axis("off"); plt.title("Input")
    plt.subplot(1,3,2); plt.imshow(heatmap, cmap="jet"); plt.axis("off"); plt.title("Smoothed heatmap")
    plt.subplot(1,3,3); plt.imshow(overlay); plt.axis("off"); plt.title(f"Overlay: {target_tag}")
    plt.show()


In [None]:
import os, glob
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Folder that is searched recursively for images
DATA_DIR = "/kaggle/input/sasuke-img"  # change to your anime dataset folder

# Glob patterns of the image types to pick up
exts = ("*.jpg","*.jpeg","*.png","*.webp","*.bmp")

# Every match of every pattern, in sorted order
paths = sorted(
    hit
    for pattern in exts
    for hit in glob.glob(os.path.join(DATA_DIR, "**", pattern), recursive=True)
)
print("Found images:", len(paths))

def embed_one(path, n_skip=4):
    """Return an L2-normalised vector of tag probabilities for one image.

    The image is preprocessed with ``preprocess_wd14``, run through the
    module-level ``model``, and the outputs after the first ``n_skip``
    entries are used as an "embedding".

    Args:
        path: Path of the image file to embed.
        n_skip: Number of leading model outputs to drop (the notebook skips
            the first 4, which it treats as rating outputs).

    Returns:
        1-D ``numpy.ndarray`` scaled to (approximately) unit length.
    """
    # Close the file handle promptly: this runs once per image in a folder.
    with Image.open(path) as pil:
        xbgr = preprocess_wd14(pil)
    xx = np.expand_dims(xbgr, 0)  # add the batch axis
    p = model(xx, training=False).numpy()[0]
    vec = p[n_skip:]
    # The epsilon guards against division by zero for an all-zero vector.
    vec = vec / (np.linalg.norm(vec) + 1e-8)
    return vec

# pick a query image (index 0 here)
if not paths:
    # Fail with a readable message instead of a bare IndexError on paths[0].
    raise FileNotFoundError(f"No images found under {DATA_DIR}")
query_path = paths[0]
qvec = embed_one(query_path)

# Similarity of every image to the query: dot product of the normalised
# vectors returned by embed_one.
sims = []
for pth in paths:
    # Reuse the query embedding instead of running the model on it twice.
    v = qvec if pth == query_path else embed_one(pth)
    sims.append((float(v @ qvec), pth))

sims.sort(reverse=True, key=lambda t: t[0])

print("Query:", query_path)
for s, pth in sims[:10]:
    print(f"{s:.3f}  {pth}")

# show top-6 similar
plt.figure(figsize=(12,6))
for i, (s, pth) in enumerate(sims[:6]):
    plt.subplot(2,3,i+1)
    # Close each file once its pixels have been handed to matplotlib.
    with Image.open(pth) as im:
        plt.imshow(im.convert("RGB"))
    plt.title(f"sim={s:.3f}")
    plt.axis("off")
plt.show()


In [None]:
!pip -q install transformers sentencepiece

from transformers import pipeline
from PIL import Image

# Image to caption (same file path that the tagger cell uses).
IMG_PATH = "/kaggle/input/sasuke-img/Sasuke_Part_1.webp"

# Image-to-text pipeline backed by the BLIP base captioning checkpoint.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# The pipeline is given the RGB version of the image; print its raw output.
caption_input = Image.open(IMG_PATH).convert("RGB")
out = captioner(caption_input)
print(out)
