In [3]:
import tensorflow as tf
from transformers import ViTFeatureExtractor, TFViTForImageClassification
from PIL import Image
import requests
import numpy as np


In [2]:

# Checkpoint identifier shared by the preprocessor and the classifier
model_name = "google/vit-base-patch16-224"

# Build the image preprocessor and the TensorFlow ViT classifier from the
# same checkpoint name so their configurations come from one source.
# NOTE(review): newer transformers releases deprecate ViTFeatureExtractor in
# favour of ViTImageProcessor — confirm the installed version before switching.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = TFViTForImageClassification.from_pretrained(model_name)


All PyTorch model weights were used when initializing TFViTForImageClassification.

All the weights of TFViTForImageClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTForImageClassification for predictions without further training.


In [7]:

# Load the image from disk.
# - Image.open is lazy and keeps the file open; the `with` block releases the
#   handle as soon as the pixel data has been decoded by convert().
# - convert("RGB") forces three channels so grayscale, palette, CMYK, or RGBA
#   files do not reach the preprocessor in an unexpected mode.
with Image.open("cat.jpg") as image_file:
    image = image_file.convert("RGB")

# Preprocess the image: resize, normalize, and prepare for the model
# (return_tensors="tf" asks for TensorFlow tensors)
inputs = feature_extractor(images=image, return_tensors="tf")
pixel_values = inputs["pixel_values"]  # The preprocessed image


In [8]:

# Forward pass; the returned object exposes the per-class scores as `.logits`
outputs = model(pixel_values)
logits = outputs.logits

# Take the arg-max over the last axis, then read the first entry
# (assumes a single image was passed in — TODO confirm for batched use)
predicted_class = tf.math.argmax(logits, axis=-1)[0].numpy()
print(f"Predicted class ID: {predicted_class}")

# Look up the label for that ID in the model config's id2label mapping
class_names = model.config.id2label
print(f"Predicted class: {class_names[predicted_class]}")


Predicted class ID: 281
Predicted class: tabby, tabby cat
