# Zero-shot classification example in nnabla CLIP

In [None]:
#@title Set up environment in Colab (you can skip this if you have done it.)
!pip install nnabla-ext-cuda114
!pip install -U --no-deps numpy scipy
!git clone https://github.com/sony/nnabla-examples/
CLIP_ROOT = 'nnabla-examples/vision-and-language/clip'
!cd {CLIP_ROOT} && pip install -r requirements.txt
!mkdir -p data
!cd {CLIP_ROOT} && if [ ! -f data/ViT-L-14.h5 ] ; then curl -o data/ViT-L-14.h5 https://nnabla.org/pretrained-models/nnabla-examples/vision-and-language/clip/ViT-L-14.h5; fi

## Load CLIP pretrained model

In [1]:
cd nnabla-examples/vision-and-language/clip

/content/nnabla-examples/vision-and-language/clip


In [2]:
import nnabla as nn
from  nnabla.ext_utils import get_extension_context
nn.set_default_context(get_extension_context('cudnn'))

In [3]:
from clip import CLIP
from demo_zero_shot_classification import download_image
clip = CLIP(model_path='data/ViT-L-14.h5')

## Run zero-shot classification


In [4]:
#@markdown Provide an image url as input.
import IPython
image_url = 'https://images.pexels.com/photos/5682847/pexels-photo-5682847.jpeg?auto=compress&cs=tinysrgb&w=600' #@param {type:"string"}
IPython.display.Image(url=image_url, height=256)

In [5]:
# Define zero-shot categories.
categories = [f'A photo of a person with {color} hair' for color in ['black', 'white', 'brown', 'brond', 'red', 'blue', 'green', 'pink', 'orange', 'yellow']]
for c in categories:
  print(c)

A photo of a person with black hair
A photo of a person with white hair
A photo of a person with brown hair
A photo of a person with brond hair
A photo of a person with red hair
A photo of a person with blue hair
A photo of a person with green hair
A photo of a person with pink hair
A photo of a person with orange hair
A photo of a person with yellow hair


In [6]:
# Run zero-shot classification for the image and the specified categories above.
image_bytes = download_image(image_url, return_bytes=True)
with nn.auto_forward(), nn.no_grad():
  probs = clip.run(image_bytes, categories)

for cate, prob in sorted(zip(categories, probs), key=lambda x: x[1], reverse=True):
  print(f'{cate}: {prob * 100:.2f}')



A photo of a person with white hair: 98.05
A photo of a person with yellow hair: 1.10
A photo of a person with blue hair: 0.43
A photo of a person with brond hair: 0.30
A photo of a person with green hair: 0.09
A photo of a person with pink hair: 0.03
A photo of a person with black hair: 0.00
A photo of a person with brown hair: 0.00
A photo of a person with orange hair: 0.00
A photo of a person with red hair: 0.00


## Step-by-step execution

In [7]:
# Preprocess image and text
image_bytes.seek(0)
with nn.auto_forward(), nn.no_grad():
  image = clip.preprocess_image(image_bytes)
  tokens = clip.preprocess_texts(categories)
print(f'Image shape: {image.shape}, tokens shape: {tokens.shape}')
print(f'First category "{categories[0]}" is tokenized as:\n{tokens.d[0]}')

Image shape: (1, 3, 224, 224), tokens shape: (10, 77)
First category "A photo of a person with black hair" is tokenized as:
[49406   320  1125   539   320  2533   593  1449  2225 49407     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


In [8]:
# Encode image and text by pre-trained CLIP encoders
with nn.auto_forward(), nn.no_grad():
  image_features = clip.encode_image(image)
  text_features = clip.encode_text(tokens)
print(f'image_features shape: {image_features.shape}, text_features shape: {text_features.shape}') 

image_features shape: (1, 768), text_features shape: (10, 768)


In [9]:
# Compute similarity (normalized by softmax) between the image and each of categories
with nn.auto_forward(), nn.no_grad():
  probs = clip.probabilities(image_features, text_features)
print(f' probs shape: {probs.shape}')

 probs shape: (1, 10)


In [10]:
for cate, prob in sorted(zip(categories, probs.d[0]), key=lambda x: x[1], reverse=True):
  print(f'{cate}: {prob * 100:.2f}')

A photo of a person with white hair: 98.05
A photo of a person with yellow hair: 1.10
A photo of a person with blue hair: 0.43
A photo of a person with brond hair: 0.30
A photo of a person with green hair: 0.09
A photo of a person with pink hair: 0.03
A photo of a person with black hair: 0.00
A photo of a person with brown hair: 0.00
A photo of a person with orange hair: 0.00
A photo of a person with red hair: 0.00
