### Recyclability Classifier, based on OpenAI's CLIP Model
Source: https://openai.com/research/clip

In [1]:
from common import *
from PIL import Image
import torch
import clip
import numpy as np

# Initialise material classes
material_classes = list(material_class_mapping.values())

# Initialise model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, download_root=MODEL_FOLDER)

In [2]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
# Compare all available models
for current_model in clip.available_models():
    # Get model and its specifications
    model, preprocess = clip.load(current_model, device=device, download_root=MODEL_FOLDER)
    input_resolution = model.visual.input_resolution
    context_length = model.context_length
    vocab_size = model.vocab_size

    print("Model:", current_model)
    print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
    print("Input resolution:", input_resolution)
    print("Context length:", context_length)
    print("Vocab size:", vocab_size, end='\n\n')

100%|███████████████████████████████████████| 244M/244M [00:50<00:00, 5.06MiB/s]


Model: RN50
Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 278M/278M [02:00<00:00, 2.43MiB/s]


Model: RN101
Model parameters: 119,688,033
Input resolution: 224
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 402M/402M [03:27<00:00, 2.03MiB/s]


Model: RN50x4
Model parameters: 178,300,601
Input resolution: 288
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 630M/630M [00:56<00:00, 11.8MiB/s]


Model: RN50x16
Model parameters: 290,979,217
Input resolution: 384
Context length: 77
Vocab size: 49408



100%|█████████████████████████████████████| 1.26G/1.26G [14:45<00:00, 1.53MiB/s]


Model: RN50x64
Model parameters: 623,258,305
Input resolution: 448
Context length: 77
Vocab size: 49408

Model: ViT-B/32
Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 335M/335M [00:15<00:00, 22.7MiB/s]


Model: ViT-B/16
Model parameters: 149,620,737
Input resolution: 224
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 890M/890M [00:39<00:00, 23.8MiB/s]


Model: ViT-L/14
Model parameters: 427,616,513
Input resolution: 224
Context length: 77
Vocab size: 49408



100%|███████████████████████████████████████| 891M/891M [00:55<00:00, 16.7MiB/s]


Model: ViT-L/14@336px
Model parameters: 427,944,193
Input resolution: 336
Context length: 77
Vocab size: 49408



In [5]:
image = preprocess(Image.open("dataset/waste_dataset/Paper/bandicam 2019-11-05 23-40-47-671.jpg")).unsqueeze(0).to(device)
text = torch.cat([clip.tokenize(f"a photo of an object made of {c}") for c in material_classes]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(len(material_classes))

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{material_classes[index]:>16s}: {100 * value.item():.2f}%")


Top predictions:

         Plastic: 49.51%
           Paper: 34.50%
          Others: 9.77%
           Metal: 5.91%
           Glass: 0.31%
