### Recyclability Classifier, based on OpenAI's CLIP Model
Source: https://openai.com/research/clip

In [2]:
from common import *
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import torch
import clip
import numpy as np

# Load the pickled dataset (helper and path are presumably provided by
# `from common import *`).
df_dataset = load_from_pickle(dataset_file)

# Stratified 80/20 split on the material class.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# the third value is the training labels, so it must be bound to `y_train`
# rather than to `text_prompt` (which is assigned the tokenised prompts below).
x_train, x_test, y_train, y_test = train_test_split(
    df_dataset['File'],
    df_dataset['Material Class'],
    test_size=0.2,
    stratify=df_dataset['Material Class'],
    random_state=1234,
)

# The zero-shot run iterates over every file of the dataset, not just a split.
zs_x = df_dataset['File']
zs_y = df_dataset['Material Class']

# Initialise material classes
material_classes = list(material_class_mapping.values())

# Preparations for model
device = "cuda" if torch.cuda.is_available() else "cpu"

# One prompt per material class, in the same order as `material_classes`, so a
# row index into the tokenised prompts is also an index into `material_classes`.
text_prompt = clip.tokenize(
    [f"a photo of an object made of {c}" for c in material_classes]
).to(device)

In [3]:
# Compare all available models: load each CLIP checkpoint and print its size
# and input/text specifications. No prediction is made in this cell, so no
# prediction buffers are allocated here.
for current_model in clip.available_models():
    # Get model and its specifications
    model, preprocess = clip.load(current_model, device=device, download_root=MODEL_FOLDER)
    input_resolution = model.visual.input_resolution
    context_length = model.context_length
    vocab_size = model.vocab_size

    # numel() is the element count of each parameter tensor.
    parameter_count = sum(p.numel() for p in model.parameters())

    print("Model:", current_model)
    print("Model parameters:", f"{parameter_count:,}")
    print("Input resolution:", input_resolution)
    print("Context length:", context_length)
    print("Vocab size:", vocab_size, end='\n\n')

    # Drop the checkpoint before the next one is loaded so two models are
    # never resident at the same time.
    del model, preprocess
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Model: RN50
Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408

Model: RN101
Model parameters: 119,688,033
Input resolution: 224
Context length: 77
Vocab size: 49408

Model: RN50x4
Model parameters: 178,300,601
Input resolution: 288
Context length: 77
Vocab size: 49408

Model: RN50x16
Model parameters: 290,979,217
Input resolution: 384
Context length: 77
Vocab size: 49408

Model: RN50x64
Model parameters: 623,258,305
Input resolution: 448
Context length: 77
Vocab size: 49408

Model: ViT-B/32
Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408

Model: ViT-B/16
Model parameters: 149,620,737
Input resolution: 224
Context length: 77
Vocab size: 49408

Model: ViT-L/14
Model parameters: 427,616,513
Input resolution: 224
Context length: 77
Vocab size: 49408

Model: ViT-L/14@336px
Model parameters: 427,944,193
Input resolution: 336
Context length: 77
Vocab size: 49408



In [None]:
# Initialise predictions
y_test_pred = [0] * len(y_test)  # placeholder only; not populated in this cell
# Filled below with one predicted class index per entry of `zs_x`, in order.
zs_y_pred = []

# Zero-shot learning: classify every image by comparing its CLIP embedding
# with the embeddings of the per-material text prompts.
model, preprocess = clip.load("ViT-B/32", device=device, download_root=MODEL_FOLDER)

with torch.no_grad():
    # The prompts are the same for every image, so encode and normalise them
    # once instead of once per image.
    text_features = model.encode_text(text_prompt)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    for current_image in zs_x:
        # Initialise image; the context manager closes the file handle once
        # preprocessing has produced the tensor.
        with Image.open(current_image) as raw_image:
            image = preprocess(raw_image).unsqueeze(0).to(device)

        # Classify image's material type
        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Add result to predictions. The value is an index into
        # `material_classes` (the prompts were built in that order).
        # NOTE(review): confirm `zs_y` uses the same integer encoding before
        # scoring these predictions against it.
        zs_y_pred.append(int(similarity[0].argmax()))

In [None]:
# Metric report on example data: `y_train` holds the true class labels and
# `y_pred` the predicted class labels.
y_train = [0, 1, 2, 2, 0, 1]
y_pred = [0, 1, 1, 2, 1, 0]

# Compute every metric first ...
cm = confusion_matrix(y_train, y_pred)
accuracy = accuracy_score(y_train, y_pred)
# Precision, recall and F1 are macro-averaged over the classes.
precision, recall, f1 = (
    scorer(y_train, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)
class_report = classification_report(y_train, y_pred)

# ... then print the report in a fixed order.
print("Confusion Matrix:", cm, sep="\n")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Classification Report:", class_report, sep="\n")