### Recyclability Classifier, based on OpenAI's CLIP Model
Source: https://openai.com/research/clip

In [1]:
from common import *
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, matthews_corrcoef, cohen_kappa_score, hamming_loss
import torch
import clip
import numpy as np

# Load the pickled dataset; the loader and the path both come from `common`.
df_dataset = load_from_pickle(dataset_file)
dataset_files = df_dataset['File']
dataset_labels = df_dataset['Material Class']

# Stratified 10% hold-out used only for zero-shot evaluation.
# The 90% remainder (_x_train / _y_train) is not used in this cell.
_x_train, zs_x, _y_train, zs_y = train_test_split(
    dataset_files,
    dataset_labels,
    test_size=0.1,
    stratify=dataset_labels,
    random_state=9876,
)

# Independent stratified 80/20 train-test split over the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    dataset_files,
    dataset_labels,
    test_size=0.2,
    stratify=dataset_labels,
    random_state=1234,
)

# Lower-cased class names, in the iteration order of material_class_mapping.
material_classes = [class_name.lower() for class_name in material_class_mapping.values()]
# The first class gets a descriptive phrase for the text prompt
# (presumably the catch-all "other" class - confirm against material_class_mapping).
material_classes[0] = "anything other than paper, plastic, glass, or metal"

# Model preparation: pick the device and tokenise one prompt per class.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.tokenize accepts a list of strings and returns one row of tokens per
# string, so a single batched call replaces concatenating per-class calls.
text_prompt = clip.tokenize(
    [f"photo of an object made of {c}" for c in material_classes]
).to(device)

In [2]:
""" Zero Shot Classification """
# Compare the zero-shot performance of every available CLIP model on the
# held-out subset (zs_x / zs_y). For each model: print its specifications,
# predict one class per image, then print the evaluation metrics.
# NOTE(review): the predicted value is an index into material_classes, so this
# assumes the labels in zs_y are those same integer positions - confirm.

# Ground truth does not depend on the model, so materialise it once.
list_zs_y = list(zs_y)

for current_model in clip.available_models():
    # Initialise model
    model, preprocess = clip.load(current_model, device=device, download_root=MODEL_FOLDER)

    # Get model specifications
    input_resolution = model.visual.input_resolution
    context_length = model.context_length
    vocab_size = model.vocab_size

    print("Model:", current_model)
    print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
    print("Input resolution:", input_resolution)
    print("Context length:", context_length)
    print("Vocab size:", vocab_size, end='\n\n')

    # The text prompts are the same for every image, so encode and
    # L2-normalise them once per model instead of once per image.
    with torch.no_grad():
        text_features = model.encode_text(text_prompt)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Initialise predictions
    zs_y_pred = []
    for current_image in zs_x:
        # Open the image in a context manager so the file handle is released
        # as soon as preprocessing has turned it into a tensor.
        with Image.open(current_image) as pil_image:
            image = preprocess(pil_image).unsqueeze(0).to(device)

        # Classify image's material type
        with torch.no_grad():
            image_features = model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            # Scaled cosine similarity against every class prompt, as probabilities
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Add result to predictions: index of the most similar class prompt
        zs_y_pred.append(int(similarity[0].argmax()))

    # Get model performance

    # Accuracy
    accuracy = accuracy_score(list_zs_y, zs_y_pred)
    print("Accuracy:", accuracy)

    # Precision
    precision = precision_score(list_zs_y, zs_y_pred, average='macro')
    print("Precision:", precision)

    # Recall
    recall = recall_score(list_zs_y, zs_y_pred, average='macro')
    print("Recall:", recall)

    # F1 Score
    f1 = f1_score(list_zs_y, zs_y_pred, average='macro')
    print("F1 Score:", f1)

    # Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(list_zs_y, zs_y_pred)
    print("Matthews Correlation Coefficient (MCC):", mcc)

    # Cohen's Kappa
    kappa = cohen_kappa_score(list_zs_y, zs_y_pred)
    print("Cohen's Kappa:", kappa)

    # Hamming Loss
    hamming_loss_val = hamming_loss(list_zs_y, zs_y_pred)
    print("Hamming Loss:", hamming_loss_val, end='\n\n')

    # Confusion matrix
    cm = confusion_matrix(list_zs_y, zs_y_pred)
    print("Confusion Matrix:")
    print(cm, end="\n\n")

    # Classification Report
    class_report = classification_report(list_zs_y, zs_y_pred)
    print("Classification Report:")
    print(class_report, end="\n\n\n")

Model: RN50
Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408

Accuracy: 0.459983498349835
Precision: 0.5364220929770019
Recall: 0.38003848168448484
F1 Score: 0.3680587218509033
Matthews Correlation Coefficient (MCC): 0.25340113747196713
Cohen's Kappa: 0.23193098306528914
Hamming Loss: 0.540016501650165

Confusion Matrix:
[[698 122  77   1  25]
 [227 173  51   0   6]
 [127   5 159   7   0]
 [216   1  61  42   2]
 [267  26  83   5  43]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.76      0.57       923
           1       0.53      0.38      0.44       457
           2       0.37      0.53      0.44       298
           3       0.76      0.13      0.22       322
           4       0.57      0.10      0.17       424

    accuracy                           0.46      2424
   macro avg       0.54      0.38      0.37      2424
weighted avg       0.52      0.46      0.41      2424



Model: