In [41]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
data_dir_path = "/content/drive/MyDrive/instseg/data/multimodal_mapped_annotations_0_test.json"

In [43]:
!pip install open_clip_torch==2.26.1



In [44]:
import argparse
import json
import torch
import PIL.Image
import open_clip
import sklearn.metrics
from tqdm import tqdm
from typing import List, Tuple

In [45]:
def prepare_text_prompts(path: str) -> Tuple[List[str], dict]:
    with open(path) as f:
        data = json.load(f)
    categories = {category["id"]: category["name"] for category in data["categories"]}
    return list(categories.values()), data

In [46]:
def configure_model(
    model_id: str,
) -> Tuple[
    open_clip.CLIP, open_clip.transform.Compose, open_clip.tokenizer.SimpleTokenizer
]:
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_id, pretrained="laion2b_s34b_b79k"
    )
    model.eval()
    tokenizer = open_clip.get_tokenizer(model_id)
    return model, preprocess, tokenizer

In [47]:
def inference(
    data: dict,
    model: open_clip.CLIP,
    preprocess: open_clip.tokenizer.SimpleTokenizer,
    text: torch.LongTensor,
):
    ground_truths = []
    predictions = []

    images = data["images"]

    for annotation in tqdm(data["annotations"]):
        image_id = annotation["image_id"]
        image_path = f"/content/drive/MyDrive/instseg/data/images/{next((image for image in images if image['id'] == image_id), None)['file_name']}"
        image_label = annotation["category_id"]

        image = preprocess(PIL.Image.open(image_path).convert("RGB")).unsqueeze(0)

        with torch.no_grad(), torch.cuda.amp.autocast():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            predicted_label = text_probs.argmax(dim=-1).item()

        ground_truths.append(image_label)
        predictions.append(predicted_label)

    return ground_truths, predictions

In [48]:
def calculate_metrics(true_labels: List[int], predicted_labels: List[int]):
    accuracy = sklearn.metrics.accuracy_score(true_labels, predicted_labels) * 100
    precision = sklearn.metrics.precision_score(true_labels, predicted_labels, average='weighted') * 100
    recall = sklearn.metrics.recall_score(true_labels, predicted_labels, average='weighted') * 100
    f1 = sklearn.metrics.f1_score(true_labels, predicted_labels, average='weighted') * 100

    print(f"\nAccuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.2f}%")
    print(f"Recall: {recall:.2f}%")
    print(f"F1 Score: {f1:.2f}%")

In [49]:
def main() -> None:
    text_prompts, taco_data = prepare_text_prompts(data_dir_path)
    model, preprocess, tokenizer = configure_model("ViT-B-32")
    text = tokenizer(text_prompts)
    ground_truths, predictions = inference(taco_data, model, preprocess, text)
    calculate_metrics(true_labels=ground_truths, predicted_labels=predictions)

In [50]:
main()

  checkpoint = torch.load(checkpoint_path, map_location=map_location)
  with torch.no_grad(), torch.cuda.amp.autocast():
100%|██████████| 569/569 [03:30<00:00,  2.71it/s]


Accuracy: 6.85%
Precision: 8.68%
Recall: 6.85%
F1 Score: 6.99%



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
