In [1]:
# Mount Google Drive at /content/drive. The annotation file path configured
# in the next cell lives under this mount point, so this must run first.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Path to the annotation JSON file. Despite the "dir" in the name, this points
# at a single file, not a directory; main() passes it to prepare_text_prompts().
data_dir_path = "/content/drive/MyDrive/instseg/data/mapped_annotations_0_test.json"

In [3]:
# Use the %pip magic (rather than !pip) so the package is installed into the
# environment of the running kernel; the version stays pinned for reproducibility.
%pip install open_clip_torch==2.26.1

Collecting open_clip_torch==2.26.1
  Downloading open_clip_torch-2.26.1-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch==2.26.1)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting timm (from open_clip_torch==2.26.1)
  Downloading timm-1.0.8-py3-none-any.whl.metadata (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9.0->open_clip_torch==2.26.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9.0->open_clip_torch==2.26.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9.0->open_clip

In [4]:
import argparse
import json
import torch
import PIL.Image
import open_clip
import sklearn.metrics
from tqdm import tqdm
from typing import List, Tuple

In [5]:
def prepare_text_prompts(path: str) -> Tuple[List[str], dict]:
    """Load an annotation JSON file and derive the class-name text prompts.

    Args:
        path: Path to a JSON file containing a top-level ``"categories"`` list
            whose entries have ``"id"`` and ``"name"`` keys.

    Returns:
        A ``(prompts, data)`` tuple where ``prompts`` is the list of category
        names in the order their ids first appear in the file, and ``data`` is
        the full parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``"categories"`` or a category's ``"id"``/``"name"`` is missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which may mangle or reject non-ASCII category names.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # Keyed by id so that a repeated id collapses to a single prompt.
    categories = {category["id"]: category["name"] for category in data["categories"]}
    return list(categories.values()), data

In [6]:
def configure_model(
    model_id: str,
    pretrained: str = "laion2b_s34b_b79k",
) -> Tuple[
    open_clip.CLIP, open_clip.transform.Compose, open_clip.tokenizer.SimpleTokenizer
]:
    """Create an open_clip model in eval mode with its transform and tokenizer.

    Args:
        model_id: open_clip architecture name (e.g. ``"ViT-B-32"``), used for
            both the model and the tokenizer lookup.
        pretrained: Pretrained-weights tag passed to
            ``open_clip.create_model_and_transforms``. Defaults to the tag
            this notebook was originally run with.

    Returns:
        A ``(model, preprocess, tokenizer)`` tuple. ``preprocess`` is the third
        value returned by ``create_model_and_transforms``; the second value is
        discarded.
    """
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_id, pretrained=pretrained
    )
    # Switch to evaluation mode; this function is only used for inference.
    model.eval()
    tokenizer = open_clip.get_tokenizer(model_id)
    return model, preprocess, tokenizer

In [7]:
def inference(
    data: dict,
    model: open_clip.CLIP,
    preprocess: open_clip.transform.Compose,
    text: torch.LongTensor,
    image_dir: str = "/content/drive/MyDrive/instseg/data/images",
) -> Tuple[List[int], List[int]]:
    """Classify the image of every annotation against the tokenized prompts.

    Args:
        data: Parsed annotation document with ``"images"`` (entries with
            ``"id"`` and ``"file_name"``) and ``"annotations"`` (entries with
            ``"image_id"`` and ``"category_id"``). ``"categories"`` is used,
            when present, to translate a predicted prompt index back into a
            category id.
        model: Model exposing ``encode_image`` and ``encode_text``.
        preprocess: Image transform applied to each RGB PIL image.
        text: Tokenized prompts, one row per class. Assumed to be built from
            ``data["categories"]`` in file order (as ``prepare_text_prompts``
            does) -- confirm if prompts come from elsewhere.
        image_dir: Directory that the images' ``file_name`` values are
            relative to.

    Returns:
        ``(ground_truths, predictions)``: two parallel lists of category ids,
        one entry per annotation.

    Raises:
        KeyError: If an annotation references an image id that is not listed
            in ``data["images"]``.
    """
    # Index images once instead of scanning the whole list per annotation.
    # setdefault keeps the first entry for a repeated id, like a linear scan.
    images_by_id = {}
    for image in data["images"]:
        images_by_id.setdefault(image["id"], image)

    # Prompt i corresponds to the i-th distinct category id in the file, so an
    # argmax index must be mapped back to that id before it can be compared
    # with the ground-truth ``category_id``. If the prompt count does not
    # match the category count, the prompts came from somewhere else and the
    # raw index is returned unchanged.
    category_ids = list(
        dict.fromkeys(category["id"] for category in data.get("categories", []))
    )
    if len(category_ids) != len(text):
        category_ids = None

    # Run inputs on whatever device the caller placed the model on.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    text = text.to(device)

    image_root = image_dir.rstrip("/")
    ground_truths = []
    predictions = []

    # Text embeddings do not depend on the image: compute them once.
    with torch.no_grad(), torch.cuda.amp.autocast():
        text_features = model.encode_text(text)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    for annotation in tqdm(data["annotations"]):
        image_id = annotation["image_id"]
        if image_id not in images_by_id:
            raise KeyError(f"annotation references unknown image id {image_id!r}")
        image_path = f"{image_root}/{images_by_id[image_id]['file_name']}"
        image_label = annotation["category_id"]

        # Close the file handle as soon as the pixels have been converted.
        with PIL.Image.open(image_path) as raw_image:
            image = preprocess(raw_image.convert("RGB")).unsqueeze(0).to(device)

        with torch.no_grad(), torch.cuda.amp.autocast():
            image_features = model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            predicted_index = text_probs.argmax(dim=-1).item()

        predicted_label = (
            category_ids[predicted_index] if category_ids is not None else predicted_index
        )

        ground_truths.append(image_label)
        predictions.append(predicted_label)

    return ground_truths, predictions

In [8]:
def calculate_metrics(true_labels: List[int], predicted_labels: List[int]) -> float:
    """Print and return the classification accuracy.

    Args:
        true_labels: Ground-truth label per sample.
        predicted_labels: Predicted label per sample, parallel to ``true_labels``.

    Returns:
        The accuracy as a fraction (the printed value is this times 100).
    """
    accuracy = sklearn.metrics.accuracy_score(true_labels, predicted_labels)
    print(f"\nAccuracy: {accuracy * 100:.2f}%")
    # Return the value as well so callers can log or compare it programmatically.
    return accuracy

In [9]:
def main() -> None:
    """Run the zero-shot evaluation end to end and print the accuracy.

    Loads the annotation file at ``data_dir_path``, builds the ViT-B-32 model,
    tokenizes the category names, classifies every annotation and reports the
    resulting accuracy.
    """
    class_names, dataset = prepare_text_prompts(data_dir_path)
    clip_model, image_transform, tokenize = configure_model("ViT-B-32")
    prompt_tokens = tokenize(class_names)
    y_true, y_pred = inference(dataset, clip_model, image_transform, prompt_tokens)
    calculate_metrics(true_labels=y_true, predicted_labels=y_pred)

In [11]:
# Run the end-to-end evaluation; the accuracy is printed by calculate_metrics().
main()

100%|██████████| 569/569 [03:30<00:00,  2.70it/s]

Accuracy: 18.98%



