1. Set the path passed to `sys.path.append` to your project directory.

2. Set `Config.dataset` to the name of the dataset you want to process.

3. To use the ViT-L architecture, set `Config.arch='ViT-L/14'` and `Config.f_classifier='./cache/vocabulary_classifier_L.pth'`.

In [None]:
import sys
sys.path.append('/home/sheng/sheng-eatamath/S3A')

import os
import json
import re
import time
import pickle
from tqdm import tqdm
from copy import deepcopy
import random
import numpy as np
from nltk.corpus import wordnet as wn

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms.functional import InterpolationMode
import model as clip
from data.build_dataset import build_transform
from data.imagenet_datasets import get_datasets_rzsc, Vocab
from data.vocab import get_vocab

In [None]:
class Config:
    """Settings shared by the cells of this notebook (read through ``args``)."""

    ### --- model ---
    ### architecture name and target device handed to clip.load
    arch = 'ViT-B/16'
    device = 'cuda:1'
    ### optional checkpoint path; load_clip only restores weights when this is set
    clip_checkpoint = None
    ### precomputed 21k CLIP vocabulary classifier
    f_classifier = './cache/vocabulary_classifier.pth'

    ### --- data ---
    ### dataset name (also used in the names of the cached output files)
    dataset = 'imagenet'
    ### set num of sampled classes for ImageNet-100
    n_sampled_classes = 100
    seed = 0
    use_def = False

    ### --- preprocessing ---
    ### side length used by the resize and center-crop transforms
    input_size = 224
    batch_size = 512


### shared settings instance consumed by the remaining cells
args = Config()

In [None]:
def load_clip(args):
    """Build the CLIP model named by ``args.arch`` and optionally restore weights.

    Args:
        args: configuration object providing ``arch``, ``device`` and
            ``clip_checkpoint`` (a falsy ``clip_checkpoint`` skips the restore).

    Returns:
        The model, moved to ``args.device`` and switched to eval mode.
    """
    model = clip.load(args.arch, device=args.device)
    if args.clip_checkpoint:
        prefix = 'model.'
        checkpoint = torch.load(args.clip_checkpoint, map_location='cpu')
        ### keep only the entries stored under the 'model.' prefix and strip it;
        ### slicing every key blindly would mangle un-prefixed names, and
        ### strict=False would silently hide the resulting mismatch
        state_dict = {
            k[len(prefix):]: v
            for k, v in checkpoint['model'].items()
            if k.startswith(prefix)
        }
        model.load_state_dict(state_dict, strict=False)
    model.to(args.device).eval()

    ### short summary of the loaded model
    n_params = sum(p.numel() for p in model.parameters())
    print("Model parameters:", f"{n_params:,}")
    print("Input resolution:", model.visual.input_resolution)
    print("Context length:", model.context_length)
    print("Vocab size:", model.vocab_size)
    return model

### vocabulary object built by data.vocab.get_vocab; handed to get_datasets_rzsc when the dataset is created
vocab = get_vocab()


In [None]:
### per-channel mean / std used to normalise the input images
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)

### load dataset
### preprocessing: bicubic resize to input_size, center crop to input_size x input_size,
### tensor conversion, then per-channel normalisation
transform_f = transforms.Compose([
    transforms.Resize(args.input_size, interpolation=InterpolationMode.BICUBIC),
    transforms.CenterCrop(args.input_size),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=torch.tensor(mean),
        std=torch.tensor(std))
])
### pass the configured seed rather than a hard-coded 0 so that Config.seed is honoured
### (the default value is still 0)
dataset = get_datasets_rzsc(args, vocab, is_train=True, transform=transform_f, seed=args.seed)
### NOTE(review): the batch size is fixed at 256 here and Config.batch_size is not consulted - confirm intent
### shuffle=False keeps the sample order fixed across runs
loader_val = torch.utils.data.DataLoader(dataset, num_workers=4, batch_size=256, shuffle=False)
print('dataset size', len(dataset))

model = load_clip(args)

In [None]:
amp_autocast = torch.cuda.amp.autocast

### per-batch results, concatenated once the whole loader has been consumed
all_vfeatures, all_clu_label = [], []
model.eval()
for batch in tqdm(loader_val, total=len(loader_val)):
    ### only the images and label_clu are used in this cell
    images, _label_voc, label_clu, _idx_img = batch
    images = images.to(args.device)
    ### inference only: autocast enabled, gradient tracking disabled
    with amp_autocast(), torch.no_grad():
        feats = model.visual.extract_features(images)
        ### rescale every feature vector to unit norm along the last dimension
        feats = feats / feats.norm(dim=-1, keepdim=True)
    all_vfeatures.append(deepcopy(feats.cpu().numpy()))
    all_clu_label.append(deepcopy(label_clu.numpy()))

all_vfeatures = np.concatenate(all_vfeatures)
all_clu_label = np.concatenate(all_clu_label)

In [None]:
### cache the extracted features on disk
f_vfeatures = f'./cache/features/vfeatures-{args.dataset}.npy'
### create the target directory first: np.save raises if it does not exist
os.makedirs(os.path.dirname(f_vfeatures), exist_ok=True)
np.save(f_vfeatures, all_vfeatures)

In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from my_util_package.evaluate import cluster_acc

### number of clusters = num_classes of the dataset, or of the dataset it wraps when a
### `.dataset` attribute is present
if hasattr(dataset, 'dataset'):
    K = dataset.dataset.num_classes
else:
    K = dataset.num_classes
print(f'K={K}')
print(np.unique(all_clu_label).shape)

### cluster the cached features; random_state is fixed so repeated runs give the same result
kmeans = KMeans(n_clusters=K, random_state=0, n_init=17, max_iter=1000, verbose=0)
kmeans.fit(all_vfeatures)
preds = kmeans.labels_

In [None]:
### score the k-means assignments against the label_clu values collected from the dataset
acc_clu = cluster_acc(all_clu_label, preds)
print(f'cluster acc={acc_clu}')

f_cluster = f'./cache/cluster/kmeans-{args.dataset}.pth'
### create the target directory first: np.save raises if it does not exist
os.makedirs(os.path.dirname(f_cluster), exist_ok=True)
### NOTE: np.save appends '.npy' when the name does not already end with it, so the file
### written to disk is 'kmeans-<dataset>.pth.npy'; the path is kept as-is for compatibility
np.save(f_cluster, preds)