In [22]:
import AnomalyCLIP_lib
import torch
import argparse
import torch.nn.functional as F
from prompt_ensemble import AnomalyCLIP_PromptLearner
from loss import FocalLoss, BinaryDiceLoss
from utils import normalize
from dataset import Dataset
from logger import get_logger
from tqdm import tqdm

import os
import random
import numpy as np
from tabulate import tabulate
from utils import get_transform

def setup_seed(seed):
    """Seed the PyTorch, NumPy and Python RNGs and pin the cuDNN flags.

    Args:
        seed: Value handed unchanged to every seeding call below.

    Returns:
        None. The function only mutates global generator/backend state.
    """
    # Same call order as before: torch (CPU), torch (all CUDA devices),
    # NumPy's legacy global generator, then the stdlib `random` module.
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN: request deterministic mode and switch off benchmark mode.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

from visualization import visualizer

from metrics import image_level_metrics, pixel_level_metrics
from tqdm import tqdm
from scipy.ndimage import gaussian_filter

In [3]:
# Plain module-level copies of the evaluation settings for the "orange" run.
# The values match the argparse defaults declared in the next cell.

# Dataset root, relative to the working directory (same value as the
# --data_path default) so the notebook is not tied to one user's machine.
data_path = "./data/orange"
save_path = "./results/9_12_4_multiscale_orange"
checkpoint_path = "./checkpoints/9_12_4_multiscale/epoch_15.pth"
dataset = "orange"
features_list = [6, 12, 18, 24]
image_size = 518
depth = 9
n_ctx = 12
t_n_ctx = 4
feature_map_layer = [0, 1, 2, 3]
# NOTE: a later cell rebinds the name `metrics` to a per-class dict.
metrics = 'image-pixel-level'
seed = 111
sigma = 4

In [25]:
# Command-line style configuration. Inside the notebook the parser is run on an
# empty argument list, so `args` always carries the defaults declared here.
parser = argparse.ArgumentParser("AnomalyCLIP", add_help=True)
# paths
parser.add_argument("--data_path", type=str, default="./data/orange", help="path to test dataset")
parser.add_argument("--save_path", type=str, default='./results/9_12_4_multiscale_orange', help='path to save results')
parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/9_12_4_multiscale/epoch_15.pth', help='path to checkpoint')
# model
parser.add_argument("--dataset", type=str, default='orange', help="dataset name")
parser.add_argument("--features_list", type=int, nargs="+", default=[6, 12, 18, 24], help="features used")
parser.add_argument("--image_size", type=int, default=518, help="image size")
# depth / n_ctx / t_n_ctx feed the prompt-learner settings dict
# (learnable text embedding depth, prompt length, learnable text embedding length).
parser.add_argument("--depth", type=int, default=9, help="depth of the learnable text embeddings")
parser.add_argument("--n_ctx", type=int, default=12, help="prompt length")
parser.add_argument("--t_n_ctx", type=int, default=4, help="length of the learnable text embeddings")
# Only the first value is read by the evaluation loop: patch-feature indices
# below it are skipped when anomaly maps are built.
parser.add_argument("--feature_map_layer", type=int,  nargs="+", default=[0, 1, 2, 3], help="feature map indices; the first value is the lowest index used for anomaly maps")
parser.add_argument("--metrics", type=str, default='image-pixel-level')
parser.add_argument("--seed", type=int, default=111, help="random seed")
parser.add_argument("--sigma", type=int, default=4, help="sigma of the Gaussian filter applied to anomaly maps")

# args=[] keeps argparse from reading the Jupyter kernel's own sys.argv.
args = parser.parse_args(args=[])

In [26]:
# Logger rooted at the results directory taken from the parsed arguments.
logger = get_logger(args.save_path)

# Run on CUDA when PyTorch reports an available device, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [27]:
# Settings dict handed to both AnomalyCLIP_lib.load and the prompt learner.
# The "learnabel" spelling of the keys is kept exactly as-is; presumably the
# library looks them up under these names (verify before renaming).
AnomalyCLIP_parameters = {
    "Prompt_length": args.n_ctx,
    "learnabel_text_embedding_depth": args.depth,
    "learnabel_text_embedding_length": args.t_n_ctx,
}

In [28]:
# Load the ViT-L/14@336px backbone onto `device`; the second return value of
# load() is not needed here and is discarded.
model, _ = AnomalyCLIP_lib.load(
    "ViT-L/14@336px",
    device=device,
    design_details=AnomalyCLIP_parameters,
)
# Switch to evaluation mode. As the cell's last expression this also displays
# the module repr.
model.eval()

name ViT-L/14@336px
text_layer False
text_layer True


AnomalyCLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias

In [29]:
# Image and mask transforms derived from the parsed arguments.
preprocess, target_transform = get_transform(args)

# Test split wrapped in a loader that yields one sample at a time, in order.
test_data = Dataset(
    root=args.data_path,
    transform=preprocess,
    target_transform=target_transform,
    dataset_name=args.dataset,
)
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=1,
    shuffle=False,
)

# Object classes exposed by the dataset; later cells key their results on them.
obj_list = test_data.obj_list

In [30]:
# Display the object classes reported by the test dataset.
obj_list

['orange']

In [32]:
# Per-class accumulators filled by the evaluation loop (`results`) and
# per-class metric slots initialised to zero (`metrics`).
results, metrics = {}, {}
for obj in obj_list:
    # Each class gets its own fresh lists; key order matches the original cell.
    results[obj] = {
        key: [] for key in ('gt_sp', 'pr_sp', 'imgs_masks', 'anomaly_maps')
    }
    metrics[obj] = dict.fromkeys(
        ('pixel-auroc', 'pixel-aupro', 'image-auroc', 'image-ap'), 0
    )

In [33]:
# Construct the prompt learner against a CPU copy of the model, restore its
# weights from the checkpoint, then move both modules to the evaluation device.
prompt_learner = AnomalyCLIP_PromptLearner(model.to("cpu"), AnomalyCLIP_parameters)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a CUDA run still loads when the CPU fallback is in effect.
checkpoint = torch.load(args.checkpoint_path, map_location=device)
prompt_learner.load_state_dict(checkpoint["prompt_learner"])
prompt_learner.to(device)
model.to(device)
# Same DPAM_layer value that the evaluation loop passes to encode_image.
model.visual.DAPM_replace(DPAM_layer = 20)

Initializing class-specific contexts
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
single_para torch.Size([4, 768])
embedding_pos torch.Size([1, 77, 768])
tokenized_prompts shape torch.Size([1, 1, 77]) torch.Size([1, 1, 77])


In [34]:
# Run the prompt learner once with no class id; it returns the prompt
# embeddings, their tokenized form and the compound text prompts that the
# text encoder call in the next cell consumes.
prompts, tokenized_prompts, compound_prompts_text = prompt_learner(cls_id = None)

# Debug dump of all three return values.
print(f'***\n{prompts}\n{tokenized_prompts}\n{compound_prompts_text}\n *** \n')

***
tensor([[[ 0.0012,  0.0032,  0.0003,  ..., -0.0019,  0.0003,  0.0019],
         [-0.0187, -0.0407, -0.0552,  ..., -0.0271, -0.0147,  0.0269],
         [-0.0360,  0.0567, -0.0307,  ..., -0.0068,  0.0393,  0.0139],
         ...,
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214],
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214],
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214]],

        [[ 0.0012,  0.0032,  0.0003,  ..., -0.0019,  0.0003,  0.0019],
         [ 0.0244,  0.1180,  0.0415,  ..., -0.0288,  0.0463,  0.0488],
         [ 0.0578,  0.0428,  0.0656,  ...,  0.0013,  0.0128,  0.0003],
         ...,
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214],
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214],
         [-0.0015,  0.0360,  0.0223,  ...,  0.0148,  0.0045, -0.0214]]],
       device='cuda:0', grad_fn=<CatBackward0>)
tensor([[49406,   343,   343,   343,   343,   343,   343,   343,

In [35]:
# Encode the learned prompts as float32 text features.
encoded_text = model.encode_text_learn(prompts, tokenized_prompts, compound_prompts_text).float()

# Split the encoder output into two halves along dim 0 and stack them on a
# new dim 1, so index 0 / index 1 of that dim select the two prompt groups.
text_feature_halves = torch.chunk(encoded_text, dim = 0, chunks = 2)
text_features = torch.stack(text_feature_halves, dim = 1)

# L2-normalise along the feature dimension.
text_feature_norm = text_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_feature_norm
print(text_features)

tensor([[[ 0.0411, -0.0154,  0.0404,  ..., -0.0621, -0.0004, -0.0997],
         [-0.0425,  0.0133, -0.0352,  ...,  0.0522,  0.0033,  0.0958]]],
       device='cuda:0', grad_fn=<DivBackward0>)


In [37]:
# Inspect text_features with its last two dimensions swapped, i.e. the same
# permuted view the evaluation loop multiplies the image features by.
print(text_features.permute(0, 2, 1))

tensor([[[ 0.0411, -0.0425],
         [-0.0154,  0.0133],
         [ 0.0404, -0.0352],
         ...,
         [-0.0621,  0.0522],
         [-0.0004,  0.0033],
         [-0.0997,  0.0958]]], device='cuda:0', grad_fn=<PermuteBackward0>)


In [47]:
# Evaluation loop: for every test batch compute an image-level anomaly score
# from the global image feature and a pixel-level anomaly map from the patch
# features, and accumulate both (plus ground truth) in `results`.
# `visualizer` comes from the notebook's import cell.
for idx, items in enumerate(tqdm(test_dataloader)):
    image = items['img'].to(device)
    cls_name = items['cls_name']
    cls_id = items['cls_id']
    gt_mask = items['img_mask']
    # Binarise the ground-truth mask in place with a 0.5 threshold.
    gt_mask[gt_mask > 0.5], gt_mask[gt_mask <= 0.5] = 1, 0
    results[cls_name[0]]['imgs_masks'].append(gt_mask)  # px
    results[cls_name[0]]['gt_sp'].extend(items['anomaly'].detach().cpu())

    with torch.no_grad():
        # Layer selection is read from `args`, like every other setting in this
        # loop, instead of from a separate module-level `features_list`.
        image_features, patch_features = model.encode_image(image, args.features_list, DPAM_layer = 20)
        print(f'\n *** \n image_features: {image_features} \n patch_features:{patch_features} \n *** \n')
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        print(f'\n *** \n image_features: {image_features} \n *** \n')
        
        # Similarity of the normalised image feature to both text features,
        # turned into probabilities with a softmax at temperature 0.07.
        text_probs = image_features @ text_features.permute(0, 2, 1)
        print(f'\n *** \n text_probs 1 : {text_probs} \n *** \n')
        text_probs = (text_probs/0.07).softmax(-1)
        print(f'\n *** \n text_probs 2 : {text_probs} \n *** \n')
        text_probs = text_probs[:, 0, 1] # anomaly score = P(g_a, x)
        print(f'\n *** \n text_probs 3 : {text_probs} \n *** \n')
        anomaly_map_list = []
        # Per-pixel anomaly score. The index is named `layer_idx` so it does not
        # overwrite the batch index `idx` that the debug limiter below checks.
        for layer_idx, patch_feature in enumerate(patch_features):
            if layer_idx >= args.feature_map_layer[0]:
                patch_feature = patch_feature/ patch_feature.norm(dim = -1, keepdim = True)
                similarity, _ = AnomalyCLIP_lib.compute_similarity(patch_feature, text_features[0])
                # similarity[:, 1:, :] drops the first token before the map is
                # built (presumably the class token; confirm in AnomalyCLIP_lib).
                similarity_map = AnomalyCLIP_lib.get_similarity_map(similarity[:, 1:, :], args.image_size)
                # Average of the index-1 score and the complement of the index-0 score.
                anomaly_map = (similarity_map[...,1] + 1 - similarity_map[...,0])/2.0
                anomaly_map_list.append(anomaly_map)

        # Sum the per-layer maps into one map per image.
        anomaly_map = torch.stack(anomaly_map_list)
        print(f'\n *** \n anomaly_map: {anomaly_map} \n *** \n')
        anomaly_map = anomaly_map.sum(dim = 0)
        print(f'\n *** \n anomaly_map_sum: {anomaly_map} \n *** \n')
        results[cls_name[0]]['pr_sp'].extend(text_probs.detach().cpu())
        # Gaussian-smooth each entry along the first dimension on the CPU.
        anomaly_map = torch.stack([torch.from_numpy(gaussian_filter(i, sigma = args.sigma)) for i in anomaly_map.detach().cpu()], dim = 0 )
        results[cls_name[0]]['anomaly_maps'].append(anomaly_map)
        print(f'\n *** \n results: {results} \n *** \n')
        # NOTE(review): the recorded run of this cell ended in
        # "TypeError: join() argument must be str ... not 'list'". `cls_name`
        # and items['img_path'] are batched values here; confirm which argument
        # visualizer joins into a path and pass it a single element if needed.
        visualizer(items['img_path'], anomaly_map.detach().cpu().numpy(), args.image_size, args.save_path, cls_name, results[cls_name[0]]['pr_sp'])
    # Debug limiter: stop once the second batch (idx == 1) has been processed.
    if idx !=0 :
        break


  0%|          | 0/99 [00:00<?, ?it/s]


 *** 
 image_features: tensor([[ 5.9967e-03,  7.5851e-01,  2.6046e-01,  4.7619e-02, -3.7095e-01,
         -4.8167e-02,  3.1094e-02, -2.2805e-02,  1.8683e-01,  4.3864e-02,
          1.9436e-01, -3.7653e-01, -6.5187e-02,  5.2986e-01,  1.0068e-01,
         -1.2386e-01, -4.1698e-01, -2.3728e-01,  3.4110e-01, -2.7846e-01,
          3.9896e-02,  1.0191e-01,  1.2274e-01,  2.1334e-01, -5.7332e-01,
         -1.1138e-01, -2.6434e-01, -4.1776e-01,  1.9270e-01, -3.4563e-02,
          2.2431e-01, -1.7430e-01, -3.1606e-01, -3.8469e-02,  1.7556e-01,
          3.6981e-01, -6.6115e-02, -9.8012e-02,  2.1256e-01, -1.6106e-02,
          8.4564e-02, -7.2324e-02,  3.2877e-01, -4.4942e-01, -2.7108e-01,
          1.6757e-01, -2.6343e-02,  3.2838e-01,  8.7209e-02, -3.6387e-02,
          1.6057e-01,  6.5648e-02,  1.7208e-01,  8.2174e-02, -3.5652e-01,
         -7.0663e-02,  1.4999e-01, -5.2479e-01,  4.3431e-01, -1.4094e-01,
         -5.7409e-02, -7.8857e-02, -5.9329e-02,  5.4463e-02,  4.8163e-01,
          2.97




TypeError: join() argument must be str, bytes, or os.PathLike object, not 'list'

In [40]:
# List which accumulators exist for the 'orange' class.
print(results['orange'].keys())

dict_keys(['gt_sp', 'pr_sp', 'imgs_masks', 'anomaly_maps'])


In [41]:
# Compare the stored ground-truth image labels with the predicted image-level
# anomaly scores for the 'orange' class.
print(results['orange']['gt_sp'], results['orange']['pr_sp'])

[tensor(1)] [tensor(0.9661)]
