# Evaluating your SAE

Code based off Rob Graham's ([themachinefan](https://github.com/themachinefan)) SAE evaluation code.

In [1]:
import os
os.getcwd()

'/workspace/ViT-Prisma/src/vit_prisma/sae/evals'

In [2]:
import einops
import torch
import torchvision

import plotly.express as px

from tqdm import tqdm

import numpy as np
import os
import requests

# Setup

In [3]:
from dataclasses import dataclass
from vit_prisma.sae.config import VisionModelSAERunnerConfig


selected_sae = "sparse-autoencoder-clip-b-32-sae-vanilla-x64-layer-11-hook_resid_post-l1-0.0001"

@dataclass
class EvalConfig(VisionModelSAERunnerConfig):
    # sparse-autoencoder-clip-b-32-sae-vanilla-x64-layer-10-hook_mlp_out-l1-0.0001
    # sparse-autoencoder-clip-b-32-sae-vanilla-x64-layer-11-hook_resid_post-l1-0.0001
    # sparse-autoencoder-clip-b-32-sae-vanilla-x64-layer-0-hook_mlp_out-l1-0.0001
    sae_path: str = f'/workspace/sae_checkpoints/{selected_sae}/n_images_2600058.pt'
    model_name: str = "open-clip:laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K"
    model_type: str =  "clip"
    patch_size: str = 32

    dataset_path = "/workspace"
    dataset_train_path: str = "/workspace/ILSVRC/Data/CLS-LOC/train"
    dataset_val_path: str = "/workspace/ILSVRC/Data/CLS-LOC/val"

    verbose: bool = True

    device: bool = 'cuda'

    eval_max: int = 50_000 # 50_000
    batch_size: int = 32
        
#     hook_point_layer: int = 11
    # make the max image output folder a subfolder of the sae path


    @property
    def max_image_output_folder(self) -> str:
        # Get the base directory of sae_checkpoints
        sae_base_dir = os.path.dirname(os.path.dirname(self.sae_path))
        
        # Get the name of the original SAE checkpoint folder
        sae_folder_name = os.path.basename(os.path.dirname(self.sae_path))
        
        # Create a new folder path in sae_checkpoints/images with the original name
        output_folder = os.path.join(sae_base_dir, 'max_images', sae_folder_name)
        output_folder = os.path.join(output_folder, f"layer_{self.hook_point_layer}") # Add layer number

        
        # Ensure the directory exists
        os.makedirs(output_folder, exist_ok=True)
        
        return output_folder

cfg = EvalConfig()

n_tokens_per_buffer (millions): 0.032
Lower bound: n_contexts_per_buffer (millions): 0.00064
Total training steps: 158691
Total training images: 13000000
Total wandb updates: 15869
Expansion factor: 16
n_tokens_per_feature_sampling_window (millions): 204.8
n_tokens_per_dead_feature_window (millions): 1024.0
Using Ghost Grads.
We will reset the sparsity calculation 158 times.
Number tokens in sparsity calculation window: 4.10e+06
Gradient clipping with max_norm=1.0
Using SAE initialization method: encoder_transpose_decoder


In [4]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7469cd294bb0>

## Load model

In [5]:
from vit_prisma.models.base_vit import HookedViT

model_name = "open-clip:laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K"
model = HookedViT.from_pretrained(model_name, is_timm=False, is_clip=True).to(cfg.device)
 

model_id download_pretrained_from_hf: laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K
Official model name open-clip:laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K
Converting OpenCLIP weights
model_id download_pretrained_from_hf: laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K
visual projection shape torch.Size([768, 512])
Setting center_writing_weights to False for OpenCLIP
Setting fold_ln to False for OpenCLIP
Loaded pretrained model open-clip:laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K into HookedTransformer


## Load datasets

In [6]:
import importlib
import vit_prisma
# importlib.reload(vit_prisma.dataloaders.imagenet_dataset)

In [7]:
# load dataset
import open_clip
from vit_prisma.utils.data_utils.imagenet_utils import setup_imagenet_paths
from vit_prisma.dataloaders.imagenet_dataset import get_imagenet_transforms_clip, ImageNetValidationDataset

from torchvision import transforms
from transformers import CLIPProcessor

og_model_name = "hf-hub:laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K"
og_model, _, preproc = open_clip.create_model_and_transforms(og_model_name)
processor = preproc

size=224

data_transforms = transforms.Compose([
    transforms.Resize((size, size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                     std=[0.26862954, 0.26130258, 0.27577711]),
])
    
imagenet_paths = setup_imagenet_paths(cfg.dataset_path)
imagenet_paths["train"] = "/workspace/ILSVRC/Data/CLS-LOC/train"
imagenet_paths["val"] = "/workspace/ILSVRC/Data/CLS-LOC/val"
imagenet_paths["val_labels"] = "/workspace/LOC_val_solution.csv"
imagenet_paths["label_strings"] = "/workspace/LOC_synset_mapping.txt"
print()
train_data = torchvision.datasets.ImageFolder(cfg.dataset_train_path, transform=data_transforms)
val_data = ImageNetValidationDataset(cfg.dataset_val_path, 
                                imagenet_paths['label_strings'], 
                                imagenet_paths['val_labels'], 
                                data_transforms,
                                return_index=True,
)
val_data_visualize = ImageNetValidationDataset(cfg.dataset_val_path, 
                                imagenet_paths['label_strings'], 
                                imagenet_paths['val_labels'],
                                torchvision.transforms.Compose([
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),]), return_index=True)

print(f"Validation data length: {len(val_data)}") if cfg.verbose else None



Validation data length: 50000


In [8]:
from vit_prisma.sae.training.activations_store import VisionActivationsStore
# import dataloader
from torch.utils.data import DataLoader

# activations_loader = VisionActivationsStore(cfg, model, train_data, eval_dataset=val_data)
val_dataloader = DataLoader(val_data, batch_size=cfg.batch_size, shuffle=False, num_workers=4)


## Load pretrained SAE to evaluate

In [9]:
from vit_prisma.sae.sae import SparseAutoencoder
sparse_autoencoder = SparseAutoencoder(cfg).load_from_pretrained(f"/workspace/sae_checkpoints/{selected_sae}/n_images_2600058.pt")
sparse_autoencoder.to(cfg.device)
sparse_autoencoder.eval()  # prevents error if we're expecting a dead neuron mask for who 


get_activation_fn received: activation_fn=relu, kwargs={}
n_tokens_per_buffer (millions): 0.032
Lower bound: n_contexts_per_buffer (millions): 0.00064
Total training steps: 158691
Total training images: 13000000
Total wandb updates: 1586
Expansion factor: 64
n_tokens_per_feature_sampling_window (millions): 204.8
n_tokens_per_dead_feature_window (millions): 1024.0
Using Ghost Grads.
We will reset the sparsity calculation 158 times.
Number tokens in sparsity calculation window: 4.10e+06
Gradient clipping with max_norm=1.0
Using SAE initialization method: encoder_transpose_decoder
get_activation_fn received: activation_fn=relu, kwargs={}


SparseAutoencoder(
  (hook_sae_in): HookPoint()
  (hook_hidden_pre): HookPoint()
  (hook_hidden_post): HookPoint()
  (hook_sae_out): HookPoint()
  (activation_fn): ReLU()
)

## Clip Labeling AutoInterp

In [10]:
# all_imagenet_class_names

In [11]:
from vit_prisma.dataloaders.imagenet_dataset import get_imagenet_index_to_name
ind_to_name = get_imagenet_index_to_name()

all_imagenet_class_names = []
for i in range(len(ind_to_name)):
    all_imagenet_class_names.append(ind_to_name[str(i)][1])

In [12]:
cfg.max_image_output_folder

'/workspace/sae_checkpoints/max_images/sparse-autoencoder-clip-b-32-sae-vanilla-x64-layer-11-hook_resid_post-l1-0.0001/layer_9'

## Feature steering

In [13]:
def steering_hook_fn_cls_only(
    activations, cfg, hook, sae, steering_indices, steering_strength=1.0, mean_ablation_values=None, include_error=False

):
    sae.to(activations.device)


    sae_input = activations.clone()
    sae_output, feature_activations, *data = sae(sae_input)
    
    steered_feature_activations = feature_activations.clone()
    
    # batch, stream, feats
    # cls token is *last* in sequence
    steered_feature_activations[:, 0, steering_indices] = steering_strength

    steered_sae_out = einops.einsum(
                steered_feature_activations,
                sae.W_dec,
                "... d_sae, d_sae d_in -> ... d_in",
            ) + sae.b_dec

    steered_sae_out = sae.run_time_activation_norm_fn_out(steered_sae_out)
    
    # print(f"steering norm: {(steered_sae_out - sae_output).norm()}")
    
    

    if include_error:
        error = sae_input - sae_output
        # print(f"error.norm(): {error.norm()}")
        return steered_sae_out + error
    return steered_sae_out


def steering_hook_fn(
    activations, cfg, hook, sae, steering_indices, steering_strength=1.0, mean_ablation_values=None, include_error=False

):
    sae.to(activations.device)


    sae_input = activations.clone()
    sae_output, feature_activations, *data = sae(sae_input)
    
    steered_feature_activations = feature_activations.clone()
    
    steered_feature_activations[:, :, steering_indices] = steering_strength

    steered_sae_out = einops.einsum(
                steered_feature_activations,
                sae.W_dec,
                "... d_sae, d_sae d_in -> ... d_in",
            ) + sae.b_dec

    steered_sae_out = sae.run_time_activation_norm_fn_out(steered_sae_out)
    
    # print(f"steering norm: {(steered_sae_out - sae_output).norm()}")
    
    

    if include_error:
        error = sae_input - sae_output
        # print(f"error.norm(): {error.norm()}")
        return steered_sae_out + error
    return steered_sae_out

In [33]:
random_feat_idxs = np.random.randint(0, high=sparse_autoencoder.b_enc.shape[0], size=(50))
random_feat_idxs[0] = 655
random_feat_idxs[1] = 656
random_feat_idxs[2] = 665
random_feat_idxs[3] = 2541

In [34]:
# for a given feature, set it high/low on maxim activ. imgs and high/low on non-activ images
# hook SAE and replace desired feature with 0 or 1 
from typing import List, Dict, Tuple
import torch
import einops
from tqdm import tqdm

from functools import partial

@torch.no_grad()
def compute_feature_activations_set_feat(
    images: torch.Tensor,
    model: torch.nn.Module,
    sparse_autoencoder: torch.nn.Module,
    encoder_weights: torch.Tensor,
    encoder_biases: torch.Tensor,
    feature_ids: List[int],
    feature_categories: List[str],
    top_k: int = 10,
    steering_strength: float = 10.0
):
    """
    Compute the highest activating tokens for given features in a batch of images.
    
    Args:
        images: Input images
        model: The main model
        sparse_autoencoder: The sparse autoencoder
        encoder_weights: Encoder weights for selected features
        encoder_biases: Encoder biases for selected features
        feature_ids: List of feature IDs to analyze
        feature_categories: Categories of the features
        top_k: Number of top activations to return per feature

    Returns:
        Dictionary mapping feature IDs to tuples of (top_indices, top_values)
    """
#     _, cache = model.run_with_cache(images, names_filter=[sparse_autoencoder.cfg.hook_point])
    recons_image_embeddings_feat_altered_list = []
    for idx in np.array(range(sparse_autoencoder.W_dec.shape[0]))[random_feat_idxs]:
#         print(f"Feature: {idx} ====================")
        
        # steering_hook_fn, steering_hook_fn_cls_only
        steering_hook = partial(
            steering_hook_fn,
            cfg=cfg,
            sae=sparse_autoencoder,
            steering_indices=[idx],
            steering_strength=steering_strength,
            mean_ablation_values = [1.0],
            include_error=True,
            )
        
        
        recons_image_embeddings_feat_altered = model.run_with_hooks(
            images,
            fwd_hooks=[("blocks.11.hook_resid_post", steering_hook)],
        )
        recons_image_embeddings_feat_altered_list.append(recons_image_embeddings_feat_altered)

    
    # output is in clip embedding space
    recons_image_embeddings_default = model.run_with_hooks(
        images,
        fwd_hooks=[("blocks.11.hook_resid_post", lambda x, hook: x)],
    )
    
#     print(f"recons_image_embeddings_default: {recons_image_embeddings_default}")
#     print(f"recons_image_embeddings_default.shape: {recons_image_embeddings_default.shape}")
#     print(f"recons_image_embeddings_default: {recons_image_embeddings_default.shape}")

#     print(f"recons_image_embeddings_feat_altered: {recons_image_embeddings_feat_altered}")
#     print(f"recons_image_embeddings_feat_altered.shape: {recons_image_embeddings_feat_altered.shape}")

    return recons_image_embeddings_feat_altered_list, recons_image_embeddings_default

In [None]:
import torch
from PIL import Image

from collections import defaultdict
max_samples = cfg.eval_max

encoder_biases = sparse_autoencoder.b_enc#[interesting_features_indices]
encoder_weights = sparse_autoencoder.W_enc#[:, interesting_features_indices]

steering_strengths = [0.0, 5.0, 10.0, 20.0, 50.0, 150.0]#, 300.0]#, 500.0, 1000.0, 2000.0, 5000.0, 10000.0]#, -200.0, -300.0]


steering_strength_image_results = defaultdict(dict)
steering_strength_info = {}

og_model.cuda()




for steering_strength in steering_strengths:
    print(f"{'==============' * 2} steering_strength: {steering_strength} {'==============' * 2}")
    # ===== Get Steered and Default CLIP Outputs =====
    top_k=10
    processed_samples = 0
    default_embeds_list = []
    feature_steered_embeds = defaultdict(list)
    l = 0
    
    # remove tqdm
    for batch_images, _, batch_indices in tqdm(val_dataloader, total=max_samples // cfg.batch_size):
        batch_images = batch_images.to(cfg.device)
        batch_indices = batch_indices.to(cfg.device)
        batch_size = batch_images.shape[0]

        altered_embeds_list, default_embeds = compute_feature_activations_set_feat(
            batch_images, model, sparse_autoencoder, encoder_weights, encoder_biases,
            None, None, top_k, steering_strength
        )
        default_embeds_list.append(default_embeds)
        for j, altered_embeds in enumerate(altered_embeds_list):
            feature_steered_embeds[random_feat_idxs[j]].extend(altered_embeds)
        # either label embeds or optimize to maximal token in text transformer embedding face
        l += 1
        if l >= 1:
            break    
    default_embeds = torch.cat(default_embeds_list)
    
    with open("/workspace/clip_dissect_raw.txt", "r") as f:
        larger_vocab = [line[:-1] for line in f.readlines()][:5000]


    # ===== CLIP Embeds =====
    # use clip vocab here and compare embeds
    tokenizer = open_clip.get_tokenizer('ViT-B-32')
    text = tokenizer(larger_vocab)
    text_features = og_model.encode_text(text.cuda())
    text_features_normed = text_features/text_features.norm(dim=-1, keepdim=True)


    print(f"text_features_normed.shape: {text_features_normed.shape}")
    text_probs_altered_list = []
    with torch.no_grad(), torch.cuda.amp.autocast():
        for key in feature_steered_embeds:
            print(key)
            # embeds already have L2 norm of 1
            text_probs_altered = (100.0 * torch.stack(feature_steered_embeds[key]) @ text_features_normed.T).softmax(dim=-1)
            text_probs_altered_list.append(text_probs_altered)
        text_probs_default = (100.0 * default_embeds @ text_features_normed.T).softmax(dim=-1)

#     print("Label probs altered:", text_probs_altered.shape)  # prints: [[1., 0., 0.]]
#     print("Label probs default:", text_probs_default.shape)  # prints: [[1., 0., 0.]]
    
    
    # ===== Logit Difference =====
    # indexed as such in steering_strength_image_results:
    # per steering strength
    # per feature
    # per image
    
    selected_vocab = larger_vocab

    top_concept_per_feat = {}
    top_val_per_feat = {}
    top_diff_per_feat = {}
    steerability_per_feat = {}
    top_ratio_per_feat = {}
    
    # run this for sampled features over all of imagenet eval
    for j, text_probs_altered in enumerate(text_probs_altered_list):
        print(f"{'============================================'*2}\n\nFor Feature {random_feat_idxs[j]}")
#         print("actual image content:")
        default_vals_softmax, default_idxs_softmax = torch.topk(text_probs_default,k=10)
#         print(default_vals_softmax, "\n", np.array(selected_vocab)[default_idxs_softmax.cpu()])


        logit_diff = text_probs_altered - text_probs_default
#         logit_diff_aggregate = logit_diff.sum(dim=0)
        logit_diff_aggregate = logit_diff.mean(dim=0)
    
        print(f"logit_diff.shape: {logit_diff.shape}")
        print(f"logit_diff: {logit_diff}")
        steerability_score = torch.square(logit_diff_aggregate)
        print(f"steerability_score.shape: {steerability_score.shape}")
        print(f"steerability_score: {steerability_score}")
#         steerability_score_aggregate = steerability_score.mean(dim=0)
#         print(f"steerability_score_aggregate.shape: {steerability_score_aggregate.shape}")
#         print(f"steerability_score_aggregate: {steerability_score_aggregate}")
        

        logit_ratio = text_probs_altered/text_probs_default
        logit_ratio_aggregate = logit_ratio.mean(dim=0)

#         print(f"text_probs_altered.softmax(): {text_probs_altered.softmax(1).shape}")
        text_probs_altered_softmax = text_probs_altered.softmax(1)
        vals_softmax, idxs_softmax = torch.topk(text_probs_altered_softmax,k=10)

    #     print(f"text_probs_altered.softmax(): {text_probs_altered.sum(0).softmax(0).shape}")
    #     text_probs_altered_softmax_agg = text_probs_altered.sum(0).softmax(0)
    #     vals_softmax_agg, idxs_softmax_agg = torch.topk(text_probs_altered_softmax_agg,k=10)

#         print(f"\nSoftmax Over {text_probs_altered.shape[0]} Images:\n{vals_softmax}")
#         print(np.array(selected_vocab)[idxs_softmax.cpu()])
#         for i in range(vals_softmax.shape[0]):
#             print(vals_softmax[i], "\n", np.array(selected_vocab)[idxs_softmax.cpu()][i])
#             break

    #     print(f"\nAgg Softmax Over {text_probs_altered.shape[0]} Images:\n{vals_softmax_agg}")
    #     print(np.array(selected_vocab)[idxs_softmax_agg.cpu()])

        vals_agg, idxs_agg = torch.topk(logit_diff_aggregate,k=10)
        vals_least_agg, idxs_least_agg = torch.topk(logit_diff_aggregate,k=10,largest=False)

        ratios_agg, ratios_idxs_agg = torch.topk(logit_ratio_aggregate,k=10)
        ratios_least_agg, ratios_idxs_least_agg = torch.topk(logit_ratio_aggregate,k=10,largest=False)

        vals, idxs = torch.topk(logit_diff,k=5)
        vals_least, idxs_least = torch.topk(logit_diff,k=5,largest=False)

        ratios, ratios_idxs = torch.topk(logit_ratio,k=5)
        ratios_least, ratios_idxs_least = torch.topk(logit_ratio,k=5,largest=False)

        # random_feat_idxs[j] is the index of the feature
        for img_idx in range(batch_images.shape[0]):
            if random_feat_idxs[j] not in steering_strength_image_results[str(steering_strength)].keys():
                steering_strength_image_results[str(steering_strength)][random_feat_idxs[j].copy()] = []
            # entries are torch.topk(k=10) results
            steering_strength_image_results[str(steering_strength)][random_feat_idxs[j]].append((np.array(selected_vocab, copy=True)[idxs_softmax.cpu()][img_idx], torch.clone(vals_softmax[img_idx])))
        
        # per image
        top_concept_per_feat[random_feat_idxs[j]] = np.array(selected_vocab)[idxs_softmax.cpu()][0][0]
        top_val_per_feat[random_feat_idxs[j]] = vals_softmax[0][0]
        
        # aggregate
        steerability_per_feat[random_feat_idxs[j]] = steerability_score
        top_diff_per_feat[random_feat_idxs[j]] = vals_agg[0]
        top_ratio_per_feat[random_feat_idxs[j]] = ratios_agg[0]


        print(f"\nMost Changed, by Absolute Diff Over {logit_diff.shape[0]} Images:\n{vals_agg}")
        print(np.array(selected_vocab)[idxs_agg.cpu()])
#         print(vals_least_agg)
#         print(np.array(selected_vocab)[idxs_least_agg.cpu()])

        print(f"\nMost Changed, by Ratio Over {logit_diff.shape[0]} Images:")
        print(ratios_agg)
        print(np.array(selected_vocab)[ratios_idxs_agg.cpu()])
#         print(ratios_least_agg)
#         print(np.array(selected_vocab)[ratios_idxs_least_agg.cpu()])
    
    steering_strength_info[steering_strength] = (top_concept_per_feat,top_val_per_feat,top_ratio_per_feat,top_diff_per_feat,steerability_per_feat)




  0%|                                                                                                            | 0/1562 [00:03<?, ?it/s]


text_features_normed.shape: torch.Size([5000, 512])
655
656
665
2541
14942
30735
17019
21323
29112
19116
46123
19102
13507
16897
8551
27058
37825
25896
47367
6859
9650
34678
32502
16732
14425
22156
10142
28741
17591
27470
1676
4494
26761
46183
5347
43344
30909
41886
43766
38607
43962
11929
23633
48165
15362
46624
8161
40820
21387
8301

For Feature 655
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.5534e-09, 2.8580e-08, 3.0559e-10,  ..., 2.5693e-11, 8.3446e-11,
         5.8776e-11]


Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:0')
['sharp' 'speed' 'ride' 'vehicle' 'shared' 'racing' 'recreation' 'sierra'
 'sorted' 'mouse']

Most Changed, by Ratio Over 32 Images:
tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 17019
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
       


Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:0')
['sharp' 'speed' 'ride' 'vehicle' 'shared' 'racing' 'recreation' 'sierra'
 'sorted' 'mouse']

Most Changed, by Ratio Over 32 Images:
tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 16897
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
       

tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 6859
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.5534e-09, 2.8580e-08, 3.0559e-10,  ..., 2.5693e-11, 8.3446e-11,
         5.8776e-11]], device='cuda:0')
steerability_score.shape: torch.Size([5000])
steerability_score: tensor([2.3648e-21, 7.9869e-19, 1.4249e-22, 

steerability_score: tensor([2.3648e-21, 7.9869e-19, 1.4249e-22,  ..., 1.4055e-22, 8.8636e-24,
        1.3332e-24], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:0')
['sharp' 'speed' 'ride' 'vehicle' 'shared' 'racing' 'recreation' 'sierra'
 'sorted' 'mouse']

Most Changed, by Ratio Over 32 Images:
tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 10142
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+0

logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.5534e-09, 2.8580e-08, 3.0559e-10,  ..., 2.5693e-11, 8.3446e-11,
         5.8776e-11]], device='cuda:0')
steerability_score.shape: torch.Size([5000])
steerability_score: tensor([2.3648e-21, 7.9869e-19, 1.4249e-22,  ..., 1.4055e-22, 8.8636e-24,
        1.3332e-24], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:


Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:0')
['sharp' 'speed' 'ride' 'vehicle' 'shared' 'racing' 'recreation' 'sierra'
 'sorted' 'mouse']

Most Changed, by Ratio Over 32 Images:
tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 43766
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
       


Most Changed, by Absolute Diff Over 32 Images:
tensor([2.0474e-07, 1.4133e-07, 7.6836e-08, 6.1700e-08, 4.4468e-08, 4.3074e-08,
        3.8015e-08, 3.0093e-08, 2.9162e-08, 2.1433e-08], device='cuda:0')
['sharp' 'speed' 'ride' 'vehicle' 'shared' 'racing' 'recreation' 'sierra'
 'sorted' 'mouse']

Most Changed, by Ratio Over 32 Images:
tensor([1.0005, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002, 1.0002,
        1.0002], device='cuda:0')
['sharp' 'let' 'plans' 'fr' 'objectives' 'using' 'programs' 'failed'
 'sole' 'particularly']

For Feature 15362
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
       

  0%|                                                                                                            | 0/1562 [00:03<?, ?it/s]


text_features_normed.shape: torch.Size([5000, 512])
655
656
665
2541
14942
30735
17019
21323
29112
19116
46123
19102
13507
16897
8551
27058
37825
25896
47367
6859
9650
34678
32502
16732
14425
22156
10142
28741
17591
27470
1676
4494
26761
46183
5347
43344
30909
41886
43766
38607
43962
11929
23633
48165
15362
46624
8161
40820
21387
8301

For Feature 655
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 6.0463e-05,  3.6061e-04,  3.7240e-04,  ...,  2.2413e-07,
          4.2510e-06,  4.8869e-06],
        [ 6.0822e-06,  1.5436e-05,  5.2838e-05,  ..., -1.8799e-06,
         -3.7966e-08, -2.4757e-07],
        [ 8.1855e-06,  3.6000e-05,  2.0823e-05,  ...,  3.6430e-08,
          3.2836e-07,  1.2262e-07],
        ...,
        [ 8.3106e-06,  7.6327e-06,  6.6329e-06,  ...,  9.0241e-09,
          3.6510e-06,  2.8800e-07],
        [ 5.6727e-06,  5.5843e-05,  5.2555e-05,  ..., -2.1745e-07,
          4.8445e-06,  1.7840e-07],
        [ 3.2091e-05,  6.4407e-04,  3.2074e-05,  ..., -1.1561e-07

['bird' 'dog' 'wildlife' 'male' 'hunting' 'bears' 'dogs' 'flower'
 'visitor' 'fish']

Most Changed, by Ratio Over 32 Images:
tensor([12.3196, 11.8533, 11.5141, 10.1595,  9.7346,  9.5785,  9.3376,  8.9323,
         8.7992,  7.9578], device='cuda:0')
['flowers' 'atmosphere' 'dear' 'governments' 'their' 'arrival' 'allows'
 'dates' 'levels' 'preferences']

For Feature 17019
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 3.7038e-05,  1.4350e-04,  1.0094e-04,  ...,  1.5314e-07,
          3.2156e-06,  3.0601e-06],
        [ 4.8675e-05,  5.9255e-05,  7.3547e-05,  ...,  3.4755e-06,
          8.8109e-07,  4.5921e-06],
        [ 8.9019e-06,  2.4851e-05,  8.7949e-06,  ...,  8.8138e-08,
          5.0836e-07,  1.4268e-07],
        ...,
        [ 1.6176e-05,  8.9688e-06,  4.2551e-06,  ...,  3.2559e-07,
          1.5403e-05,  7.0258e-07],
        [ 3.4487e-06,  1.7370e-05,  1.2900e-05,  ..., -3.3375e-07,
          4.9237e-06,  9.2963e-08],
        [ 4.3361e-05,  4.8954e-04,  1.3670e-05


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0243, 0.0100, 0.0055, 0.0046, 0.0043, 0.0041, 0.0037, 0.0036, 0.0036,
        0.0036], device='cuda:0')
['pair' 'guinea' 'table' 'golden' 'lift' 'tower' 'tel' 'ray' 'eating'
 'seal']

Most Changed, by Ratio Over 32 Images:
tensor([14.6210, 13.9589, 12.4094, 10.9832, 10.1479,  9.7227,  9.4610,  9.2013,
         9.1489,  9.1241], device='cuda:0')
['fig' 'edge' 'lift' 'cape' 'ref' 'serves' 'extension' 'steps'
 'citysearch' 'shower']

For Feature 16897
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 1.2966e-05,  3.7946e-05,  4.2172e-05,  ..., -1.2421e-08,
          1.9883e-07, -1.0501e-06],
        [ 2.2375e-05,  2.4531e-05,  3.8125e-05,  ...,  1.5977e-07,
          2.0566e-07, -1.0041e-06],
        [ 6.5665e-06,  1.6847e-05,  7.1492e-06,  ...,  2.9800e-08,
          1.8540e-07, -3.7462e-08],
        ...,
        [ 6.0961e-06,  3.0994e-06,  2.0303e-06,  ..., -8.9036e-09,
          6.3014e-07, -1.7385e-07],
        [ 8


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0084, 0.0064, 0.0055, 0.0048, 0.0041, 0.0034, 0.0032, 0.0029, 0.0028,
        0.0028], device='cuda:0')
['dolls' 'pair' 'bears' 'dog' 'fish' 'dogs' 'fucking' 'wing' 'hairy'
 'bowl']

Most Changed, by Ratio Over 32 Images:
tensor([13.5793, 11.9508, 11.6166, 11.4064, 10.0075,  9.7058,  9.6695,  9.6310,
         9.6301,  9.6224], device='cuda:0')
['toshiba' 'storage' 'orange' 'dolls' 'inventory' 'dvds' 'twice' 'motors'
 'supplies' 'thing']

For Feature 9650
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 1.2853e-05,  6.9482e-05, -8.1340e-06,  ...,  1.6792e-08,
         -3.8497e-07, -5.9749e-07],
        [ 1.1844e-05,  1.8804e-05, -5.7923e-06,  ..., -1.6682e-07,
         -7.4808e-08, -8.6522e-07],
        [ 3.4344e-06,  1.2906e-05, -9.1741e-07,  ...,  4.9692e-09,
         -8.0253e-08, -3.4104e-08],
        ...,
        [ 7.3063e-06,  5.5283e-06,  2.4758e-08,  ...,  1.0922e-07,
         -1.2272e-06, -4.7514e-08],
     


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0193, 0.0086, 0.0064, 0.0063, 0.0053, 0.0045, 0.0042, 0.0034, 0.0033,
        0.0029], device='cuda:0')
['dog' 'mouse' 'mac' 'guinea' 'box' 'bowl' 'completed' 'epinions' 'permit'
 'trembl']

Most Changed, by Ratio Over 32 Images:
tensor([13.4810,  8.8321,  8.7017,  7.9589,  7.8340,  7.6889,  7.5516,  7.3274,
         7.1862,  7.1629], device='cuda:0')
['disney' 'cape' 'cart' 'button' 'common' 'missouri' 'resume'
 'professional' 'dog' 'thompson']

For Feature 28741
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 8.6964e-07,  8.1578e-05,  3.8125e-05,  ...,  1.1930e-07,
         -7.0876e-07,  4.0253e-07],
        [ 1.2159e-05,  5.7274e-05,  4.8668e-05,  ...,  5.9953e-06,
          4.2411e-08,  2.3812e-06],
        [-1.1783e-06,  7.2630e-06,  2.0022e-06,  ...,  4.3636e-08,
         -1.9279e-07, -6.9019e-09],
        ...,
        [ 9.7082e-07,  5.3062e-06,  1.7777e-06,  ...,  3.2819e-07,
         -3.3614e-06,  1.5363e-


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0094, 0.0087, 0.0081, 0.0081, 0.0050, 0.0049, 0.0046, 0.0041, 0.0038,
        0.0035], device='cuda:0')
['turkey' 'hairy' 'kits' 'hunting' 'lamp' 'cute' 'mouse' 'dog' 'boxes'
 'wildlife']

Most Changed, by Ratio Over 32 Images:
tensor([9.7497, 8.7208, 8.6358, 8.5317, 8.1943, 8.0138, 7.4381, 7.1390, 6.8256,
        6.4802], device='cuda:0')
['bills' 'printable' 'camcorder' 'puzzle' 'nfl' 'utilities' 'attorney'
 'ohio' 'cincinnati' 'charger']

For Feature 5347
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 3.2911e-06,  1.2136e-05,  3.9828e-06,  ...,  8.9163e-08,
         -5.2932e-07,  1.6794e-06],
        [ 1.1593e-05,  1.9167e-05,  1.1107e-05,  ...,  2.6348e-06,
          5.1753e-08,  3.7338e-06],
        [ 2.6085e-06,  9.5324e-06,  1.7496e-06,  ...,  1.2461e-07,
         -1.8846e-08,  1.7470e-07],
        ...,
        [ 6.3530e-06,  5.1986e-06,  1.3372e-06,  ...,  4.5425e-07,
          1.7230e-07,  8.7110e-07],
 


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0269, 0.0095, 0.0087, 0.0083, 0.0047, 0.0045, 0.0040, 0.0039, 0.0034,
        0.0034], device='cuda:0')
['guinea' 'trademarks' 'kits' 'ray' 'dogs' 'pair' 'detail' 'cocks'
 'foster' 'cabinet']

Most Changed, by Ratio Over 32 Images:
tensor([29.7912, 15.1449, 12.9091, 12.7613, 11.3782, 11.1587, 10.9598, 10.4936,
        10.0894,  9.7874], device='cuda:0')
['pubmed' 'trademarks' 'dublin' 'disc' 'scripts' 'domains' 'directories'
 'months' 'labels' 'bookmark']

For Feature 11929
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 3.3917e-05,  4.0930e-05,  1.0025e-05,  ...,  6.2579e-07,
          1.5392e-06,  5.3155e-06],
        [ 1.0792e-05, -5.5304e-06, -6.5547e-06,  ...,  3.9833e-06,
          4.9388e-10,  2.1632e-06],
        [ 6.6586e-06,  3.0128e-06, -3.2674e-08,  ...,  3.4897e-07,
          1.3396e-07,  1.9923e-07],
        ...,
        [ 2.9771e-06, -9.8697e-07, -3.5912e-07,  ...,  3.5011e-07,
         -2.1223e-06,


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0142, 0.0105, 0.0103, 0.0081, 0.0072, 0.0067, 0.0061, 0.0031, 0.0027,
        0.0023], device='cuda:0')
['guinea' 'pair' 'mouse' 'bears' 'rear' 'model' 'apple' 'cabinet' 'turkey'
 'male']

Most Changed, by Ratio Over 32 Images:
tensor([9.7015, 9.2227, 8.9351, 8.9071, 8.7153, 8.5216, 8.4050, 8.2800, 7.5763,
        7.5419], device='cuda:0')
['vision' 'jacket' 'television' 'uniprotkb' 'conference' 'conferences'
 'cabinet' 'laboratory' 'alabama' 'classroom']

For Feature 21387
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 5.8534e-05,  2.2197e-04,  6.8258e-05,  ...,  8.1668e-08,
         -4.6064e-07, -9.5857e-08],
        [ 3.0034e-05,  3.3947e-05,  1.7234e-05,  ..., -1.1769e-06,
         -2.0072e-07, -1.1748e-06],
        [ 9.3142e-07, -1.6694e-06, -4.1057e-07,  ..., -1.0672e-07,
         -2.9788e-07, -8.0522e-08],
        ...,
        [ 8.2149e-06,  4.4212e-06,  8.9153e-07,  ..., -1.2700e-07,
         -6.1298e-06,

  0%|                                                                                                            | 0/1562 [00:03<?, ?it/s]


text_features_normed.shape: torch.Size([5000, 512])
655
656
665
2541
14942
30735
17019
21323
29112
19116
46123
19102
13507
16897
8551
27058
37825
25896
47367
6859
9650
34678
32502
16732
14425
22156
10142
28741
17591
27470
1676
4494
26761
46183
5347
43344
30909
41886
43766
38607
43962
11929
23633
48165
15362
46624
8161
40820
21387
8301

For Feature 655
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 1.3374e-04,  7.8211e-04,  1.6846e-03,  ...,  3.7321e-07,
          5.9839e-06,  8.7196e-06],
        [ 4.2929e-05,  9.7198e-05,  4.1990e-04,  ..., -1.4754e-06,
          2.6243e-07,  1.8139e-06],
        [ 2.8785e-05,  1.2772e-04,  1.4414e-04,  ...,  1.5915e-07,
          8.8483e-07,  4.6045e-07],
        ...,
        [ 3.8702e-05,  4.1376e-05,  6.8298e-05,  ...,  2.9404e-07,
          1.0106e-05,  1.3838e-06],
        [ 1.8429e-05,  1.6302e-04,  2.9410e-04,  ..., -1.0622e-07,
          5.6550e-06,  5.9111e-07],
        [ 8.7943e-05,  1.4614e-03,  1.8963e-04,  ..., -4.4871e-08

['bird' 'wildlife' 'male' 'dog' 'slow' 'jerry' 'birds' 'dogs' 'flower'
 'treat']

Most Changed, by Ratio Over 32 Images:
tensor([198.7059, 122.0498, 106.3026,  98.9906,  89.4397,  89.1064,  79.7472,
         76.6500,  71.9211,  63.7722], device='cuda:0')
['flowers' 'atmosphere' 'dear' 'governments' 'arrival' 'their' 'dates'
 'allows' 'leaves' 'preferences']

For Feature 17019
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 1.2725e-04,  4.0501e-04,  4.1601e-04,  ...,  4.8659e-07,
          8.0952e-06,  9.0025e-06],
        [ 1.3165e-04,  1.4572e-04,  2.5171e-04,  ...,  5.7747e-06,
          2.0945e-06,  1.0563e-05],
        [ 3.2303e-05,  7.8817e-05,  4.0901e-05,  ...,  2.9030e-07,
          1.4658e-06,  5.2303e-07],
        ...,
        [ 6.2055e-05,  3.4405e-05,  2.4790e-05,  ...,  9.7901e-07,
          3.3784e-05,  2.4991e-06],
        [ 1.1474e-05,  4.8546e-05,  4.8656e-05,  ..., -2.2868e-07,
          6.9646e-06,  3.4103e-07],
        [ 1.3948e-04,  1.1811e-03,  6.28

steerability_score: tensor([9.5489e-14, 3.5040e-08, 2.1577e-10,  ..., 9.5646e-12, 1.4290e-11,
        1.0870e-11], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0563, 0.0148, 0.0118, 0.0098, 0.0094, 0.0080, 0.0074, 0.0064, 0.0063,
        0.0061], device='cuda:0')
['pair' 'lift' 'fig' 'table' 'tel' 'golden' 'leg' 'sri' 'seal' 'serves']

Most Changed, by Ratio Over 32 Images:
tensor([167.2779, 150.4198, 117.7651, 100.8152,  97.3097,  93.5263,  84.9760,
         75.2253,  73.2980,  72.4486], device='cuda:0')
['fig' 'edge' 'lift' 'cape' 'citysearch' 'shower' 'ref' 'steps'
 'extension' 'serves']

For Feature 16897
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 6.0622e-05,  1.6994e-04,  2.1327e-04,  ...,  1.4402e-07,
          1.4731e-06, -1.0753e-06],
        [ 1.0101e-04,  1.1018e-04,  2.0491e-04,  ...,  2.9224e-06,
          9.6042e-07, -7.9012e-07],
        [ 2.5304e-05,  5.7293e-05,  3.4629e-05,  ...,  1.9036e-07,
          6.0449e-07, -2.25


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0247, 0.0109, 0.0077, 0.0076, 0.0072, 0.0070, 0.0069, 0.0068, 0.0046,
        0.0046], device='cuda:0')
['dog' 'douglas' 'ray' 'flying' 'dogs' 'ski' 'pussy' 'flower' 'sole'
 'wing']

Most Changed, by Ratio Over 32 Images:
tensor([329.3222, 134.7733, 132.5424, 110.6304,  98.1958,  90.9668,  89.3685,
         89.0098,  83.6604,  78.0226], device='cuda:0')
['plus' 'pick' 'xx' 'shopper' 'usc' 'flower' 'picks' 'silver' 'merchants'
 'flowers']

For Feature 6859
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 4.6031e-05,  1.0452e-05,  1.2328e-05,  ...,  3.1212e-08,
          5.7197e-06,  9.9235e-06],
        [ 8.3211e-05,  2.7307e-05,  2.3340e-05,  ..., -1.1042e-07,
          2.3488e-06,  1.8023e-05],
        [ 6.7273e-06, -5.3303e-06, -1.1543e-08,  ..., -6.6372e-08,
          6.8203e-07,  4.5058e-07],
        ...,
        [ 1.3654e-05,  1.7618e-06,  1.1060e-06,  ..., -4.8287e-08,
          8.4282e-06,  1.9503e-06],
    

steerability_score: tensor([1.3675e-10, 3.2859e-10, 4.2933e-10,  ..., 5.6782e-10, 1.0315e-11,
        1.9811e-10], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0301, 0.0145, 0.0135, 0.0116, 0.0106, 0.0074, 0.0061, 0.0053, 0.0051,
        0.0051], device='cuda:0')
['pair' 'milwaukee' 'permit' 'vehicle' 'debian' 'cabinet' 'hairy'
 'archive' 'trademarks' 'laser']

Most Changed, by Ratio Over 32 Images:
tensor([695.4758, 256.3501, 210.6448, 158.7635, 149.6612, 133.3284, 126.1143,
        117.6875, 111.7829, 109.7869], device='cuda:0')
['milwaukee' 'manchester' 'ericsson' 'melbourne' 'engineer' 'sacramento'
 'toyota' 'bbc' 'engine' 'ibm']

For Feature 10142
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 3.4027e-05, -2.0672e-05,  4.4204e-06,  ...,  8.6962e-07,
          2.3584e-06,  2.8506e-06],
        [ 5.8700e-05,  5.5928e-06,  1.2105e-05,  ...,  1.7270e-05,
          1.0197e-06,  5.3402e-06],
        [-1.6542e-06, -1.6352e-05, -2.5964e-06,  .


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0229, 0.0159, 0.0128, 0.0084, 0.0078, 0.0077, 0.0071, 0.0059, 0.0058,
        0.0057], device='cuda:0')
['dolls' 'wildlife' 'wing' 'bowl' 'visitors' 'watching' 'wings' 'appeared'
 'sequence' 'researchers']

Most Changed, by Ratio Over 32 Images:
tensor([455.7671, 201.8643, 175.6848, 171.7110, 169.9640, 164.5271, 153.5534,
        146.9366, 137.5677, 125.1131], device='cuda:0')
['dolls' 'workers' 'disclosure' 'artists' 'designers' 'doctors'
 'scientists' 'researchers' 'surgery' 'soldiers']

For Feature 26761
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[-3.0952e-06, -6.6084e-05,  8.8733e-05,  ..., -8.2163e-08,
         -6.3585e-07, -1.5386e-06],
        [ 1.0209e-04,  4.2609e-05,  6.2903e-04,  ...,  5.8574e-06,
          2.3522e-06,  1.9243e-06],
        [ 1.1334e-05,  2.0217e-06,  5.6021e-05,  ...,  1.2368e-07,
          6.5442e-07,  2.4022e-08],
        ...,
        [ 1.0423e-05,  5.0102e-07,  1.9475e-05,  ..., 


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0116, 0.0115, 0.0110, 0.0090, 0.0081, 0.0076, 0.0064, 0.0051, 0.0047,
        0.0039], device='cuda:0')
['indian' 'golden' 'debian' 'bowl' 'pair' 'ball' 'globe' 'box' 'balls'
 'racing']

Most Changed, by Ratio Over 32 Images:
tensor([202.6720,  78.6507,  61.1356,  60.5625,  58.2360,  56.7508,  54.3975,
         54.2042,  53.1808,  52.2274], device='cuda:0')
['indian' 'latin' 'rings' 'spain' 'bands' 'clock' 'teams' 'colors'
 'hotels' 'pink']

For Feature 38607
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 4.3977e-05,  1.3225e-04,  1.9264e-04,  ...,  6.8591e-07,
         -5.4482e-07, -5.6652e-07],
        [ 1.3423e-04,  1.5871e-04,  3.0847e-04,  ...,  2.7885e-05,
          5.4006e-07,  2.2017e-06],
        [ 2.2565e-05,  5.4314e-05,  3.4922e-05,  ...,  1.0113e-06,
          1.0649e-07,  5.2080e-08],
        ...,
        [ 6.3206e-05,  3.7582e-05,  3.1150e-05,  ...,  4.3926e-06,
          4.3654e-06,  5.6292e-07],


logit_diff: tensor([[-9.6639e-06,  3.7787e-04,  1.0239e-04,  ...,  4.0409e-07,
         -7.1695e-07,  5.8526e-06],
        [-8.8347e-06,  1.9703e-04,  9.5082e-05,  ...,  6.9853e-06,
          1.7396e-08,  9.9520e-06],
        [-3.2437e-06,  7.3486e-05,  1.1003e-05,  ...,  2.6658e-07,
         -1.4412e-07,  3.9420e-07],
        ...,
        [-2.0333e-06,  4.9307e-05,  1.0037e-05,  ...,  1.2283e-06,
         -2.6752e-06,  2.4735e-06],
        [-2.3960e-06,  1.6236e-04,  3.3945e-05,  ...,  2.6434e-06,
         -1.0693e-05,  9.2211e-07],
        [-2.6259e-05,  2.0247e-03,  2.7696e-05,  ...,  9.7970e-07,
         -9.1328e-07,  4.1166e-06]], device='cuda:0')
steerability_score.shape: torch.Size([5000])
steerability_score: tensor([1.3987e-10, 5.2799e-08, 2.8754e-09,  ..., 2.8522e-12, 4.7475e-12,
        1.2914e-10], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0194, 0.0089, 0.0075, 0.0072, 0.0065, 0.0059, 0.0055, 0.0049, 0.0046,
        0.0045], device='cuda:0')


  0%|                                                                                                            | 0/1562 [00:03<?, ?it/s]


text_features_normed.shape: torch.Size([5000, 512])
655
656
665
2541
14942
30735
17019
21323
29112
19116
46123
19102
13507
16897
8551
27058
37825
25896
47367
6859
9650
34678
32502
16732
14425
22156
10142
28741
17591
27470
1676
4494
26761
46183
5347
43344
30909
41886
43766
38607
43962
11929
23633
48165
15362
46624
8161
40820
21387
8301

For Feature 655
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[2.1611e-04, 1.1014e-03, 6.6033e-03,  ..., 6.1948e-07, 4.7111e-06,
         1.1759e-05],
        [3.1521e-04, 7.6667e-04, 7.1841e-03,  ..., 2.1297e-06, 2.3061e-06,
         1.5957e-05],
        [9.7723e-05, 4.1003e-04, 1.4067e-03,  ..., 5.3624e-07, 1.9126e-06,
         1.9267e-06],
        ...,
        [1.9759e-04, 2.8764e-04, 1.3693e-03,  ..., 1.5603e-06, 1.7443e-05,
         7.3345e-06],
        [6.6462e-05, 4.5729e-04, 2.2185e-03,  ..., 4.3450e-07, 7.3922e-07,
         2.3339e-06],
        [2.3292e-04, 2.4635e-03, 1.8108e-03,  ..., 4.0853e-07, 2.4696e-06,
         5.5953e-06]


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0262, 0.0114, 0.0112, 0.0096, 0.0082, 0.0071, 0.0045, 0.0044, 0.0039,
        0.0039], device='cuda:0')
['bird' 'birds' 'slow' 'jerry' 'dear' 'treat' 'their' 'feeds' 'low' 'dogs']

Most Changed, by Ratio Over 32 Images:
tensor([12846.3555,  2518.9409,  1989.3773,  1843.1665,  1819.9211,  1640.0994,
         1611.7084,  1582.5924,  1492.8033,  1185.4326], device='cuda:0')
['flowers' 'atmosphere' 'governments' 'arrival' 'leaves' 'their' 'dates'
 'dear' 'flower' 'foods']

For Feature 17019
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[2.9134e-04, 6.4392e-04, 1.3033e-03,  ..., 1.2130e-06, 1.0545e-05,
         1.7380e-05],
        [3.7814e-04, 3.8391e-04, 1.0929e-03,  ..., 9.6299e-06, 5.1351e-06,
         2.5281e-05],
        [1.2153e-04, 2.2618e-04, 2.5442e-04,  ..., 9.8066e-07, 3.5454e-06,
         2.3225e-06],
        ...,
        [1.7566e-04, 1.0526e-04, 1.6057e-04,  ..., 2.0564e-06, 2.8646e-05,
         6.8058e-0


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0886, 0.0326, 0.0224, 0.0196, 0.0175, 0.0173, 0.0173, 0.0136, 0.0106,
        0.0094], device='cuda:0')
['pair' 'lift' 'fig' 'extension' 'leg' 'sri' 'serves' 'tel' 'root' 'cape']

Most Changed, by Ratio Over 32 Images:
tensor([4804.1177, 3553.0981, 3091.2319, 2909.7720, 1907.5302, 1838.6384,
        1382.3765, 1373.0074, 1082.2917,  934.8979], device='cuda:0')
['fig' 'citysearch' 'shower' 'edge' 'cape' 'lift' 'ref' 'steps' 'root'
 'pool']

For Feature 16897
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 1.7154e-04,  3.3498e-04,  7.7028e-04,  ...,  6.6219e-07,
          2.3404e-06, -1.1488e-06],
        [ 3.2900e-04,  3.2999e-04,  9.5509e-04,  ...,  7.1297e-06,
          2.6587e-06, -6.3176e-07],
        [ 9.4010e-05,  1.6388e-04,  2.0318e-04,  ...,  7.9449e-07,
          1.4792e-06,  5.0861e-08],
        ...,
        [ 1.7097e-04,  1.0755e-04,  1.7189e-04,  ...,  2.1071e-06,
          9.7296e-06,  1.7891e-07],
  

steerability_score: tensor([1.2078e-09, 8.7055e-10, 1.9396e-10,  ..., 5.4880e-10, 2.0171e-11,
        3.3286e-10], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0214, 0.0161, 0.0138, 0.0136, 0.0117, 0.0110, 0.0089, 0.0080, 0.0076,
        0.0070], device='cuda:0')
['orange' 'phoenix' 'dolls' 'storage' 'twin' 'wings' 'wing' 'dogs'
 'toshiba' 'toys']

Most Changed, by Ratio Over 32 Images:
tensor([5487.9531, 3598.4219, 3312.5266, 3248.5896, 2801.9438, 2402.6919,
        2308.1863, 1786.3739, 1461.1724, 1266.7271], device='cuda:0')
['dolls' 'toshiba' 'orange' 'appliances' 'storage' 'dvds' 'supplies'
 'thing' 'teens' 'trucks']

For Feature 9650
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 4.4030e-05,  1.5039e-04, -1.9311e-05,  ...,  2.1864e-07,
         -1.3992e-06, -1.3354e-06],
        [ 7.4562e-05,  1.3021e-04, -5.3520e-06,  ...,  8.7202e-07,
         -8.9591e-08, -1.3242e-06],
        [ 3.9977e-05,  1.2249e-04,  1.7940e-06,  ...,  4.9630e-

steerability_score: tensor([1.9570e-10, 5.6438e-10, 7.3551e-10,  ..., 6.1918e-10, 1.5212e-11,
        4.3082e-10], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.1003, 0.0240, 0.0146, 0.0142, 0.0105, 0.0101, 0.0094, 0.0086, 0.0085,
        0.0081], device='cuda:0')
['milwaukee' 'pair' 'laser' 'permit' 'melbourne' 'breast' 'implement'
 'toyota' 'front' 'manchester']

Most Changed, by Ratio Over 32 Images:
tensor([27467.1680,  8231.1816,  5022.4590,  3626.5461,  3514.0054,  3507.2756,
         2918.6968,  2732.2471,  2616.3589,  2360.7917], device='cuda:0')
['milwaukee' 'manchester' 'ericsson' 'sacramento' 'melbourne' 'engineer'
 'toyota' 'microsoft' 'engine' 'denver']

For Feature 10142
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 5.6705e-05, -4.6195e-05,  3.4191e-06,  ...,  3.2763e-06,
          2.2778e-06,  4.8377e-06],
        [ 1.1306e-04,  2.9195e-06,  2.0548e-05,  ...,  3.8578e-05,
          1.9741e-06,  1.0026e-05],
        [ 2.4288e-


Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0409, 0.0194, 0.0173, 0.0135, 0.0133, 0.0124, 0.0122, 0.0117, 0.0110,
        0.0104], device='cuda:0')
['dolls' 'wing' 'researchers' 'appeared' 'boobs' 'workers' 'visitors'
 'wings' 'farmers' 'wildlife']

Most Changed, by Ratio Over 32 Images:
tensor([27924.6348,  5657.0669,  5428.0532,  5268.1562,  5092.6035,  4937.2729,
         4380.0361,  3483.4604,  3120.0874,  3097.5642], device='cuda:0')
['dolls' 'doctors' 'workers' 'designers' 'disclosure' 'artists'
 'scientists' 'lingerie' 'researchers' 'pottery']

For Feature 26761
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[ 2.2620e-05, -5.5398e-05,  7.8096e-04,  ...,  1.4624e-07,
          6.7197e-07, -1.2256e-06],
        [ 1.4531e-04,  3.3684e-05,  2.2670e-03,  ...,  4.4964e-06,
          3.4221e-06,  7.9862e-07],
        [ 3.4050e-05,  8.1878e-06,  3.8706e-04,  ...,  4.9376e-07,
          1.5221e-06,  1.6021e-07],
        ...,
        [ 5.2033e-05,  8.6976e-06, 

logit_diff: tensor([[ 1.1070e-05, -6.4833e-05,  1.3609e-06,  ..., -8.2530e-08,
         -1.1926e-06, -7.6594e-07],
        [ 4.5849e-05, -4.3611e-06,  3.2516e-05,  ..., -2.8090e-06,
          1.6593e-07,  3.3251e-07],
        [ 4.2059e-06, -1.3201e-05,  2.5562e-06,  ..., -1.1326e-07,
         -1.8595e-07,  3.8954e-08],
        ...,
        [ 2.2250e-05,  2.7706e-06,  7.3023e-06,  ..., -8.7563e-08,
         -5.4752e-06,  4.9924e-07],
        [ 6.2674e-06, -2.7520e-05,  4.2226e-06,  ..., -2.1578e-06,
         -1.5478e-05,  4.5765e-08],
        [ 5.7100e-06, -6.4515e-04,  5.0755e-06,  ..., -4.7955e-07,
         -1.3962e-06, -5.5807e-07]], device='cuda:0')
steerability_score.shape: torch.Size([5000])
steerability_score: tensor([4.4755e-11, 1.9837e-09, 8.6532e-11,  ..., 5.8756e-10, 1.0926e-11,
        1.1537e-11], device='cuda:0')

Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0611, 0.0157, 0.0096, 0.0085, 0.0082, 0.0078, 0.0073, 0.0070, 0.0068,
        0.0066], device='cuda:0')



Most Changed, by Absolute Diff Over 32 Images:
tensor([0.0185, 0.0155, 0.0140, 0.0124, 0.0118, 0.0114, 0.0113, 0.0102, 0.0101,
        0.0101], device='cuda:0')
['font' 'den' 'tiny' 'pet' 'kits' 'root' 'mt' 'matching' 'northern'
 'farmers']

Most Changed, by Ratio Over 32 Images:
tensor([4003.5396, 3096.3245, 2234.3308, 1797.6427, 1580.6343, 1550.6018,
        1386.3805, 1266.3044, 1216.3639, 1031.1659], device='cuda:0')
['bedroom' 'pool' 'kids' 'mountains' 'mountain' 'clothes' 'bath' 'produce'
 'forest' 'root']

For Feature 46624
logit_diff.shape: torch.Size([32, 5000])
logit_diff: tensor([[-1.0767e-05,  1.0657e-03,  3.4140e-04,  ...,  1.9250e-06,
         -9.6247e-07,  1.8990e-05],
        [-1.0434e-05,  7.2725e-04,  3.5018e-04,  ...,  1.7274e-05,
          9.5836e-08,  3.0883e-05],
        [-3.3354e-06,  2.6934e-04,  5.4657e-05,  ...,  1.2501e-06,
         -1.3830e-07,  2.1419e-06],
        ...,
        [-1.6714e-06,  2.7351e-04,  6.9672e-05,  ...,  4.9275e-06,
         -5.2681e-06

  0%|                                                                                                            | 0/1562 [00:00<?, ?it/s]

In [None]:
steering_strength_image_results.keys(), steering_strength_image_results[str(steering_strength)].keys()

In [None]:
i = 0

# 0: python, 3: bowl, 4: bed, 6: guinea
image_idx = 20


feat_num = list(steering_strength_image_results[str(steering_strength)].keys())[-1]

# to iterate over many features:
# for feat_num in steering_strength_image_results[str(steering_strength)].keys():

print(f"=====================\nfeat_num: {feat_num}")
feat_num_concept_arr = []
feat_num_prob_arr = []
for dict_key in steering_strengths:
    # image, tuple position, idx of top-k
    feat_num_concept_arr.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][0]))
    print(str(dict_key), steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][0])
    feat_num_prob_arr.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][0].item()))
    print(str(dict_key), steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][0].item())


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample data
x = [tup[0] for tup in sorted(feat_num_concept_arr)]
y1 = [tup[1] for tup in sorted(feat_num_concept_arr)]
y2 = [tup[1] for tup in sorted(feat_num_prob_arr)]

# Create the plot
plt.figure(figsize=(10, 6))

# Plot each line with different colors and markers
plt.plot(np.array(x), y2, 'rs-', label='Prob, Label at Strength')  # Red line with squares

# Label each point
for i in range(len(x)):
    # Labels for series
    plt.annotate(f'({y2[i]:01f}, {y1[i]})', 
                (x[i], y2[i]), 
                textcoords="offset points", 
                xytext=(0,-15),
                ha='center')
    
# Customize the plot
plt.xlabel('Feature Steering Strength (feat = Strength)')
plt.ylabel('Probability of TPL, Top Predicted Label')
plt.title(f'Most Likely Class by Feature Steering Strength, Feature {feat_num}\n Label at 0.0: {steering_strength_image_results[str(0.0)][feat_num][image_idx][0][0]}. Label at max steered val ({str(max(steering_strengths))}): {steering_strength_image_results[str(max(steering_strengths))][feat_num][image_idx][0][0]}.')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Adjust layout to prevent label overlap
plt.tight_layout()


# plt.savefig("test.svg")

# Show the plot
plt.show()

In [None]:
# top-k logits chart
# i = 10
# 0: python, 3: bowl, 4: bed, 6: guinea
image_idx = image_idx
feat_num = feat_num

# to iterate over many features:
# for feat_num in steering_strength_image_results[str(steering_strength)].keys():

print(f"=====================\nfeat_num: {feat_num}")
feat_num_concept_arr = []
feat_num_concept_arr_2 = []
feat_num_concept_arr_3 = []
feat_num_prob_arr = []
feat_num_prob_arr_2 = []
feat_num_prob_arr_3 = []
for dict_key in steering_strengths:
    # image, tuple position, idx of top-k
    # modify this to do top-k at some point
    print(str(dict_key), steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][0])
    feat_num_concept_arr.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][0]))
    feat_num_concept_arr_2.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][1]))
    feat_num_concept_arr_3.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][0][2]))
    print(str(dict_key), steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][0].item())
    feat_num_prob_arr.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][0].item()))
    feat_num_prob_arr_2.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][1].item()))
    feat_num_prob_arr_3.append((dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][1][2].item()))


# Sample data
x = [tup[0] for tup in sorted(feat_num_concept_arr)]
y1 = [tup[1] for tup in sorted(feat_num_concept_arr)]
y2 = [tup[1] for tup in sorted(feat_num_prob_arr)]
y1_2 = [tup[1] for tup in sorted(feat_num_concept_arr_2)]
y2_2 = [tup[1] for tup in sorted(feat_num_prob_arr_2)]
y1_3 = [tup[1] for tup in sorted(feat_num_concept_arr_3)]
y2_3 = [tup[1] for tup in sorted(feat_num_prob_arr_3)]

# Create the plot
plt.figure(figsize=(15, 9))

# Plot each line with different colors and markers
plt.plot(np.array(x), y2, 'rs-', label='Prob, Label at Strength')  # Red line with squares
plt.plot(np.array(x), y2_2, 'ms-', label='Prob, Label at Strength')  # Red line with squares
plt.plot(np.array(x), y2_3, 'ys-', label='Prob, Label at Strength')  # Red line with squares

# Label each point
for i in range(len(x)):
    # Labels for series
    plt.annotate(f'({y2[i]:01f}, {y1[i]})', 
                (x[i], y2[i]), 
                textcoords="offset points", 
                xytext=(0,-15),
                ha='center')
    plt.annotate(f'({y2_2[i]:01f}, {y1_2[i]})', 
                (x[i], y2_2[i]), 
                textcoords="offset points", 
                xytext=(0,-15),
                ha='center')
    plt.annotate(f'({y2_3[i]:01f}, {y1_3[i]})', 
                (x[i], y2_3[i]), 
                textcoords="offset points", 
                xytext=(0,-15),
                ha='center')
    
# Customize the plot
plt.xlabel('Feature Steering Strength (feat = Strength)')
plt.ylabel('Probability of TPL, Top Predicted Label')
plt.title(f'Most Likely Class by Feature Steering Strength, Feature {feat_num}\n Label at 0.0: {steering_strength_image_results[str(0.0)][feat_num][image_idx][0][0]}. Label at max steered val ({str(max(steering_strengths))}): {steering_strength_image_results[str(max(steering_strengths))][feat_num][image_idx][0][0]}.')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Adjust layout to prevent label overlap
plt.tight_layout()


# plt.savefig("test.svg")

# Show the plot
plt.show()

In [None]:
# Steerability metric

for feat_num in steering_strength_image_results[str(steering_strength)].keys():
    feat_num_steerability = feat_num
    # steering_strength_info[steering_strength] = (top_concep,top_val,aggregate_ratio,aggregate_diff)
    agg_diff_arr = []
    steerability_arr = []
    i = 0
    for key in steering_strength_info:
    #     print(key, steering_strength_info[key][4][feat_num].cpu())
        steerability_arr.append(steering_strength_info[key][4][feat_num_steerability].sum().cpu().item())
        agg_diff_arr.append(steering_strength_info[key][3][feat_num_steerability].cpu().item())
    #     agg_diff_arr.append(steering_strength_info[key][2][feat_num_steerability].cpu().item())
    #     i += 1
    #     if i > 3:
    #         agg_diff_arr[-1] = agg_diff_arr[-2]

    import matplotlib.pyplot as plt
    import numpy as np

    # Sample data
    x = [tup[0] for tup in sorted(feat_num_concept_arr)]
    y1 = agg_diff_arr
    y2 = steerability_arr

    # Create the plot
    plt.figure(figsize=(10, 6))

    # Plot each line with different colors and markers
    plt.plot(np.array(x), y1, 'rs-', label='Strength, Aggregate Logit Difference')  # Red line with squares
    plt.plot(np.array(x), y2, 'ms-', label='Strength, Steerability Score')  # Red line with squares

    # Label each point
    for i in range(len(x)):
        # Labels for series
        plt.annotate(f'({x[i]:.1f}, {y1[i]:.3f})', 
                    (x[i], y1[i]), 
                    textcoords="offset points", 
                    xytext=(0,-15),
                    ha='center')
        plt.annotate(f'({x[i]:.1f}, {y2[i]:.3f})', 
                    (x[i], y2[i]), 
                    textcoords="offset points", 
                    xytext=(0,-15),
                    ha='center')

    # Customize the plot
    plt.xlabel('Feature Steering Strength (feat = Strength)')
    plt.ylabel('Sum of Logit Difference Over All Tested Images')
    plt.title(f'Aggregate Logit Difference at different steering strengths, Feature {feat_num_steerability}')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # Adjust layout to prevent label overlap
    plt.tight_layout()


    # plt.savefig("test.svg")

    # Show the plot
    plt.show()
    # image, tuple position, idx of top-k
    print(dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][0])
    print(dict_key, steering_strength_image_results[str(dict_key)][feat_num][image_idx][1])

### Prev Code

In [278]:
len(feature_steered_embeds[random_feat_idxs[0]])

32

In [279]:
default_embeds.shape
len(default_embeds_list)
default_embeds = torch.cat(default_embeds_list)
default_embeds.shape

torch.Size([32, 512])

In [280]:

len(altered_embeds_list), altered_embeds_list[0].shape, default_embeds.shape

(10, torch.Size([32, 512]), torch.Size([32, 512]))

In [281]:
og_model.cuda()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [282]:
with open("/workspace/clip_dissect_raw.txt", "r") as f:
    larger_vocab = [line[:-1] for line in f.readlines()][:5000]

# with open("/workspace/better_img_desc.txt", "r") as f:
#     larger_vocab = [line[:-1] for line in f.readlines()][:5000]

In [283]:
# use clip vocab here and compare embeds
import torch
from PIL import Image

tokenizer = open_clip.get_tokenizer('ViT-B-32')
text = tokenizer(larger_vocab)
text_features = og_model.encode_text(text.cuda())
text_features_normed = text_features/text_features.norm(dim=-1, keepdim=True)


print(f"text_features_normed.shape: {text_features_normed.shape}")
text_probs_altered_list = []
with torch.no_grad(), torch.cuda.amp.autocast():
    for key in feature_steered_embeds:
        print(key)
        # embeds already have L2 norm of 1
        text_probs_altered = (100.0 * torch.stack(feature_steered_embeds[key]) @ text_features_normed.T).softmax(dim=-1)
        text_probs_altered_list.append(text_probs_altered)
    text_probs_default = (100.0 * default_embeds @ text_features_normed.T).softmax(dim=-1)

print("Label probs altered:", text_probs_altered.shape)  # prints: [[1., 0., 0.]]
print("Label probs default:", text_probs_default.shape)  # prints: [[1., 0., 0.]]

text_features_normed.shape: torch.Size([5000, 512])
655
656
665
2541
9482
21803
21701
22826
17981
10559
Label probs altered: torch.Size([32, 5000])
Label probs default: torch.Size([32, 5000])


### Summed Logit Difference

In [None]:
# subtract from default, label, and print trends
text_probs_altered.shape

# selected_vocab = all_imagenet_class_names
selected_vocab = larger_vocab

top_concept_per_feat = {}
top_val_per_feat = {}
top_diff_per_feat = {}
top_ratio_per_feat = {}
# run this for sampled features over all of imagenet eval
for j, text_probs_altered in enumerate(text_probs_altered_list):
    print(f"{'============================================'*2}\n\nFor Feature {random_feat_idxs[j]}")
    print("actual image content:")
    default_vals_softmax, default_idxs_softmax = torch.topk(text_probs_default,k=10)
    print(default_vals_softmax, "\n", np.array(selected_vocab)[default_idxs_softmax.cpu()])
    
    
    logit_diff = text_probs_altered - text_probs_default
    logit_diff_aggregate = logit_diff.sum(dim=0)
    
    logit_ratio = text_probs_altered/text_probs_default
    logit_ratio_aggregate = logit_ratio.mean(dim=0)
    
    print(f"text_probs_altered.softmax(): {text_probs_altered.softmax(1).shape}")
    text_probs_altered_softmax = text_probs_altered.softmax(1)
    vals_softmax, idxs_softmax = torch.topk(text_probs_altered_softmax,k=10)
    
#     print(f"text_probs_altered.softmax(): {text_probs_altered.sum(0).softmax(0).shape}")
#     text_probs_altered_softmax_agg = text_probs_altered.sum(0).softmax(0)
#     vals_softmax_agg, idxs_softmax_agg = torch.topk(text_probs_altered_softmax_agg,k=10)
    
    print(f"\nSoftmax Over {text_probs_altered.shape[0]} Images:\n{vals_softmax}")
    print(np.array(selected_vocab)[idxs_softmax.cpu()])
    for i in range(vals_softmax.shape[0]):
        print(vals_softmax[i], "\n", np.array(selected_vocab)[idxs_softmax.cpu()][i])
        break
        
#     print(f"\nAgg Softmax Over {text_probs_altered.shape[0]} Images:\n{vals_softmax_agg}")
#     print(np.array(selected_vocab)[idxs_softmax_agg.cpu()])
    
    vals_agg, idxs_agg = torch.topk(logit_diff_aggregate,k=10)
    vals_least_agg, idxs_least_agg = torch.topk(logit_diff_aggregate,k=10,largest=False)
    
    ratios_agg, ratios_idxs_agg = torch.topk(logit_ratio_aggregate,k=10)
    ratios_least_agg, ratios_idxs_least_agg = torch.topk(logit_ratio_aggregate,k=10,largest=False)
    
    vals, idxs = torch.topk(logit_diff,k=5)
    vals_least, idxs_least = torch.topk(logit_diff,k=5,largest=False)
    
    ratios, ratios_idxs = torch.topk(logit_ratio,k=5)
    ratios_least, ratios_idxs_least = torch.topk(logit_ratio,k=5,largest=False)
    
    top_concept_per_feat[random_feat_idxs[j]] = np.array(selected_vocab)[idxs_softmax.cpu()][0][0]
    top_val_per_feat[random_feat_idxs[j]] = vals_softmax[0][0]
    top_diff_per_feat[random_feat_idxs[j]] = vals_agg[0]
    top_ratio_per_feat[random_feat_idxs[j]] = ratios_agg[0]
    
    
    print(f"\nMost Changed, by Absolute Diff Over {logit_diff.shape[0]} Images:\n{vals_agg}")
    print(np.array(selected_vocab)[idxs_agg.cpu()])
    print(vals_least_agg)
    print(np.array(selected_vocab)[idxs_least_agg.cpu()])
    
    print(f"\nMost Changed, by Ratio Over {logit_diff.shape[0]} Images:")
    print(ratios_agg)
    print(np.array(selected_vocab)[ratios_idxs_agg.cpu()])
    print(vals_least_agg)
    print(np.array(selected_vocab)[ratios_idxs_least_agg.cpu()])

In [None]:
top_concept_per_feat,top_val_per_feat,top_ratio_per_feat

In [None]:
steering_strength_info = {}
steering_strength_info[steering_strength] = (top_concept_per_feat,top_val_per_feat,top_ratio_per_feat,top_diff_per_feat)


In [None]:
steering_strength_info[steering_strength][0].keys()

In [None]:
steering_strength_info

In [None]:
i = 0
for feat_num in steering_strength_info[steering_strength][0].keys():
    print(f"=====================\nfeat_num: {feat_num}")
    feat_num_concept_arr = []
    feat_num_prob_arr = []
    feat_num_ratio_arr = []
    for key in steering_strength_info:
        print(key, steering_strength_info[key][0][feat_num])
        feat_num_concept_arr.append((key, steering_strength_info[key][0][feat_num]))
        print(key, steering_strength_info[key][1][feat_num])
        feat_num_prob_arr.append((key, steering_strength_info[key][1][feat_num].item()))
        print(key, steering_strength_info[key][2][feat_num])
        feat_num_ratio_arr.append((key, steering_strength_info[key][2][feat_num].item()))
    i += 1
    if i > 5:
        break

In [None]:
sorted(feat_num_concept_arr),sorted(feat_num_prob_arr),sorted(feat_num_ratio_arr),

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample data
x = [tup[0] for tup in sorted(feat_num_concept_arr)]
y1 = [tup[1] for tup in sorted(feat_num_concept_arr)]
y2 = [tup[1] for tup in sorted(feat_num_prob_arr)]
# y3 = [tup[1] for tup in sorted(feat_num_ratio_arr)]

# Create the plot
plt.figure(figsize=(10, 6))

# Plot each line with different colors and markers
# plt.plot(x, y1, 'bo-', label='Series 1')  # Blue line with circles
plt.plot(np.array(x), y2, 'rs-', label='Series 2')  # Red line with squares
# plt.plot(x, y3, 'gd-', label='Series 3')  # Green line with diamonds

# Label each point for all three series
for i in range(len(x)):
#     # Labels for series 1
#     plt.annotate(f'({x[i]}, {y1[i]})', 
#                 (x[i], y1[i]), 
#                 textcoords="offset points", 
#                 xytext=(0,10),
#                 ha='center')
    
    # Labels for series 2
    plt.annotate(f'({y2[i]:01f}, {y1[i]})', 
                (x[i], y2[i]), 
                textcoords="offset points", 
                xytext=(0,-15),
                ha='center')
    
#     # Labels for series 3
#     plt.annotate(f'({x[i]}, {y3[i]})', 
#                 (x[i], y3[i]), 
#                 textcoords="offset points", 
#                 xytext=(0,10),
#                 ha='center')

# Customize the plot
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title(f'Most Likely Class by Feature Steering Strength, Feature {feat_num}\n Label at 0.0: {steering_strength_info[0.0][0][feat_num]}. Label at max steered val: {steering_strength_info[max(list(steering_strength_info.keys()))][0][feat_num]}.')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Adjust layout to prevent label overlap
plt.tight_layout()

# Show the plot
plt.show()

## Enc/Dec Clustering/Exploration

In [None]:
encoder_weights_for_math = sparse_autoencoder.W_enc
decoder_weights_for_math = sparse_autoencoder.W_dec

In [None]:
dists_from_feat_0 = encoder_weights_for_math[0] - encoder_weights_for_math

In [None]:
dists_from_feat_0_normalized = encoder_weights_for_math[0]/encoder_weights_for_math[0].norm(p=2) - encoder_weights_for_math/encoder_weights_for_math.norm(p=2,dim=0)

In [None]:
dists_from_feat_0.norm(p=2, dim=0).shape

In [None]:
# plt.hist(bins[:-1], bins)
plt.hist(dists_from_feat_0.norm(p=2, dim=0).cpu(), density=True, bins=1000, histtype='step')  # density=False would make counts
plt.title('Encoder Dist from feat 0')
plt.ylabel('L2 Distance')
plt.xlabel('Density (of ~50k feats)');


In [None]:
torch.topk(dists_from_feat_0.norm(p=2, dim=0),k=10,largest=False)

In [None]:
dec_dists_from_feat_0 = decoder_weights_for_math[0]/decoder_weights_for_math[0].norm(p=2) - decoder_weights_for_math/decoder_weights_for_math.norm(p=2)
dec_dists_from_feat_0.shape

In [None]:
# plt.hist(bins[:-1], bins)
plt.hist(dec_dists_from_feat_0.T.norm(p=2, dim=0).cpu(), density=True, bins=1000, histtype='step')  # density=False would make counts
plt.title('Decoder Dist from feat 0')
plt.ylabel('L2 Distance')
plt.xlabel('Density (of ~50k feats)');


In [None]:
torch.topk(dec_dists_from_feat_0.T.norm(p=2, dim=0),k=10,largest=False)

In [None]:
dec_dists_from_feat_0.T.norm(p=2, dim=0).shape