### 1. Import the required libraries.

In [1]:
# import libraries
import sys
sys.path.append("viewFormer")

import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer

from viewFormer.utils import get_model_layers
from viewFormer.visualize import outlier_heatmap, abs_outlier_tensor

# select the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


  from .autonotebook import tqdm as notebook_tqdm


### 2. Load model and weights.

In [2]:
# checkpoint to inspect and the prompt used for a quick sanity-check generation
model_name = 'meta-llama/Llama-2-7b-hf'
prompt = "What is the meaning of life?"

# load tokenizer and model; device_map="auto" leaves weight placement to the loader
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model.eval()

# tokenize the prompt and move the token ids to the model's device
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

# generate at most 100 new tokens after the prompt
outputs = model.generate(input_ids, max_new_tokens=100)

# decode the first returned sequence without special tokens and show it
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.12s/it]


What is the meaning of life? I am not sure if I have a clear answer to that question. I think that the meaning of life is to be happy, to enjoy the life that you have, to enjoy the people you are with, to enjoy the food that you eat, to enjoy the music that you listen to, to enjoy the movies that you watch, to enjoy the books that you read, to enjoy the places that you go, to enjoy the things that you do, to enjoy the experiences that you have,


### 3. Obtain Model's Activations.

In [3]:
from torchvision import datasets, transforms
from timm.data import create_transform
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torch.utils.data import DataLoader
import os

def build_dataset_CIFAR100(is_train, data_path):
    """Create the CIFAR-100 split selected by ``is_train``.

    Args:
        is_train: passed as ``train=`` to ``datasets.CIFAR100`` and also used
            to pick the transform via ``build_transform``.
        data_path: root directory handed to ``datasets.CIFAR100``; the dataset
            is requested with ``download=True``.

    Returns:
        ``(dataset, nb_classes)`` where ``nb_classes`` is always 100.
    """
    nb_classes = 100
    dataset = datasets.CIFAR100(
        data_path,
        train=is_train,
        transform=build_transform(is_train),
        download=True,
    )
    return dataset, nb_classes

def build_transform(is_train, input_size=224, eval_crop_ratio=1.0):
    """Build the image preprocessing pipeline for training or evaluation.

    Args:
        is_train: when truthy, return the training pipeline produced by timm's
            ``create_transform`` (color jitter 0.3, the 'rand-m9-mstd0.5-inc1'
            auto-augment policy, bicubic interpolation, random erasing
            disabled via ``re_prob=0.0``). Otherwise return a deterministic
            evaluation pipeline.
        input_size: side length in pixels of the image produced by the
            pipeline. Defaults to 224, the value that was previously
            hard-coded.
        eval_crop_ratio: evaluation only. The image is first resized to
            ``int(input_size / eval_crop_ratio)`` and then center-cropped to
            ``input_size``. Defaults to 1.0 (previously hard-coded).

    Returns:
        The transform object to pass as ``transform=`` to a torchvision
        dataset: timm's training transform, or a ``transforms.Compose`` of
        resize / center-crop / ``ToTensor`` / ImageNet normalization.
    """
    # inputs of 32 px or less are never resized, only cropped
    resize_im = input_size > 32

    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=input_size,
            is_training=True,
            color_jitter=0.3,
            auto_augment='rand-m9-mstd0.5-inc1',
            interpolation='bicubic',
            re_prob=0.0,
            re_mode='pixel',
            re_count=1,
        )
        if not resize_im:
            # replace RandomResizedCropAndInterpolation with
            # RandomCrop
            transform.transforms[0] = transforms.RandomCrop(
                input_size, padding=4)
        return transform

    steps = []
    if resize_im:
        size = int(input_size / eval_crop_ratio)
        steps.append(
            # to maintain same ratio w.r.t. 224 images; BICUBIC is the named
            # equivalent of the PIL integer code 3 used previously
            transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC),
        )
        steps.append(transforms.CenterCrop(input_size))

    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
    return transforms.Compose(steps)

def prepare_data(batch_size):
    """Build CIFAR-100 train and test loaders rooted at ``./data``.

    Args:
        batch_size: batch size used for both loaders.

    Returns:
        ``(train_loader, test_loader, nb_classes)``. The train loader
        shuffles, the test loader does not; both drop the last incomplete
        batch.
    """
    data_root = './data'
    train_set, nb_classes = build_dataset_CIFAR100(is_train=True, data_path=data_root)
    test_set = build_dataset_CIFAR100(is_train=False, data_path=data_root)[0]

    # options common to both loaders
    shared = dict(batch_size=batch_size, drop_last=True)
    train_loader = DataLoader(train_set, shuffle=True, **shared)
    test_loader = DataLoader(test_set, shuffle=False, **shared)
    return train_loader, test_loader, nb_classes

def evaluate_model(model, data_loader, device):
    """Measure top-1 accuracy of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(images)``; its output is reduced with
            ``torch.max(outputs, 1)``, so it is expected to carry per-class
            scores along dim 1. The model is moved to ``device``.
        data_loader: iterable yielding ``(images, labels)`` batches.
        device: device the model and every batch are moved to.

    Returns:
        Accuracy as a percentage in ``[0, 100]``. When the loader yields no
        samples the result is ``0.0`` instead of a ``ZeroDivisionError``.
    """
    # NOTE(review): model.eval() is commented out, so the model is evaluated in
    # whatever train/eval mode the caller left it in — confirm this is intended.
    # model.eval()
    model.to(device)
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(data_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # an empty loader (e.g. drop_last=True with fewer samples than one batch)
    # leaves total at 0; report 0.0 rather than dividing by zero
    accuracy = 100 * correct / total if total else 0.0
    print(f'Accuracy of the model on the test images: {accuracy}%')
    return accuracy

def train_one_epoch(model, criterion, optimizer, data_loader, device):
    """Run one optimization pass over ``data_loader``.

    For every batch: move it to ``device``, run the forward pass, compute
    ``criterion(output, target)``, clear the gradients, backpropagate and take
    one optimizer step.

    Args:
        model: module called as ``model(image)``. It is neither moved to
            ``device`` nor switched to train mode here; the caller is
            responsible for both.
        criterion: callable returning the loss from ``(output, target)``.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are called once
            per batch.
        data_loader: iterable yielding ``(image, target)`` batches.
        device: device every batch is moved to.

    Returns:
        None.
    """
    for image, target in tqdm(data_loader):
        image, target = image.to(device), target.to(device)
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def print_size_of_model(model):
    """Print the serialized size of ``model.state_dict()`` in megabytes.

    The state dict is saved with ``torch.save`` into a private temporary
    directory that is removed afterwards, even if saving or measuring fails.
    Nothing is written to the current working directory, so an unrelated
    ``temp.p`` there is never overwritten or deleted.

    Args:
        model: object exposing ``state_dict()``.

    Returns:
        None. The size is printed as ``Size (MB): <value>``.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

In [4]:
from viewFormer.data import calibrate
from viewFormer.hooks import HookHandler, get_absmax_act_func

def selected_layers(block_idx):
    """Return the names of the three layers of interest inside block ``block_idx``.

    The names are ``blocks.<idx>.attn.qkv``, ``blocks.<idx>.mlp.fc1`` and
    ``blocks.<idx>.mlp.fc2``, in that order.
    """
    prefix = f'blocks.{block_idx}'
    return [f'{prefix}.{suffix}' for suffix in ('attn.qkv', 'mlp.fc1', 'mlp.fc2')]

# one image per batch for the calibration data
batch_size = 1
train_loader, test_loader, nb_classes = prepare_data(batch_size=batch_size)

# hook manager, plus the dict later handed to the hooks
# (presumably filled with per-layer outputs — verify against viewFormer.hooks)
handler = HookHandler()
layer_outputs = dict()

Files already downloaded and verified
Files already downloaded and verified


In [5]:
from itertools import islice

# NOTE(review): `model.blocks` and the `blocks.<i>.attn.qkv` layer names look like
# a ViT-style model, while cell 2 loads a Hugging Face causal LM into `model`
# (and the plotting cell reads `model.model.layers`). Confirm which model this
# cell is supposed to run on; as written it depends on kernel state that the
# visible cells do not create.
model_blocks_cnt = len(model.blocks)
block_layer_cnt = len(selected_layers(0))

# register one set of hooks per block on the selected Linear layers
for i in range(model_blocks_cnt):
    sample_block = get_model_layers(model, match_names=selected_layers(i), match_types=['Linear'])
    handler.create_hooks(sample_block, get_absmax_act_func, layer_outputs)

try:
    with torch.autocast(device_type="cuda"):
        # islice stops after 256 batches; list(train_loader)[:256] would load
        # and transform the whole training set before discarding most of it
        calibrate(model, list(islice(train_loader, 256)))
finally:
    # always detach the hooks so a failed run does not leave them registered
    handler.remove_hooks()

Calibrate: [  0/256]	Time  1.027 ( 1.027)
Calibrate: [ 10/256]	Time  0.178 ( 0.258)
Calibrate: [ 20/256]	Time  0.171 ( 0.217)
Calibrate: [ 30/256]	Time  0.170 ( 0.202)
Calibrate: [ 40/256]	Time  0.167 ( 0.194)
Calibrate: [ 50/256]	Time  0.171 ( 0.189)
Calibrate: [ 60/256]	Time  0.169 ( 0.186)
Calibrate: [ 70/256]	Time  0.170 ( 0.184)
Calibrate: [ 80/256]	Time  0.168 ( 0.182)
Calibrate: [ 90/256]	Time  0.184 ( 0.181)
Calibrate: [100/256]	Time  0.171 ( 0.180)
Calibrate: [110/256]	Time  0.175 ( 0.179)
Calibrate: [120/256]	Time  0.203 ( 0.179)
Calibrate: [130/256]	Time  0.168 ( 0.179)
Calibrate: [140/256]	Time  0.172 ( 0.178)
Calibrate: [150/256]	Time  0.173 ( 0.178)
Calibrate: [160/256]	Time  0.168 ( 0.177)
Calibrate: [170/256]	Time  0.168 ( 0.177)
Calibrate: [180/256]	Time  0.167 ( 0.176)
Calibrate: [190/256]	Time  0.168 ( 0.176)
Calibrate: [200/256]	Time  0.170 ( 0.176)
Calibrate: [210/256]	Time  0.171 ( 0.175)
Calibrate: [220/256]	Time  0.168 ( 0.175)
Calibrate: [230/256]	Time  0.168 (

### 4. Visualize the Activations.

In [8]:
# Tensor to visualize: the q_proj weight matrix of layers[10].
# To plot a hooked activation instead, use e.g.:
# activation = layer_outputs['blocks.10.attn.qkv']
activation = model.model.layers[10].self_attn.q_proj.weight

# create figure
fig = plt.figure(figsize=(20, 10))
# title names what is actually plotted (a q_proj weight, not a QKV activation)
fig.suptitle('Q-Proj Weight (Layer 10)', fontsize=24)

# plot heatmap
# ax = fig.add_subplot(1, 2, 1)
# outlier_heatmap(activation, kernel_size=1, cmap='bwr', ax=ax)

import numpy as np
from matplotlib import cm
def abs_outlier_tensor(w, ax=None, *args, **kwargs):
    """Draw ``|w|`` as a 3-D surface together with a matching color bar.

    NOTE(review): this definition shadows the ``abs_outlier_tensor`` imported
    from ``viewFormer.visualize`` in the first cell — confirm which of the two
    is meant to be used.

    Args:
        w: 2-D array-like. Its element-wise absolute value is the surface
            height; axis 1 runs along x and axis 0 along y.
        ax: 3-D axes to draw on. When ``None``, a new 10x10 inch figure with a
            single 3-D axes is created.
        *args: accepted for signature compatibility and ignored.
        **kwargs: forwarded to ``ax.plot_surface``. ``cmap`` defaults to
            ``'coolwarm'`` and ``antialiased`` to ``False``. Unless a ``norm``
            is given, ``vmin`` / ``vmax`` default to the min / max of ``|w|``
            so the colors span the same range as the z axis.

    Returns:
        The object returned by ``ax.plot_surface``.
    """
    if ax is None:
        _, ax = plt.subplots(subplot_kw={'projection': '3d'}, figsize=(10, 10))

    w = np.abs(w)
    min_val, max_val = np.min(w), np.max(w)

    # z axis spans exactly the data range
    ax.set_zlim(min_val, max_val)

    # One set of color settings drives both the surface and the color bar, so
    # the two can never disagree (previously the surface was always 'coolwarm'
    # while the color bar was built separately from **kwargs).
    surface_kwargs = {'cmap': 'coolwarm', 'antialiased': False}
    surface_kwargs.update(kwargs)
    if surface_kwargs.get('norm') is None:
        # vmin/vmax may not be combined with an explicit norm
        surface_kwargs.setdefault('vmin', min_val)
        surface_kwargs.setdefault('vmax', max_val)

    # draw 3d surface plot
    x = np.arange(w.shape[1])  # Dims
    y = np.arange(w.shape[0])  # Seqlen
    xpos, ypos = np.meshgrid(x, y, copy=False)
    surf = ax.plot_surface(xpos, ypos, w, **surface_kwargs)

    # set labels
    ax.set_xlabel('Out Dims', fontsize=18, labelpad=10)
    ax.set_ylabel('SeqLen', fontsize=18, labelpad=10)
    ax.set_zlabel('Absolute Value', fontsize=18, labelpad=10)

    # set axis font size
    ax.tick_params(axis='both', which='major', labelsize=14)

    # draw color bar from the surface itself; pad is the distance between
    # color bar and plot
    colorbar = ax.figure.colorbar(surf, ax=ax, shrink=0.6, pad=0.1)
    colorbar.ax.tick_params(labelsize=14)

    return surf

# right-hand panel: 3-D surface of the tensor's absolute values
ax = fig.add_subplot(1, 2, 2, projection='3d')
abs_outlier_tensor(activation.detach().cpu().numpy(), ax=ax, cmap='coolwarm')

plt.tight_layout()
plt.show()