In [51]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['CUDA_HOME'] = '/home/jovyan/vasiliev/notebooks/Show-o/cuda_fake'
# os.environ['CUDA_VISIBLE_DEVICES'] = '3'|

In [52]:
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from models import Showo
from training.prompting_utils import UniversalPrompting, create_attention_mask_predict_next
from training.utils import get_config
import json
import argparse
from PIL import Image
import numpy as np
from omegaconf import OmegaConf
from models.clip_encoder import CLIPVisionTower
from transformers import CLIPImageProcessor
from llava.llava import conversation as conversation_lib
from omegaconf import OmegaConf
from copy import deepcopy
from collections import defaultdict


conversation_lib.default_conversation = conversation_lib.conv_templates["phi1.5"]

In [53]:
from inference_mmu import run_mmu
from inference_t2i import run_t2i

In [54]:
config = OmegaConf.load('configs/showo_demo_w_clip_vit_512x512.yaml')

In [55]:
def get_model(config):
    model = Showo.from_pretrained(config.model.showo.pretrained_model_path)
    return model

In [56]:
class LayerOutputRecorder:
    def __init__(self, device='cuda', compute_stats=True):
        self.outputs = defaultdict(list)
        self.inputs_shapes = defaultdict(list)
        self.handles = []
        self.device = device
        self.compute_stats = compute_stats

    def build_hook_fn(self, name):
        def hook_fn(module, input_, output):
            with torch.no_grad():
                if self.compute_stats:
                    stats = {
                        'max_abs': output.detach().abs().max(dim=1, keepdim=False).values[0].to('cpu'),
                        # 'mean': output.mean(dim=-1),
                        # 'shape': output.shape
                    }
                    if self.device == 'cpu':
                        stats = {k: v.cpu() if isinstance(v, torch.Tensor) else v 
                                for k, v in stats.items()}
                    self.outputs[name].append(stats)
                else:
                    self.outputs[name].append(output.detach())
                    
                self.inputs_shapes[name].append(input_[0].shape)
        return hook_fn

    def register_hook(self, module_name, module):
        handle = module.register_forward_hook(self.build_hook_fn(module_name))
        self.handles.append(handle)

    def register_hooks(self, modules: list[tuple[str, torch.nn.Module]]) -> None:
        for module_name, module in modules:
            self.register_hook(module_name, module)

    def remove_hooks(self):
        for handle in self.handles:
            handle.remove()
        self.handles = []
        
    def clear(self):
        self.outputs.clear()
        self.inputs_shapes.clear()
        torch.cuda.empty_cache()

In [57]:
def get_target_layers(model) -> list[tuple[str, torch.nn.Module]]:
    layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            layers.append((name, module))
    return layers

In [58]:
model_t2i = get_model(config)
model_mmu = get_model(config)

The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.


attention implementation:  sdpa


  if self.w_clip_vit:
The config attributes {'mask_token_id': 58497} were passed to Showo, but are not expected and will be ignored. Please verify your config.json configuration file.


attention implementation:  sdpa


In [59]:
list(model_mmu.named_modules())

[('',
  Showo(
    (showo): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(58498, 2048)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
              (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
              (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
              (dense): Linear(in_features=2048, out_features=2048, bias=True)
              (q_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
              (k_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
              (rotary_emb): PhiRotaryEmbedding()
            )
            (mlp): PhiMLP(
              (activation_fn): NewGELUActivation()
              (fc1): Linear(in_features=2048, out_features=8192, bias=True)
              (fc2):

In [60]:
target_layers_t2i = get_target_layers(model_t2i)
activations_recorder_t2i = LayerOutputRecorder()
activations_recorder_t2i.register_hooks(target_layers_t2i)

In [61]:
target_layers_mmu = get_target_layers(model_mmu)
activations_recorder_mmu = LayerOutputRecorder()
activations_recorder_mmu.register_hooks(target_layers_mmu)

In [62]:
# mode=t2i dataset.params.validation_prompts_file=validation_prompts/showoprompts.txt batch_size=1

In [63]:
# uv run inference_t2i.py config=configs/showo_demo_w_clip_vit_512x512.yaml mode=t2i dataset.params.validation_prompts_file=validation_prompts/showoprompts.txt batch_size=1

In [64]:
# uv run inference_mmu.py config=configs/showo_demo_w_clip_vit_512x512.yaml max_new_tokens=100 mmu_image_root=./mmu_validation

In [65]:
cfg = OmegaConf.load("configs/showo_demo_w_clip_vit_512x512.yaml")
# cfg.batch_size = 1
cfg.max_new_tokens = 100

cfg_mmu = deepcopy(cfg)
cfg_t2i = deepcopy(cfg)


cfg_t2i.dataset.params.validation_prompts_file="validation_prompts/text2image_prompts.txt"
cfg_t2i.mode = "t2i"
cfg_mmu.mmu_image_root = "./mmu_validation"
cfg_mmu.device = "cuda:0"
cfg_t2i.device = "cuda:0"

In [66]:
run_mmu(cfg_mmu, model_mmu)

Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192


  input_ids = torch.tensor(input_ids).to(device).squeeze(0)
 17%|█▋        | 1/6 [00:01<00:09,  1.82s/it]

 The path or structure is on a set of train tracks that are surrounded by trees and rocks.


 33%|███▎      | 2/6 [00:09<00:21,  5.31s/it]

 The scene appears to be surreal, as it is not a typical representation of a living room or a beach setting. Instead, it features a couch, a table, and a potted plant, all placed on a beach-like surface. The couch is positioned in the middle of the scene, and the table is located to the left of the couch. The potted plant is situated to the right of the couch. The combination of these elements creates an unusual and dreamlike atmosphere, as it is


 50%|█████     | 3/6 [00:14<00:15,  5.14s/it]

 Yes, it is interacting with something or someone in the image. A dog is standing in a field of flowers, sniffing a flower, and appears to be smelling it. This suggests that the dog is curious and interested in the flower, possibly exploring its surroundings or smelling it as a part of its natural behavior.


 67%|██████▋   | 4/6 [00:15<00:07,  3.52s/it]

 The time of day in the picture is during the day.


 83%|████████▎ | 5/6 [00:16<00:02,  2.71s/it]

 The character, Spongebob, appears to be expressing happiness or excitement.


100%|██████████| 6/6 [00:18<00:00,  3.02s/it]

 The cat is walking through the snow, moving across the snow-covered ground.
=== Multimodal Understanding Results ===

Image 1: dog.png
Response: User: What kind of path or structure is it on?
 Answer :  The path or structure is on a set of train tracks that are surrounded by trees and rocks.

--------------------------------------------------

Image 2: cat.png
Response: User: Does the scene look realistic or surreal?
 Answer :  The scene appears to be surreal, as it is not a typical representation of a living room or a beach setting. Instead, it features a couch, a table, and a potted plant, all placed on a beach-like surface. The couch is positioned in the middle of the scene, and the table is located to the left of the couch. The potted plant is situated to the right of the couch. The combination of these elements creates an unusual and dreamlike atmosphere, as it is

--------------------------------------------------

Image 3: cat.txt
Response: User: Is it interacting with somethin




In [67]:
activations_recorder_mmu.outputs.keys()

dict_keys(['mm_projector.0', 'mm_projector.2', 'showo.model.layers.0.self_attn.q_proj', 'showo.model.layers.0.self_attn.k_proj', 'showo.model.layers.0.self_attn.v_proj', 'showo.model.layers.0.self_attn.dense', 'showo.model.layers.0.mlp.fc1', 'showo.model.layers.0.mlp.fc2', 'showo.model.layers.1.self_attn.q_proj', 'showo.model.layers.1.self_attn.k_proj', 'showo.model.layers.1.self_attn.v_proj', 'showo.model.layers.1.self_attn.dense', 'showo.model.layers.1.mlp.fc1', 'showo.model.layers.1.mlp.fc2', 'showo.model.layers.2.self_attn.q_proj', 'showo.model.layers.2.self_attn.k_proj', 'showo.model.layers.2.self_attn.v_proj', 'showo.model.layers.2.self_attn.dense', 'showo.model.layers.2.mlp.fc1', 'showo.model.layers.2.mlp.fc2', 'showo.model.layers.3.self_attn.q_proj', 'showo.model.layers.3.self_attn.k_proj', 'showo.model.layers.3.self_attn.v_proj', 'showo.model.layers.3.self_attn.dense', 'showo.model.layers.3.mlp.fc1', 'showo.model.layers.3.mlp.fc2', 'showo.model.layers.4.self_attn.q_proj', 'sho

In [None]:
def get_activations(recorder):
    acts_list = []
    for ind in tqdm(range(0, 24)):
        layer_name = f'showo.model.layers.{ind}.mlp.fc2'
        acts = [a['max_abs'] for a in recorder.outputs[layer_name]]

        if len(acts) == 0:
            break

        acts_list.extend(acts)
    tens = torch.concat(acts_list, dim=0)
    # print(tens.shape)
    return tens.tolist()

In [69]:
import matplotlib.pyplot as plt

def print_acts_boxplots(acts):
    plt.figure(figsize=(12,6))
    plt.boxplot(acts, labels=[f'Layer {i}' for i in range(len(acts))], showfliers=False)
    plt.ylabel("Activation value")
    plt.title("Boxplots of activations for different layers")
    plt.show()

In [86]:
acts_mmu = get_activations(activations_recorder_mmu)

100%|██████████| 24/24 [00:00<00:00, 21063.67it/s]

torch.Size([11304960])





In [82]:
print(acts_mmu[0])

tensor(1.4795)


In [73]:
run_t2i(cfg_t2i, model_t2i)

Saving images to: show-o-demo/inference_t2i_20251006_192525
Working with z of shape (1, 13, 16, 16) = 3328 dimensions.
Look-up free quantizer with codebook size: 8192


100%|██████████| 1/1 [00:24<00:00, 24.94s/it]

Saved 8 text-to-image results to show-o-demo/inference_t2i_20251006_192525





In [74]:
acts_t2i = get_activations(activations_recorder_t2i)

100%|██████████| 24/24 [00:00<00:00, 93379.68it/s]


In [75]:
acts_t2i

tensor([ 1.0004,  0.4156,  1.0055,  ..., 13.3257, 15.9834, 11.6894])