In [1]:
from lmnav.models.perceiver import Perceiver
import torch
import numpy as np

[2023-09-10 11:23:39,447] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Perceiver over a 1-D sequence of 768-channel tokens, built and moved to the
# GPU in a single statement.
model = Perceiver(
    # -- input layout --
    input_channels=768,           # channels carried by each input token
    input_axis=1,                 # number of input axes (2 for images, 3 for video)
    # -- Fourier position features --
    fourier_encode_data=True,     # let the model Fourier-encode the input along `input_axis`;
                                  # switch off when the data is already encoded
    num_freq_bands=64,            # number of frequency bands, with original value (2 * K + 1)
    max_freq=16384.,              # maximum frequency; depends on how fine-grained the data is
    # -- latent array --
    num_latents=256,              # number of latents (a.k.a. induced set points / centroids)
    latent_dim=768,               # width of each latent
    # -- stack layout: depth * (cross attention -> self_per_cross_attn * self attention) --
    depth=2,
    self_per_cross_attn=7,        # self-attention blocks following each cross attention
    weight_tie_layers=False,      # whether to tie weights across layers
    # -- attention heads --
    cross_heads=1,                # heads for cross attention (the paper used 1)
    cross_dim_head=128,           # dimensions per cross-attention head
    latent_heads=8,               # heads for latent self attention
    latent_dim_head=128,          # dimensions per latent self-attention head
    # -- regularisation --
    attn_dropout=0.1,
    ff_dropout=0.1,
    # -- output --
    final_classifier_head=False,  # boolean switch, not a class count: no classifier head is
                                  # attached (the forward pass in this notebook yields a
                                  # (2, 256, 768) tensor, i.e. num_latents x latent_dim per sample)
).to('cuda')

In [3]:
# Total number of parameters in the model. `Tensor.numel()` returns each
# parameter's element count as a plain Python int, so no temporary list and no
# NumPy round-trip (which would yield a NumPy scalar) are needed.
sum(p.numel() for p in model.parameters())

158511108

In [4]:
# Random input of shape (2, 16000, 768): 2 samples, 16000 tokens, and 768
# channels to match `input_channels` of the Perceiver. Allocated directly on
# the GPU so no ~98 MB float32 host tensor is built just to be copied across.
x = torch.rand(2, 16000, 768, device='cuda')

In [5]:
# This forward pass is only used to inspect the output shape, so run it without
# autograd: no graph or saved activations are kept for the 16000-token input.
with torch.no_grad():
    y = model(x)

In [6]:
# Shape check: the recorded result is (2, 256, 768), which lines up with the
# input's first dimension (2), num_latents (256) and latent_dim (768).
y.shape

torch.Size([2, 256, 768])

In [2]:
from collections import namedtuple

from lmnav.common.config import Config
from lmnav.common.registry import registry

from lmnav.models import *
from lmnav.processors import *
from lmnav.common.episode_processor import apply_transforms_inputs

import torch
import einops


In [3]:
def _init_components(cfg_path, device):
    """Build the model and the training-time visual processor from a config file.

    Args:
        cfg_path: path to the experiment YAML handed to ``Config``.
        device: device the instantiated model is moved to.

    Returns:
        ``(model, vis_processor)`` where ``model`` has been moved to ``device``
        and put in training mode, and ``vis_processor`` is created from the
        ``preprocess.vis_processor.train`` section of the config.
    """
    # `Config` is handed a small argparse-like record; apart from `cfg_path`
    # the fields are fixed here ("llama_v2", GPU 0, no option overrides).
    arg_record = namedtuple("Args", "cfg_path, model_type, gpu_id, options")
    cfg = Config(arg_record(cfg_path, "llama_v2", 0, []))

    # The model class is looked up in the registry by the `arch` config entry.
    model_cfg = cfg.model_cfg
    model = registry.get_model_class(model_cfg.arch).from_config(model_cfg)
    model = model.to(device)
    model.train()

    # The processor class is looked up in the registry by its configured name.
    processor_cfg = cfg.config.preprocess.vis_processor.train
    processor_cls = registry.get_processor_class(processor_cfg.name)
    vis_processor = processor_cls.from_config(processor_cfg)

    return model, vis_processor


def test_construct_inputs(B, T):
    goals = torch.rand(B, 1, 3, 480, 640)
    rgbs = torch.rand(B, T, 3, 480, 640)
    actions = torch.randint(0, 4, (B, T))

    return goals, rgbs, actions

In [4]:
# NOTE(review): machine-specific absolute path; the notebook will not run
# elsewhere until this points at a local copy of the experiment config.
cfg_path = "/srv/flash1/pputta7/projects/lm-nav/exp_configs/lora_nav_llama_train.yaml"
device = 'cuda'

# Batch size and number of frames per sample, in the (B, T) convention used by
# `test_construct_inputs`.
B = 2
T = 20

model, vis_processor = _init_components(cfg_path=cfg_path, device=device)

Loading VIT
Loading VIT Done
Loading Q-Former


Using pad_token, but it is not set yet.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Cast the model's floating-point parameters and buffers to fp16. `half()`
# modifies the module in place and returns it, which is why the whole module
# repr is echoed as this cell's output.
# NOTE(review): end the line with `;` if that long repr is not wanted.
model.half()

LinNavLLAMA(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )
 

In [8]:
# Shape smoke test of the visual path: image batch -> visual encoder -> Q-Former.
# Only the output shape is inspected afterwards, so the whole pass runs without
# autograd and no activations are retained for the 100-image batch.
with torch.no_grad():
    # Created directly on the model's device in fp16 (the model was cast with
    # `model.half()`), instead of host float32 -> device copy -> cast.
    x = torch.rand(100, 3, 224, 224, device=device, dtype=torch.float16)
    embds = model.visual_encoder(x)

    # One copy of the model's query tokens per image; `expand` broadcasts along
    # the batch dimension without copying data.
    qtk = model.query_tokens.expand(embds.shape[0], -1, -1)

    out = model.Qformer.bert(
        query_embeds=qtk,
        encoder_hidden_states=embds,
        return_dict=True,
    )

In [11]:
# Shape check: the recorded result is (100, 32, 768), with the leading 100
# matching the number of input images.
out.last_hidden_state.shape

torch.Size([100, 32, 768])

In [13]:
from einops import repeat, rearrange
from torch import einsum

In [53]:
# Toy attention problem: B batches, H heads, D total feature width,
# Nq query positions attending over Nkv key/value positions.
B, H, D = 1, 1, 512
Nkv = 5
Nq = 3

# Random projections, drawn in the order q, k, v.
q = torch.rand(B, Nq, D)
k = torch.rand(B, Nkv, D)
v = torch.rand(B, Nkv, D)

# Split the feature axis into heads and fold the heads into the batch axis.
q, k, v = (rearrange(t, 'b n (h d) -> (b h) n d', h=H) for t in (q, k, v))

# Unscaled dot-product similarity between every query (i) and every key (j).
sim = einsum('b i d, b j d -> b i j', q, k)

In [54]:
# Shape check: the recorded result is (1, 3, 5), i.e. (B * H, Nq, Nkv).
sim.shape

torch.Size([1, 3, 5])

In [56]:
# Most negative finite value of `sim`'s dtype, used as the fill value for
# masked-out positions.
neg = -torch.finfo(sim.dtype).max

# Mask over the Nkv key positions: 1 keeps a key, 0 drops it. Only the last
# key is dropped here.
mask = torch.ones(B, Nkv)
mask[:, -1] = 0

# Flatten any trailing axes, replicate per head, add a singleton query axis so
# the mask broadcasts against `sim` (b, i, j), then convert to boolean.
mask = repeat(
    rearrange(mask, 'b ... -> b (...)'),
    'b j -> (b h) () j',
    h=H,
).bool()
mask

tensor([[[ True,  True,  True,  True, False]]])

In [57]:
# In-place: overwrite the similarity of every dropped key (mask == False) with
# `neg`; `sim` itself is modified and also returned for display.
# NOTE(review): presumably a softmax over the key axis follows, which would
# give these positions ~0 weight -- not shown in this notebook section.
sim.masked_fill_(mask.logical_not(), neg)

tensor([[[ 1.3009e+02,  1.2752e+02,  1.2684e+02,  1.3022e+02, -3.4028e+38],
         [ 1.2227e+02,  1.1984e+02,  1.2209e+02,  1.2447e+02, -3.4028e+38],
         [ 1.3540e+02,  1.2980e+02,  1.2632e+02,  1.2921e+02, -3.4028e+38]]])