In [56]:
import json
import glob
import os
import time
import math
import pickle
from contextlib import nullcontext
import pprint

import numpy as np
import torch
import torch._dynamo
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

import rt1_model_v2 as rt1_model
import rt1_dataset
from utils_file import get_filename
import matplotlib.pyplot as plt

import random
from tqdm import tqdm

from functools import reduce
import importlib

## Test model forward pass

In [51]:
# Reload the rt1_model module so that edits to its source file take effect in
# this kernel without a restart.
importlib.reload(rt1_model)

<module 'rt1_model' from '/home/user/Documents/projects/osil/rt1_model.py'>

In [52]:
# A bare hasattr() here would be silently discarded (only the last expression
# of a cell is displayed), so assert instead and fail with a clear message.
assert hasattr(torch.nn.functional, 'scaled_dot_product_attention'), (
    "torch.nn.functional.scaled_dot_product_attention is not available in this PyTorch build"
)
# Display the function object itself as the cell output.
torch.nn.functional.scaled_dot_product_attention

<function torch._C._nn.scaled_dot_product_attention>

In [61]:
# configuration

# training config
# Prefer the GPU, but fall back to the CPU so this cell still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'float32' # must be a key of the ptdtype lookup below: 'float32', 'bfloat16' or 'float16'

# model
n_layer: int = 6 # 12
n_head: int = 8 # 12
# n_embd = 768
n_embd: int = 128
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias = False # do we use bias inside LayerNorm and Linear layers?

# data
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 1
num_keyframes=5

# torch
seed_offset = 0
# fixed base seed so runs are reproducible (the value itself is arbitrary)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# autocast is only set up for non-CPU devices; on CPU a no-op context manager is used
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
# surface torch.compile (dynamo) errors instead of suppressing them, with verbose diagnostics
torch._dynamo.config.suppress_errors = False
torch._dynamo.config.verbose=True
print(device, ptdtype, ctx)

cuda torch.float32 <torch.amp.autocast_mode.autocast object at 0x7f69ef022e80>


In [62]:
# Open question: why does torch.compile fail for these modules?
#   - ImageFeatureEncoder
#   - AttentionEncoderBlock
#   - AttentionDecoderBlock
model_args = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'dropout': dropout,
    'bias': bias,
    'block_size_key': num_keyframes,  # encoder block size
    'block_size_obs': block_size,  # decoder block size
    'vocab_size': 100,
    'n_tokens_per_frame': 8,  # n_tokens per image feature
    'feature_dim': 1280,  # image feature dimension
}

# Build TOSIL from its config, move it to the target device, then wrap it with torch.compile.
model = torch.compile(
    rt1_model.TOSIL(rt1_model.TOSILConfig(**model_args)).to(device)
)

number of parameters: 1.36M


In [63]:
# All-ones dummy inputs for a smoke test. Sizes and device come from the
# configuration and model_args instead of repeated literals, so they stay in
# sync when those values are changed.
# NOTE(review): assumes dim 1 of X is the keyframe count (block_size_key) and
# dim 1 of Y is the observation block size (block_size_obs) -- confirm against rt1_model.
# The trailing 3x3 dims and the action width of 4 are kept as in the original cell.
X = torch.ones(batch_size, num_keyframes, model_args['feature_dim'], 3, 3, device=device, dtype=torch.float32)
Y = torch.ones(batch_size, block_size, model_args['feature_dim'], 3, 3, device=device, dtype=torch.float32)
actions = torch.ones(batch_size, 4, device=device, dtype=torch.float32)

In [64]:
# Single forward pass through the compiled model on the dummy inputs; it
# returns the predicted actions and the loss.
# NOTE(review): torch.compile compiles lazily, so this first call likely
# includes the compilation cost.
action_pred, loss = model(X, Y, actions)



In [65]:
# Inspect the prediction's shape; the recorded run shows (12, 4), the same
# shape as `actions`.
action_pred.shape

torch.Size([12, 4])

## TOSIL Performance Test

In [None]:
def rough_fun_timing(fun, *args, **kwargs):
    """Call ``fun(*args, **kwargs)`` once and print the elapsed time in milliseconds.

    This is a single-shot, rough measurement (no warm-up, no averaging).

    Args:
        fun: Callable to time.
        *args: Positional arguments forwarded to ``fun``.
        **kwargs: Keyword arguments forwarded to ``fun``.

    Returns:
        None. The return value of ``fun`` is discarded; the timing is printed.
    """
    # CUDA kernels are launched asynchronously, so without synchronizing the
    # timer could stop before the GPU has actually finished the work.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()  # drain previously queued work before starting the clock
    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    fun(*args, **kwargs)
    if use_cuda:
        torch.cuda.synchronize()  # wait for the work launched by fun to complete
    end = time.perf_counter()
    print(f"Time taken: {(end - start) * 1000} ms")

In [None]:
def grad_step():
    """Run one forward pass of the global ``model`` on ``X``, ``Y`` and ``actions``, then backpropagate the loss."""
    # Only the loss is needed here; the predicted actions are ignored.
    _, step_loss = model(X, Y, actions)
    step_loss.backward()

In [None]:
# Time a single forward + backward pass.
# NOTE(review): the first invocation may include one-off torch.compile warm-up
# cost; re-run the cell for a more representative number.
rough_fun_timing(grad_step)