In [1]:
# setting device on GPU if available, else CPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

NVIDIA GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import torch
import numpy as np
import os
import torch.nn as nn
from tqdm import tqdm
import json
from functools import partial
from torch import einsum, nn
import torch.nn.functional as F

In [4]:
def findAllFile(base):
    file_path = []
    for root, ds, fs in os.walk(base, followlinks=True):
        for f in fs:
            fullname = os.path.join(root, f)
            file_path.append(fullname)
    return file_path



In [5]:
# from utils.motion_processing.hml_process import recover_from_ric, recover_root_rot_pos,recover_from_rot
import utils.vis_utils.plot_3d_global as plot_3d
import matplotlib.pyplot as plt

def vis(mot , dset , name = "motion"):

    if isinstance(mot , torch.Tensor):
        mot = dset.toMotion(mot)
    mot =dset.inv_transform(mot)



    xyz = np.array(dset.to_xyz(mot).cpu())

    print(xyz.shape)

    
    plot_3d.render(xyz , f"/srv/hays-lab/scratch/sanisetty3/music_motion/ATCMG/render/{name}.gif")

In [8]:
from configs.config import cfg, get_cfg_defaults

cfg = get_cfg_defaults()
cfg.merge_from_file("/srv/hays-lab/scratch/sanisetty3/music_motion/ATCMG/checkpoints/vqvae/vqvae_body/vqvae_body.yaml")

In [9]:
from core.datasets.conditioner import ConditionProvider, ConditionFuser
from core.datasets.multimodal_dataset import MotionAudioTextDataset, load_dataset, simple_collate

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from core.models.attend import Attention
from core import AttentionParams
from core import AttentionParams, TranslationTransformerParams, PositionalEmbeddingParams, PositionalEmbeddingType, MotionRep, AudioRep, TextRep


In [11]:
# from core.models.generation.translation_transformer import TranslationTransformer
# translation_tranformer = TranslationTransformer(cfg.translation_transformer).cuda()

In [12]:
dataset_args = cfg.dataset
# condition_provider = ConditionProvider(
#             motion_rep=MotionRep(dataset_args.motion_rep),
#             audio_rep=AudioRep(dataset_args.audio_rep),
#             text_rep=TextRep(dataset_args.text_rep),
#             motion_padding=dataset_args.motion_padding,
#             audio_padding=dataset_args.audio_padding,
#             motion_max_length_s=10,
#             audio_max_length_s=10,
#         )

condition_provider2 = ConditionProvider(
            motion_rep=MotionRep(dataset_args.motion_rep),
            motion_padding=dataset_args.motion_padding,

        )

In [13]:
from core.datasets.multimodal_dataset import MotionAudioTextDataset
from core.datasets.vq_dataset import VQSMPLXMotionDataset
from core.datasets.vq_dataset import simple_collate as simple_collate2
from core import Motion

from utils.motion_processing.skeleton import Skeleton, t2m_kinematic_chain , body_joints_id, t2m_raw_body_offsets

In [14]:
# dset = MotionAudioTextDataset("moyo" , "/srv/hays-lab/scratch/sanisetty3/motionx" ,motion_rep = "body" , hml_rep = "gprvc", split = "test"   )

In [16]:
dset = VQSMPLXMotionDataset("choreomaster" , "/srv/hays-lab/scratch/sanisetty3/motionx" ,motion_rep = "body" , hml_rep = "rv", split = "train" , window_size = 600  )

Total number of motions choreomaster: 34


In [20]:
dset.motion_dim

192

In [17]:
train_loader = torch.utils.data.DataLoader(
        dset,
        4,
        # sampler=sampler,
        collate_fn=partial(simple_collate2 , conditioner = condition_provider2),
        drop_last=True,
    )

In [18]:
for inputs in train_loader:
    break
    

In [19]:
train_loader.batch_size

4

In [103]:
22*6

132

In [102]:
mot = inputs["motion"][0]
mot.shape

torch.Size([4, 300, 126])

In [62]:
dset.render_hml(mot[0] , "/srv/hays-lab/scratch/sanisetty3/music_motion/ATCMG/render/r.gif")

In [85]:
hml_rep = dset.hml_rep
motion_rep = dset.motion_rep
nb_joints = dset.nb_joints

In [86]:
split_seq = []

if "g" in hml_rep:
    split_seq.append(4)
if "p" in hml_rep:
    if motion_rep == MotionRep.BODY or motion_rep == MotionRep.FULL:
        split_seq.append((nb_joints - 1) * 3)
    else:
        split_seq.append((nb_joints) * 3)
if "r" in hml_rep:
    if motion_rep == MotionRep.BODY or motion_rep == MotionRep.FULL:
        split_seq.append((nb_joints - 1) * 6)
    else:
        split_seq.append((nb_joints) * 6)
if "v" in hml_rep:
    split_seq.append(nb_joints * 3)
if "c" in hml_rep:
    split_seq.append(4)


In [94]:
import utils.rotation_conversions as geometry


torch.Size([4, 300, 21, 6])

In [111]:
geometry.rotation_6d_to_matrix(mot.view(-1, 21 , 6).contiguous()).shape

torch.Size([1200, 21, 3, 3])

In [27]:
from core.models.resnetVQ.vqvae import HumanVQVAE


In [23]:
vqvae_args = cfg.vqvae

In [24]:
vqvae_args.nb_joints = dset.nb_joints
vqvae_args.motion_dim = dset.motion_dim

In [26]:
vqvae_model = HumanVQVAE(vqvae_args).to(device)

In [38]:
from core.models.loss import ReConsLoss

loss_fnc = ReConsLoss("l1_smooth" , True , vqvae_args.nb_joints , hml_rep=dset.hml_rep , motion_rep = dset.motion_rep  )

In [30]:
motion = inputs["motion"][0].to(device)

In [31]:
pred = vqvae_model(motion)

In [33]:
pred.decoded_motion.shape

torch.Size([4, 300, 192])

In [40]:
loss = loss_fnc(pred.decoded_motion , motion)

In [26]:
mottt = dset.toMotion(mot)

In [93]:
vis(mot[0] , dset2)

(120, 22, 3)


In [99]:
mot.dtype

torch.float32

In [82]:

sap = AttentionParams(dim = 256 , causal=True)
cap = AttentionParams(dim = 256 , causal=True , add_null_kv=True)
transformer_params = TranslationTransformerParams(self_attention_params = sap , 
                                                  cross_attention_params = cap , 
                                                  depth = 1, 
                                                  positional_embedding_params=PositionalEmbeddingParams(dim = 256) , 
                                                  positional_embedding=PositionalEmbeddingType.SINE,
                                                  fuse_method = {"cross_seperate" : ["audio" , "text"]}
                                                 )

In [62]:
from core.models.generation.translation_transformer import ClassifierFreeGuidanceDropout, TransformerBlock

In [63]:
dim = 256

In [64]:
pos_emb = ScaledSinusoidalEmbedding(dim , theta = 10000)
cfg_dropout = ClassifierFreeGuidanceDropout(
            0.0
        )
condition_fuser = ConditionFuser({"cross" : ["audio"] ,   "prepend" : ["text"]})

In [65]:
transformer_blocks = TransformerBlock(sap , cap).cuda()

In [66]:
project_audio = (
    nn.Linear(128, dim).cuda()
    
)
project_text = (
    nn.Linear(768, dim).cuda()
    
)

In [67]:
motion = inputs["motion"][0]
motion_padding_mask = inputs["motion"][1]
device, b, n , d = motion.device, *motion.shape
translation = motion[... , :4]

In [68]:
x = (
    pos_emb(translation).repeat(b, 1, 1) * motion_padding_mask.unsqueeze(-1),
    motion_padding_mask,
)


In [69]:
conditions = cfg_dropout(conditions , 0.2)


In [70]:
audio_embed = project_audio(conditions["audio"][0])
text_embed = project_text(conditions["text"][0])


In [71]:
conditions["audio"] = (audio_embed, conditions["audio"][1])
conditions["text"] = (text_embed, conditions["text"][1])


In [72]:
conditions["audio"][0].shape

torch.Size([4, 1, 256])

In [75]:
inputs_, cross_inputs_ = condition_fuser(x, conditions)

In [78]:
x_ = inputs_[0]
x_padding_mask = inputs_[1]
context = cross_inputs_[0]
context_padding_mask = cross_inputs_[1]

In [81]:
x_.shape

torch.Size([4, 153, 256])

In [82]:
x_padding_mask.shape

torch.Size([4, 153])

In [83]:
context.shape

torch.Size([4, 1, 256])

In [84]:
context_padding_mask.shape

torch.Size([4, 1])

In [85]:
conditions["text"][1]

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  Tr

In [86]:
x_padding_mask

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  

In [87]:
embed = transformer_blocks(
            x=x_,
            mask=x_padding_mask,
            context=context,
            context_mask=context_padding_mask,
        )

In [89]:
embed = embed[:, -n:, :]

In [90]:
embed.shape

torch.Size([4, 116, 256])

In [143]:
pos_emb = ScaledSinusoidalEmbedding(PositionalEmbeddingParams(dim = 256))

In [161]:
b , n , _ = input["motion"][0].shape

In [167]:

x = pos_emb(input["motion"][0]).repeat(b , 1 ,1)


In [173]:
x.shape

torch.Size([4, 116, 256])

In [None]:
for a, (b , c) in conditions.items():
    print(a)
    print(b.shape)

In [175]:
inputs_ , cross_inputs = condition_fuser(x , conditions  )

In [176]:
inputs_.shape

torch.Size([4, 153, 256])

In [177]:
cross_inputs.shape

torch.Size([4, 500, 256])