In [1]:
# Choose the compute device: CUDA when a GPU is usable, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Device line followed by one blank line.
print('Using device:', device, end='\n\n')

# Extra diagnostics that only exist for a CUDA device (GPU index 0).
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Byte counts are converted to GiB (2**30 bytes) and rounded to one decimal.
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')


Using device: cuda

GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import wandb

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader

from torch.utils import data


import copy
import os
import random
import cv2
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import functools
from tqdm import tqdm
from datetime import datetime
import numpy as np
from core.datasets.vqa_motion_dataset import VQMotionDataset,DATALoader,VQVarLenMotionDataset,MotionCollator,VQFullMotionDataset
from einops import rearrange, reduce, pack, unpack
import librosa
import itertools

In [4]:
from utils.motion_process import recover_from_ric
import visualize.plot_3d_global as plot_3d
from glob import glob
def to_xyz(motion, mean, std, j=22):
    """Convert a normalized motion batch into per-joint 3-vectors.

    The input is moved to the CPU, cast to float32, de-normalized as
    ``motion * std + mean`` and handed to ``recover_from_ric``.  The result is
    reshaped to ``(motion.shape[0], -1, j, 3)``.

    Args:
        motion: Tensor whose first axis is kept as the leading output axis.
        mean: Offset added after scaling (must broadcast against ``motion``).
        std: Scale applied before the offset (must broadcast against ``motion``).
        j: Passed to ``recover_from_ric`` and used as the third output axis;
            presumably the joint count — TODO confirm.

    Returns:
        The reshaped output of ``recover_from_ric``.
    """
    batch_size = motion.shape[0]
    denormalized = motion.cpu().float() * std + mean
    joints = recover_from_ric(denormalized, j)
    return joints.reshape(batch_size, -1, j, 3)

            
def sample_render(motion_xyz, name, save_path):
    """Render a motion batch through ``plot_3d.draw_to_batch``.

    The single output target handed to the renderer is
    ``os.path.join(save_path, name + ".gif")``.

    Args:
        motion_xyz: Object exposing ``.numpy()`` (presumably the tensor
            produced by ``to_xyz`` — TODO confirm).
        name: File stem; ``".gif"`` is appended to it.
        save_path: Directory joined in front of the file name.

    Returns:
        None.
    """
    print("render start")
    gif_path = os.path.join(save_path, name + ".gif")
    # The renderer's return value is not needed here, so it is not kept.
    plot_3d.draw_to_batch(motion_xyz.numpy(), None, [gif_path])



In [5]:
from configs.config import cfg, get_cfg_defaults
from core.models.motion_regressor import MotionRegressorModel


# Start from the project's default config, then overlay the YAML that sits in
# the `trans_768_768_albi_aist_style` checkpoint directory.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# only run where this file exists.
cfg_trans = get_cfg_defaults()
cfg_trans.merge_from_file(
    "/srv/scratch/sanisetty3/music_motion/motion_vqvae/checkpoints/generator/var_len/trans_768_768_albi_aist_style/var_len_768_768_aist_style.yaml"
)



In [6]:
# Build the motion transformer from the merged config and switch it to eval mode.
# NOTE(review): the checkpoint restore below is commented out, so no saved
# weights are loaded into this instance and it is not moved to the GPU here.
trans_model = MotionRegressorModel(
    args=cfg_trans.motion_trans,
    pad_value=1025,
).eval()
# pkg_trans = torch.load(f"/srv/scratch/sanisetty3/music_motion/motion_vqvae/checkpoints/generator/var_len/trans_768_768_albi_aist/trans_motion_best_fid.pt", map_location = 'cpu')
# print(pkg_trans["steps"])
# trans_model.load_state_dict(pkg_trans["model"])
# trans_model =trans_model.cuda()


In [7]:
from core.datasets.vqa_motion_dataset import MotionCollatorConditional,VQVarLenMotionDatasetConditional, TransMotionDatasetConditional,MotionCollatorConditionalStyle


In [8]:
# hml_train_ds = TransMotionDatasetConditional(dataset_name = "t2m", split = "val",datafolder="joint_indices_max_400",data_root = "/srv/scratch/sanisetty3/music_motion/HumanML3D/HumanML3D" , window_size = 400)



In [9]:
# Training split of the AIST data, built with the project's conditional
# transformer dataset class.
# NOTE(review): data_root is an absolute, machine-specific path.
aist_train_ds = TransMotionDatasetConditional(
    "aist",
    split="train",
    data_root="/srv/scratch/sanisetty3/music_motion/AIST",
    datafolder="joint_indices_max_400",
    musicfolder="music",
    window_size=400,
)



100%|██████████| 1910/1910 [00:01<00:00, 1335.26it/s]

Total number of motions 1910





In [10]:
import clip

# Load CLIP on the device selected in the first cell (`device`) instead of a
# hard-coded CUDA device, so the notebook also runs on a CPU-only machine.
# Must set jit=False for training
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device, jit=False)
# Freeze CLIP: eval mode plus requires_grad=False on every parameter.
# Written as an assignment so the cell displays no output, and without a
# manual loop so no loop variable is left behind in the notebook namespace.
clip_model = clip_model.eval().requires_grad_(False)

In [11]:
# Collator that receives the frozen CLIP model plus the special token ids.
# The pad id (1025) is the same value passed as `pad_value` when
# MotionRegressorModel is constructed.
collate_fn2 = MotionCollatorConditionalStyle(
    clip_model=clip_model,
    bos=1024,
    pad=1025,
    eos=1026,
)


In [20]:
dl = DATALoader(aist_train_ds , batch_size = 1,collate_fn=collate_fn2 )


In [21]:
# Fetch exactly one batch from the loader.
# `next(iter(dl))` raises StopIteration when the loader yields nothing, so a
# stale `reg_batch` left over from an earlier run can never be silently reused.
reg_batch = next(iter(dl))
print(reg_batch["motion_lengths"])
print(reg_batch["names"])
# Every value in the batch is expected to expose `.shape`.
for k, v in reg_batch.items():
    print(k, v.shape)

tensor([191.])
['gBR_sBM_cAll_d04_mBR2_ch05']
motion torch.Size([1, 192])
motion_lengths torch.Size([1])
motion_mask torch.Size([1, 192])
names (1,)
condition torch.Size([1, 191, 128])
condition_mask torch.Size([1, 191])
style torch.Size([1, 512])


## Style only aist

In [22]:
# Shifted views of the same token sequence along axis 1:
# `inp` drops the last position, `target` drops the first.
inp = reg_batch["motion"][:, :-1]
target = reg_batch["motion"][:, 1:]

In [23]:
# trans_model = MotionRegressorModel(args = cfg_trans.motion_trans,pad_value=1025 ).eval()

In [24]:
# Forward pass on the shifted input; the motion mask is trimmed by one
# position along axis 1 to line up with `inp`.
logits = trans_model(
    motion=inp,
    mask=reg_batch["motion_mask"][:, :-1],
    context=reg_batch["condition"],
    context_mask=reg_batch["condition_mask"],
    style_context=reg_batch["style"],
)

In [25]:
# Random seed token for generation: one integer id drawn from [0, 1024),
# shaped (1, 1).
start = torch.randint(low=0, high=1024, size=(1, 1))

In [26]:
reg_batch["condition"].shape

torch.Size([1, 191, 128])

In [27]:
# Generate from the random start token, conditioned on the batch's
# condition and style tensors; the second positional argument is 100.
# Left as a bare expression so the result is displayed by the cell.
trans_model.generate(
    start,
    100,
    context=reg_batch["condition"],
    context_mask=reg_batch["condition_mask"],
    style_context=reg_batch["style"],
)

100%|██████████| 100/100 [00:13<00:00,  7.53it/s]


tensor([[878, 254,   3, 180, 562, 666,  18,  24, 278, 146, 850,  97, 153, 910,
         420, 126, 159, 572, 345,  82, 666,  86, 732, 365, 850, 963,  69, 542,
          97, 906, 808, 512, 115, 768, 126, 834, 554, 554, 420, 834,  21, 454,
           3, 824,  78, 115, 448, 844, 344, 115, 881, 144, 146, 207, 554, 685,
         146,  31, 682, 180, 666, 613,  24,  41, 604, 512, 763, 977,  18, 910,
         732, 666,  18, 844, 463, 116, 115, 560,  24, 348, 990, 590,  69,   3,
          86, 850, 946,  78,  18, 159,  18, 254, 278, 345, 159, 473, 834, 706,
         963, 159, 303]])

In [30]:
# Tokenize the prompt and place the token ids on whichever device the CLIP
# weights live on, rather than assuming CUDA.
text = clip.tokenize(["dance"], truncate=True).to(next(clip_model.parameters()).device)
# Text embedding of the prompt from the CLIP text encoder.
encodings = clip_model.encode_text(text)

In [36]:
encodings.reshape(-1).shape

torch.Size([512])

In [None]:
class DenseFiLM(nn.Module):
    """Feature-wise linear modulation (FiLM) generator.

    A Mish activation followed by a linear layer widens the input from
    ``embed_channels`` to ``2 * embed_channels``.  The result gets a singleton
    axis inserted after the batch axis and is split into two halves along the
    last axis.
    """

    def __init__(self, embed_channels):
        """Create the Mish -> Linear stack for ``embed_channels``-wide inputs."""
        super().__init__()
        self.embed_channels = embed_channels
        layers = [
            nn.Mish(),
            nn.Linear(embed_channels, 2 * embed_channels),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, position):
        """Map a ``(b, c)`` embedding to two ``(b, 1, c)`` tensors.

        Args:
            position: 2-D tensor; the ``"b c -> b 1 c"`` pattern rejects any
                other rank.

        Returns:
            The two halves of the widened embedding, as produced by
            ``chunk(2, dim=-1)`` (presumably scale first, then shift — TODO
            confirm against the caller).
        """
        film_params = rearrange(self.block(position), "b c -> b 1 c")
        return film_params.chunk(2, dim=-1)



In [135]:
from core.models.attention import Attention

In [181]:
reg_batch["style"].shape

torch.Size([2, 512])

In [14]:
# Linear map from the 512-wide style embedding to width 768, the `dim` used
# for the Attention layer in this notebook.
proj = nn.Linear(in_features=512, out_features=768)

In [15]:
style  = proj(reg_batch["style"])

In [145]:
reg_batch["condition"]

torch.Size([2, 191, 35])

In [136]:
att = Attention(dim = 768)

In [155]:
# Smoke-test the attention layer on random (2, 191, 768) tensors for both
# the query input and the context; the masks come from the current batch.
# Only element 0 of what the layer returns is kept.
out = att(
    x=torch.randn((2, 191, 768)),
    context=torch.randn((2, 191, 768)),
    mask=reg_batch["motion_mask"][:, :-1],
    context_mask=reg_batch["condition_mask"],
)[0]

In [156]:
out[0].shape

torch.Size([191, 768])

In [17]:
# Shifted views of the token sequence along axis 1: `inp` omits the final
# position and `target` omits the first.
inp = reg_batch["motion"][:, :-1]
target = reg_batch["motion"][:, 1:]

In [20]:
# Forward pass with the condition and style inputs; the motion mask is
# trimmed by one position along axis 1 to match `inp`.
logits = trans_model(
    motion=inp,
    mask=reg_batch["motion_mask"][:, :-1],
    context=reg_batch["condition"],
    context_mask=reg_batch["condition_mask"],
    style_context=reg_batch["style"],
)

In [21]:
logits.shape

torch.Size([2, 191, 1027])

In [27]:
len(reg_batch["names"][0].split("_"))

6

In [29]:
"t2m" in ["t2m" , "aist"]

True

In [40]:
random.choice(["Dance1" , "Dance2"])

'Dance2'