In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# `end` supplies the blank separator line that follows the device report.
print('Using device:', device, end='\n\n')

# On CUDA, also report the name of GPU 0 and its current memory use in GiB.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0), 'Memory Usage:', sep='\n')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024**3, 1), 'GB')


Using device: cuda

NVIDIA GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import torch
import numpy as np
import os
import torch.nn as nn

In [5]:
from modeling_clamp import ClampModel
from configuration_clamp import ClampConfig

from feature_extraction_clamp import ClampFeatureExtractor
from transformers import RobertaTokenizer
from processing_clamp import ClampProcessor
# NOTE(review): the `load_dataset` name bound here is rebound two lines below by
# the `clamp_dataset` import, so `datasets.load_dataset` is not reachable under
# this name afterwards. Alias or drop one of the two imports.
from datasets import load_dataset, Audio

# This later import wins: `load_dataset` now refers to clamp_dataset.load_dataset.
from clamp_dataset import load_dataset, simple_collate
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# from modeling_clap import ClapModel
# from transformers import AutoProcessor

In [7]:
def findAllFile(base):
    """Recursively list every file located under ``base``.

    Directory symlinks are followed during the walk.

    Args:
        base: Root directory to search.

    Returns:
        A list of paths (each walked directory joined with a file name), in
        the order produced by ``os.walk``. Empty when nothing is found.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(base, followlinks=True)
        for filename in filenames
    ]



In [8]:
from training_config import cfg, get_cfg_defaults


In [9]:
# Start from the default training config, then overlay the per-run YAML on top.
# This rebinds `cfg`, replacing the object imported from training_config.
cfg = get_cfg_defaults()

# The run name selects both the checkpoint folder and the YAML file inside it.
nme = "clamp_enc"
path = f"/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/checkpoints/{nme}/{nme}.yaml"

print("loading config from:", path)
# NOTE(review): the recorded run of this cell failed with a YAML ScannerError
# (a stray '`' at line 20, column 1 of the YAML file); fix the file before re-running.
cfg.merge_from_file(path)

loading config from: /srv/hays-lab/scratch/sanisetty3/music_motion/clamp/checkpoints/clamp_enc/clamp_enc.yaml


ScannerError: while scanning for the next token
found character '`' that cannot start any token
  in "<unicode string>", line 20, column 1:
    `
    ^

In [9]:
# Load the model configuration from the local pretrained directory.
clamp_config = ClampConfig.from_pretrained("/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp")


In [10]:
# Load the model from the local pretrained directory, with use_safetensors=True.
# The recorded load warning lists motion_projection.project_in.{bias,weight} as
# missing from the checkpoint and therefore newly initialized.
clamp_model = ClampModel.from_pretrained("/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp", use_safetensors=True)


Some weights of ClampModel were not initialized from the model checkpoint at /srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp and are newly initialized: ['motion_projection.project_in.bias', 'motion_projection.project_in.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [133]:
# clamp_model.load_state_dict("/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp/model.safetensors")

In [11]:
# Display the weight parameter of the motion quantizer's codebook.
clamp_model.motion_model.quantizer.codebook.weight

Parameter containing:
tensor([[ 0.4886, -0.6965,  0.8278,  ...,  0.7765, -0.5237, -0.6707],
        [-0.1029, -0.4814, -1.3239,  ...,  0.6291, -0.5265, -0.9828],
        [ 0.2206, -0.1616, -0.2627,  ..., -0.2812,  0.8615, -1.7830],
        ...,
        [ 0.1758,  0.1818,  0.0020,  ..., -0.3373,  1.3992, -1.6280],
        [-0.3789,  1.2745, -0.1851,  ..., -0.6624,  0.7807, -0.0992],
        [-0.7875,  0.6957,  0.3103,  ...,  0.8619,  0.1014, -1.4184]],
       requires_grad=True)

In [13]:
# Build the processor from its two parts, both loaded from the local pretrained
# directory: the feature extractor and the RoBERTa tokenizer.
clamp_feature_extractor = ClampFeatureExtractor.from_pretrained("/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp/")
tokenizer = RobertaTokenizer.from_pretrained("/srv/hays-lab/scratch/sanisetty3/music_motion/clamp/clamp")
# The processor wraps the feature extractor and the tokenizer together.
clamp_processor = ClampProcessor(clamp_feature_extractor , tokenizer )




In [25]:
# `load_dataset` here is the clamp_dataset version (the later of the two imports
# of that name). It returns three values; `w` is not used in the cells shown.
dset , sampler, w = load_dataset("/srv/hays-lab/scratch/sanisetty3/motionx")

Total number of motions animation: 268 and texts 268


In [26]:
# Two-sample batches drawn with the dataset's own sampler. The collate function
# gets the processor bound in up front via functools.partial. Incomplete final
# batches are dropped.
train_loader = torch.utils.data.DataLoader(
    dataset=dset,
    batch_size=2,
    sampler=sampler,
    collate_fn=partial(simple_collate, clamp_processor=clamp_processor),
    drop_last=True,
)

In [27]:
# Fetch exactly one batch for interactive inspection.
# next(iter(...)) raises StopIteration if the loader yields nothing, instead of
# silently leaving `batch` undefined (or stale from an earlier run) as a
# for/break loop would.
batch = next(iter(train_loader))

In [29]:
# Forward pass on the sampled batch: every batch entry is passed as a keyword
# argument, plus return_loss=True.
# NOTE(review): presumably return_loss=True makes the model also compute its
# training loss — confirm against ClampModel.forward.
out = clamp_model(**batch , return_loss = True)

In [28]:
# List the fields present in the sampled batch.
batch.keys()

dict_keys(['input_motion_features', 'motion_mask', 'input_ids', 'attention_mask', 'names', 'texts'])

In [32]:
# Turn the text-vs-motion logits into probabilities by normalising over the last axis.
logits_per_text_vs_motion = out.logits_per_text_vs_motion
probs = torch.softmax(logits_per_text_vs_motion, dim=-1)

In [33]:
# Display the text-vs-motion probabilities (softmax over the last axis of the logits).
probs

tensor([[0.6821, 0.3179],
        [0.3036, 0.6964]], grad_fn=<SoftmaxBackward0>)

In [34]:
# Same normalisation for the motion-vs-text logits: softmax over the last axis.
logits_per_motion_vs_text = out.logits_per_motion_vs_text
probs2 = torch.softmax(logits_per_motion_vs_text, dim=-1)

In [37]:
# Diagonal of probs2: entry i is the probability at row i, column i.
# NOTE(review): presumably this is the score of each motion against its own
# paired text — confirm that the batch order pairs them this way.
torch.diag(probs2)

tensor([0.7158, 0.6615], grad_fn=<DiagonalBackward0_copy>)

In [42]:
# Re-check which fields the current `batch` carries.
# NOTE(review): the recorded output here has no 'texts' key, unlike the earlier
# batch.keys() cell, so the two cells were run against different batches or
# collate versions.
batch.keys()

dict_keys(['input_motion_features', 'motion_mask', 'input_ids', 'attention_mask', 'names'])

In [123]:
# Show the 'names' entry of the current batch.
batch["names"]

array(['animation/subset_0001/Ways_To_Jump_Sit_Fall_Fist_Pump',
       'animation/subset_0000/Ways_To_Catch_Coin_Toss'], dtype='<U53')

In [38]:
# Display the motion model's codebook module.
# NOTE(review): other cells reach the codebook via `motion_model.quantizer.codebook`;
# this cell uses `motion_model.codebook`. Confirm which attribute path the
# current model definition exposes.
clamp_model.motion_model.codebook

Embedding(1024, 512)

In [172]:
# Sum of the first sample's motion mask, as a plain int.
# Uses the vectorised .sum() rather than the builtin sum(), which would iterate
# the mask one element at a time.
# NOTE(review): presumably this is the count of valid (unmasked) frames — confirm
# the mask convention.
n = int(batch["motion_mask"][0].sum())

In [15]:
# Run only the motion sub-model on the batch's motion features and mask.
# return_dict=True so the result fields can be read by attribute.
motion_outputs = clamp_model.motion_model(
    input_features=batch["input_motion_features"],
    motion_mask=batch["motion_mask"],
    return_dict=True,
)

In [19]:
# Shape of the quantized motion output.
motion_outputs.motion_quantized.shape

torch.Size([2, 45, 512])

In [17]:
# Shape of the pooled motion output.
motion_outputs.pooler_output.shape

torch.Size([2, 512])

In [178]:
# First n//4 rows of the first sample's motion embeddings, where `n` is the
# first sample's mask sum.
# NOTE(review): the //4 presumably mirrors a 4x temporal downsampling in the
# motion encoder — confirm.
motion_outputs.motion_embeds[0][: n//4]

tensor([[-0.3476, -0.3848, -0.3382,  ...,  1.2374,  1.4674, -2.1366],
        [-0.6643, -0.1604, -1.1021,  ...,  0.5524, -1.2188, -2.4285],
        [-0.4812, -0.7574, -1.0271,  ..., -0.4936,  1.6141, -2.4648],
        ...,
        [-0.7409, -0.1642, -0.5005,  ..., -0.7184,  0.2362, -0.9957],
        [-1.1092,  0.1609, -1.3755,  ..., -0.2261,  1.0162, -2.4969],
        [-1.0156,  0.1001, -1.6934,  ...,  0.4242,  1.1925, -2.5306]],
       grad_fn=<SliceBackward0>)

In [177]:
# First n//4 rows of the first sample's quantized motion output, to compare
# against the matching motion_embeds slice.
motion_outputs.motion_quantized[0][: n//4]

tensor([[ 0.7374,  0.1091, -0.9001,  ...,  0.9471,  0.4839, -1.7853],
        [-0.6063, -0.2402, -1.3614,  ..., -1.4479, -0.4712, -2.0911],
        [ 0.5954, -0.6793, -0.2143,  ..., -0.7939,  1.1101, -2.4060],
        ...,
        [ 0.4130, -1.0651,  0.4501,  ..., -0.1769,  0.1579, -1.4100],
        [ 0.1780, -0.5427, -0.3067,  ..., -0.3478,  0.5527, -2.9065],
        [-0.0870, -0.5428, -0.8138,  ..., -0.3544,  0.1502, -2.6137]])

### OG (original)

In [129]:
# Full motion embeddings of the first sample in the batch.
motion_outputs.motion_embeds[0]

tensor([[-0.4184, -0.4447, -0.3431,  ...,  1.2462,  1.4885, -2.1719],
        [-0.4815, -0.2334, -1.2013,  ...,  0.5483, -1.2567, -2.4094],
        [-0.3224, -0.8071, -1.0123,  ..., -0.5272,  1.6135, -2.1802],
        ...,
        [-0.4755, -0.4051, -0.1536,  ..., -1.0224,  0.1503, -1.9890],
        [-1.7348,  0.5145, -1.3978,  ...,  0.0789,  0.8598, -2.9776],
        [-0.4824, -0.1940, -1.5952,  ...,  0.2725,  0.6000, -2.4318]],
       grad_fn=<SelectBackward0>)

In [130]:
# Full quantized motion output of the first sample in the batch.
motion_outputs.motion_quantized[0]

tensor([[ 0.7374,  0.1091, -0.9001,  ...,  0.9471,  0.4839, -1.7853],
        [-0.4930, -0.5167, -0.8416,  ..., -0.0847, -0.4262, -1.8722],
        [ 0.5954, -0.6793, -0.2143,  ..., -0.7939,  1.1101, -2.4060],
        ...,
        [ 0.4130, -1.0651,  0.4501,  ..., -0.1769,  0.1579, -1.4100],
        [ 0.0607, -0.4803, -0.3824,  ..., -0.3225,  0.4422, -2.5939],
        [ 0.6482, -0.8899, -0.6191,  ...,  0.0393,  0.8039, -2.2377]])

NameError: name 'batch' is not defined

## Load model

In [151]:
# Print the name of every parameter registered on the model.
# The loop variable is deliberately not called `n`: in a notebook it would leak
# into the global namespace and overwrite the integer `n` that other cells use
# for slicing (`[: n//4]`).
for param_name, _param in clamp_model.named_parameters():
    print(param_name)

logit_scale_a
logit_scale_t
logit_scale_m
text_model.embeddings.word_embeddings.weight
text_model.embeddings.position_embeddings.weight
text_model.embeddings.token_type_embeddings.weight
text_model.embeddings.LayerNorm.weight
text_model.embeddings.LayerNorm.bias
text_model.encoder.layer.0.attention.self.query.weight
text_model.encoder.layer.0.attention.self.query.bias
text_model.encoder.layer.0.attention.self.key.weight
text_model.encoder.layer.0.attention.self.key.bias
text_model.encoder.layer.0.attention.self.value.weight
text_model.encoder.layer.0.attention.self.value.bias
text_model.encoder.layer.0.attention.output.dense.weight
text_model.encoder.layer.0.attention.output.dense.bias
text_model.encoder.layer.0.attention.output.LayerNorm.weight
text_model.encoder.layer.0.attention.output.LayerNorm.bias
text_model.encoder.layer.0.intermediate.dense.weight
text_model.encoder.layer.0.intermediate.dense.bias
text_model.encoder.layer.0.output.dense.weight
text_model.encoder.layer.0.output.

In [145]:
# Load the reference checkpoint (pytorch_model.bin) from the local download cache.
# map_location="cpu" keeps the tensors on the CPU whatever device they were saved
# from, so the load neither needs nor allocates GPU memory.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
# Consider weights_only=True where the installed torch version supports it.
model_og = torch.load(
    "/srv/hays-lab/scratch/sanisetty3/huggingface_downloads/models--laion--larger_clap_general/snapshots/ada0c23a36c4e8582805bb38fec3905903f18b41/pytorch_model.bin",
    map_location="cpu",
)

In [152]:
# Print every key stored in the reference checkpoint.
for key in model_og.keys():
    print(key)

logit_scale_a
logit_scale_t
text_model.embeddings.position_ids
text_model.embeddings.token_type_ids
text_model.embeddings.word_embeddings.weight
text_model.embeddings.position_embeddings.weight
text_model.embeddings.token_type_embeddings.weight
text_model.embeddings.LayerNorm.weight
text_model.embeddings.LayerNorm.bias
text_model.encoder.layer.0.attention.self.query.weight
text_model.encoder.layer.0.attention.self.query.bias
text_model.encoder.layer.0.attention.self.key.weight
text_model.encoder.layer.0.attention.self.key.bias
text_model.encoder.layer.0.attention.self.value.weight
text_model.encoder.layer.0.attention.self.value.bias
text_model.encoder.layer.0.attention.output.dense.weight
text_model.encoder.layer.0.attention.output.dense.bias
text_model.encoder.layer.0.attention.output.LayerNorm.weight
text_model.encoder.layer.0.attention.output.LayerNorm.bias
text_model.encoder.layer.0.intermediate.dense.weight
text_model.encoder.layer.0.intermediate.dense.bias
text_model.encoder.laye

In [147]:
# Load the motion VQ-VAE training checkpoint and keep only its "model" entry.
# map_location="cpu" keeps the tensors on the CPU whatever device they were saved
# from, so the load neither needs nor allocates GPU memory.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
model_motion = torch.load(
    "/srv/hays-lab/scratch/sanisetty3/music_motion/ACMG/checkpoints/smplx_resnet/checkpoints/vqvae_motion.300000.pt",
    map_location="cpu",
)["model"]

In [150]:
# Print every key stored in the motion checkpoint's "model" entry.
for key in model_motion.keys():
    print(key)

vqvae.encoder.model.0.weight
vqvae.encoder.model.0.bias
vqvae.encoder.model.2.0.weight
vqvae.encoder.model.2.0.bias
vqvae.encoder.model.2.1.model.0.conv1.weight
vqvae.encoder.model.2.1.model.0.conv1.bias
vqvae.encoder.model.2.1.model.0.conv2.weight
vqvae.encoder.model.2.1.model.0.conv2.bias
vqvae.encoder.model.2.1.model.1.conv1.weight
vqvae.encoder.model.2.1.model.1.conv1.bias
vqvae.encoder.model.2.1.model.1.conv2.weight
vqvae.encoder.model.2.1.model.1.conv2.bias
vqvae.encoder.model.2.1.model.2.conv1.weight
vqvae.encoder.model.2.1.model.2.conv1.bias
vqvae.encoder.model.2.1.model.2.conv2.weight
vqvae.encoder.model.2.1.model.2.conv2.bias
vqvae.encoder.model.2.1.model.3.conv1.weight
vqvae.encoder.model.2.1.model.3.conv1.bias
vqvae.encoder.model.2.1.model.3.conv2.weight
vqvae.encoder.model.2.1.model.3.conv2.bias
vqvae.encoder.model.2.1.model.4.conv1.weight
vqvae.encoder.model.2.1.model.4.conv1.bias
vqvae.encoder.model.2.1.model.4.conv2.weight
vqvae.encoder.model.2.1.model.4.conv2.bias
vqva

In [163]:
# Re-key the motion checkpoint so its entries can be loaded into clamp_model:
#   * encoder entries keep their name, with "vqvae" replaced by "motion_model";
#   * any entry whose name contains "codebook" is stored under the fixed key
#     "motion_model.quantizer.codebook.weight".
# NOTE(review): if more than one checkpoint key contains "codebook", the later
# one silently overwrites the earlier one under that single target key.
new_dict = dict()
for ckpt_key, ckpt_tensor in model_motion.items():
    if "encoder" in ckpt_key:
        print(ckpt_tensor.shape)
        new_dict[ckpt_key.replace("vqvae", "motion_model")] = ckpt_tensor
    if "codebook" in ckpt_key:
        print(ckpt_tensor)
        new_dict["motion_model.quantizer.codebook.weight"] = ckpt_tensor


torch.Size([768, 263, 3])
torch.Size([768])
torch.Size([768, 768, 4])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 4])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.Size([768])
torch.Size([768, 768, 3])
torch.Size([768])
torch.Size([768, 768, 1])
torch.

In [160]:
# Copy the reference checkpoint into the model. strict=False tolerates keys that
# exist on only one side; the call returns (missing_keys, unexpected_keys).
# NOTE(review): `m` and `u` are never inspected in the cells shown. Check them to
# see which parameters were actually left untouched.
m , u = clamp_model.load_state_dict(model_og , strict=False)

In [161]:
# Copy the re-keyed motion weights into the model. strict=False tolerates keys
# that exist on only one side; the call returns (missing_keys, unexpected_keys).
# NOTE(review): `u2` should be empty if every re-keyed name matched a model
# parameter — verify, since it is not inspected in the cells shown.
m2 , u2 = clamp_model.load_state_dict(new_dict, strict = False)

In [162]:
# Display the quantizer codebook weight again, to check its values after the
# state-dict loads.
clamp_model.motion_model.quantizer.codebook.weight

Parameter containing:
tensor([[ 0.4886, -0.6965,  0.8278,  ...,  0.7765, -0.5237, -0.6707],
        [-0.1029, -0.4814, -1.3239,  ...,  0.6291, -0.5265, -0.9828],
        [ 0.2206, -0.1616, -0.2627,  ..., -0.2812,  0.8615, -1.7830],
        ...,
        [ 0.1758,  0.1818,  0.0020,  ..., -0.3373,  1.3992, -1.6280],
        [-0.3789,  1.2745, -0.1851,  ..., -0.6624,  0.7807, -0.0992],
        [-0.7875,  0.6957,  0.3103,  ...,  0.8619,  0.1014, -1.4184]],
       requires_grad=True)

In [38]:
# Per-subset statistics of motion length: the first dimension of every array
# file found under each entry of the root directory.
joint_vecs_root = "/srv/hays-lab/scratch/sanisetty3/motionx/motion_data/new_joint_vecs"
for i in os.listdir(joint_vecs_root):
    motions = [np.load(f).shape[0] for f in findAllFile(os.path.join(joint_vecs_root, i))]

    # An entry with no files underneath (an empty folder, or a plain file sitting
    # in the root) would make max()/min() raise ValueError. Report it and move on.
    if not motions:
        print(i , "no motion files found")
        continue

    print(i , "mean" , np.mean(motions) , "std", np.std(motions) , "max" , max(motions) , "min" , min(motions))

moyo mean 380.77647058823527 std 156.56916355329534 max 950 min 162
music mean 244.91694725028057 std 112.2322020469831 max 943 min 1
EgoBody mean 446.9142857142857 std 117.16863305265865 max 679 min 82
fitness mean 213.24708485319618 std 127.85189517845544 max 1993 min 1
beat mean 1706.0 std 0.0 max 1706 min 1706
aist mean 426.40204211869815 std 893.2803941721178 max 10826 min 11
dance mean 221.6851851851852 std 195.8577359806965 max 1496 min 5
idea400 mean 206.37420076726343 std 65.0198867817423 max 569 min 1
HAA500 mean 58.56643089275473 std 40.42577510598959 max 391 min 4
kungfu mean 246.9153031761309 std 120.82275115239747 max 783 min 1
perform mean 214.8357894736842 std 109.75524149170757 max 553 min 3
GRAB mean 303.31760299625466 std 153.38501490937358 max 1114 min 122
humanml mean 211.2161495962601 std 80.59329265648283 max 299 min 4
choreomaster mean 4351.944444444444 std 1558.3454713331994 max 8142 min 886
game_motion mean 108.68687561214496 std 102.12329450829242 max 2918 mi