In [78]:
# setting device on GPU if available, else CPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.2 GB
Cached:    0.2 GB


In [79]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
import wandb

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader

from torch.utils import data


import copy
import os
import random
import cv2
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import functools
from tqdm import tqdm
from datetime import datetime
import numpy as np
from core.datasets.vqa_motion_dataset import VQMotionDataset,DATALoader,VQVarLenMotionDataset,MotionCollator,VQFullMotionDataset
from einops import rearrange, reduce, pack, unpack
import librosa

In [81]:
from core.datasets.vqa_motion_dataset import MotionCollatorConditional, TransMotionDatasetConditional,VQMotionDataset,DATALoader,VQVarLenMotionDataset,MotionCollator,VQFullMotionDataset


## VQVAE

In [None]:
from configs.config import cfg, get_cfg_defaults
from core.models.vqvae import VQMotionModel
from core.models.motion_regressor import MotionRegressorModel


cfg_vq = get_cfg_defaults()
cfg_vq.merge_from_file("/srv/scratch/sanisetty3/music_motion/motion_vqvae/configs/var_len_768_768_aist_vq.yaml")




In [None]:
vqvae_model = VQMotionModel(cfg_vq.vqvae).eval()
pkg = torch.load(f"/srv/scratch/sanisetty3/music_motion/motion_vqvae/checkpoints/var_len/vq_768_768_mix/vqvae_motion_best_fid.pt", map_location = 'cpu')
print(pkg["steps"])
vqvae_model.load_state_dict(pkg["model"])
vqvae_model =vqvae_model.cuda()


In [None]:
# train_ds = VQVarLenMotionDataset("t2m", split = "render" , max_length_seconds = 10, data_root = "/srv/scratch/sanisetty3/music_motion/HumanML3D/HumanML3D")
# train_loader = DATALoader(train_ds,1,collate_fn=collate_fn)

In [82]:
from core.datasets.evaluator_dataset import EvaluatorMotionCollator, EvaluatorVarLenMotionDataset

In [86]:
aist_ds = EvaluatorVarLenMotionDataset(data_root = "/srv/scratch/sanisetty3/music_motion/AIST" ,  split = "train" ,num_stages = 6 ,min_length_seconds=10, max_length_seconds=40)
collate_fn = EvaluatorMotionCollator()

aist_loader = DATALoader(aist_ds,10,collate_fn=collate_fn)

  4%|▍         | 80/1910 [00:00<00:02, 795.10it/s]

changing range to: 200 - 200


100%|██████████| 1910/1910 [00:02<00:00, 745.64it/s]

Total number of motions 1910





In [87]:
for batch in aist_loader:
    break

In [88]:
batch["motion"].shape

torch.Size([10, 200, 263])

In [89]:
genre_dict = {
    "mBR" : "Break",
    "mPO" : "Pop",
    "mLO" : "Lock",
    "mMH" : "Middle Hip-hop",
    "mLH" : "LA style Hip-hop",
    "mHO" : "House",    
    "mWA" : "Waack",
    "mKR" : "Krump",
    "mJS" : "Street Jazz",
    "mJB" : "Ballet Jazz",
}

joint_index_genre_mapping = {
    "Break":[],
    "Pop":[],
    "Lock":[],
    "Middle Hip-hop":[],
    "LA style Hip-hop":[],
    "House":[],    
    "Waack":[],
    "Krump":[],
    "Street Jazz":[],
    "Ballet Jazz":[],
}

In [116]:
from torch.nn.utils.rnn import pack_padded_sequence

class MotionEncoderBiGRUCo(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, device):
        super(MotionEncoderBiGRUCo, self).__init__()
        self.device = device

        self.input_emb = nn.Linear(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.output_net = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(hidden_size, output_size)
        )

        self.input_emb.apply(self.init_weight)
        self.output_net.apply(self.init_weight)
        self.hidden_size = hidden_size
        self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
        
    def init_weight(self, m):
        if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d):
            nn.init.xavier_normal_(m.weight)
            # m.bias.data.fill_(0.01)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    # input(batch_size, seq_len, dim)
    def forward(self, inputs, m_lens):
        num_samples = inputs.shape[0]

        input_embs = self.input_emb(inputs)
        hidden = self.hidden.repeat(1, num_samples, 1)

        cap_lens = m_lens.data.tolist()
        emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True, enforce_sorted=False)

        gru_seq, gru_last = self.gru(emb, hidden)

        gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)

        return F.normalize(self.output_net(gru_last), dim=1)

In [108]:
~torch.eye(2 * 2, 2 * 2, dtype=bool)

tensor([[False,  True,  True,  True],
        [ True, False,  True,  True],
        [ True,  True, False,  True],
        [ True,  True,  True, False]])

In [109]:
from core.models.loss import CLIPLoss,InfoNceLoss

In [117]:
motionEncoder = MotionEncoderBiGRUCo(263,768,128,"cuda")
audioEncoder = MotionEncoderBiGRUCo(128,768,128,"cuda")

In [93]:
batch.keys()

dict_keys(['motion', 'motion_lengths', 'motion_mask', 'names', 'condition', 'condition_mask'])

In [118]:
em = motionEncoder(batch["motion"]*batch["motion_mask"][:,:,None] , batch["motion_lengths"])
ec = audioEncoder(batch["condition"]*batch["condition_mask"][:,:,None] ,batch["motion_lengths"])

In [119]:
contrastive_loss = nn.CosineEmbeddingLoss()

In [120]:
contrastive_loss(em , ec ,  torch.ones(batch_size))

tensor(0.9851, grad_fn=<MeanBackward0>)

In [122]:
infonce = InfoNceLoss()
clip = CLIPLoss()

In [124]:
infonce(em,ec)

tensor(2.9726, grad_fn=<DivBackward0>)

In [70]:
batch_size = ec.shape[0]
'''Positive pairs'''
pos_labels = torch.zeros(batch_size)
loss_pos = contrastive_loss(ec, em, pos_labels)

'''Negative Pairs, shifting index'''
neg_labels = torch.ones(batch_size)
shift = np.random.randint(0, batch_size-1)
new_idx = np.arange(shift, batch_size + shift) % batch_size
mis_motion_embedding = em.clone()[new_idx]
loss_neg = contrastive_loss(ec, mis_motion_embedding, neg_labels)
loss = loss_pos + loss_neg


In [71]:
loss

tensor(209.3174, grad_fn=<AddBackward0>)

In [74]:
chk = torch.load("/srv/scratch/sanisetty3/music_motion/T2M-GPT/checkpoints/t2m/text_mot_match/model/finest.tar")

10724