In [1]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
# NOTE(review): a later cell runs `from torch import Tensor, device, dtype, nn`,
# which rebinds `device` to the torch.device class and shadows this instance.
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics that are only available on a CUDA device.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Byte counts are divided by 1024**3 and rounded to one decimal place.
    allocated_gb = round(torch.cuda.memory_allocated(0) / 1024**3, 1)
    print('Allocated:', allocated_gb, 'GB')
    reserved_gb = round(torch.cuda.memory_reserved(0) / 1024**3, 1)
    print('Cached:   ', reserved_gb, 'GB')


Using device: cuda

NVIDIA GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import Tensor, device, dtype, nn
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

from transformers.activations import ACT2FN
from transformers.file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from transformers.modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load the VQ-VAE motion checkpoint onto the CPU.
# NOTE(review): the path is absolute and machine-specific; torch.load unpickles
# the file, so only point it at checkpoints from a trusted source.
ckpt = torch.load(
    "/srv/hays-lab/scratch/sanisetty3/music_motion/TGM3D/checkpoints/conformer_768_1024_affine_varlen/vqvae_motion.pt",
    map_location="cpu",
)

In [14]:
# List the top-level entries stored in the loaded checkpoint.
ckpt.keys()

dict_keys(['model', 'optim', 'steps', 'total_loss'])

In [15]:
# Value saved under "steps" (presumably the training step counter — confirm).
ckpt["steps"]

tensor([200000.])

In [16]:
# Value saved under "total_loss" (presumably the loss at save time — confirm).
ckpt["total_loss"]

tensor(0.3532)

In [4]:
from transformers import AutoTokenizer, BertForMaskedLM

import torch

# The tokenizer and the masked-LM model are loaded from the same hub checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = BertForMaskedLM.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# Fetch the published BERT configuration for inspection.
# NOTE(review): this loads "bert-base-cased", whereas the tokenizer and model
# above come from "bert-base-uncased" — confirm the cased config is intended.
config = BertConfig.from_pretrained("bert-base-cased")

In [106]:
# Display the loaded configuration.
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [122]:
# Vocabulary id the tokenizer uses for its mask token.
tokenizer.mask_token_id

103

In [134]:
# Tokenize two masked sentences as one padded batch of PyTorch tensors.
inputs = tokenizer(
    [
        "The capital of France is [MASK].",
        "The capital [MASK] is a monster door in atlanta ",
    ],
    padding=True,
    return_tensors="pt",
)

In [135]:
# List the fields returned by the tokenizer call.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [121]:
# Token-id tensor produced by the tokenizer.
inputs["input_ids"]

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102,    0,    0],
        [ 101, 1996, 3007,  103, 2003, 1037, 6071, 2341, 1999, 5865,  102]])

In [43]:
# Attention-mask tensor produced by the tokenizer.
inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [37]:
# Linear projection from the model's hidden size to 768 output features.
text_proj = nn.Linear(in_features=model.config.hidden_size, out_features=768)

In [34]:
# Run the BERT encoder on the tokenized sentences and keep the final-layer
# hidden states for every token.
text_output = model.bert(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    return_dict=True,
)
text_embeds = text_output.last_hidden_state

In [38]:
# Project the hidden state at sequence position 0 and L2-normalise it along
# the feature dimension.
text_feat = F.normalize(text_proj(text_embeds[:, 0]), p=2.0, dim=-1)

In [39]:
# Shape of the projected, normalised position-0 features.
text_feat.shape

torch.Size([1, 768])

In [40]:
# A single all-zero token of width 768, shaped (1, 1, 768).
cls_token = torch.zeros((1, 1, 768))

In [41]:
# Expand the single token to 4 entries along the first dimension (presumably
# the batch size — confirm); -1 keeps the size of the remaining dimensions.
cls_tokens = cls_token.expand(4, -1, -1)

In [107]:
from torch.nn import CrossEntropyLoss

In [51]:
from transformers import T5ForSequenceClassification, T5EncoderModel, T5Tokenizer


In [49]:
# Load the encoder-only T5 model from the "google/t5-v1_1-base" checkpoint.
t5 = T5EncoderModel.from_pretrained("google/t5-v1_1-base")

In [52]:
# Load the tokenizer from the same checkpoint as the T5 encoder.
t5_tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## ALBEF Motion

In [72]:
from transformers import AutoTokenizer, BertForMaskedLM
from transformers.models.bert.configuration_bert import BertConfig
import torch
from core.datasets.motion_bert_dataset import BERTPretrainMotionDataset, MotionCollator
from core.datasets.dataset_loading_utils import load_dataset_bert
from core.datasets.dataset_loading_utils import load_dataset_bert
from core.datasets.motion_bert_dataset import BERTMotionDataset, DATALoader, TokenizerParams, mask_for_mlm
from core.models.BERT import BERT, BERTParams
from core.optimizer import get_optimizer
from torch.utils.data import DataLoader
from configs.config import get_cfg_defaults
import transformers

In [73]:
# Start from the project defaults, overlay the experiment YAML, then freeze the
# config so it cannot be modified afterwards.
# NOTE(review): the YAML path is absolute and machine-specific.
cfg = get_cfg_defaults()
path = "/srv/hays-lab/scratch/sanisetty3/music_motion/TGM3D/checkpoints/bert_12_768/bert_12_768.yaml"
print("loading config from:", path)
cfg.merge_from_file(path)
cfg.freeze()

loading config from: /srv/hays-lab/scratch/sanisetty3/music_motion/TGM3D/checkpoints/bert_12_768/bert_12_768.yaml


In [74]:
# Load the "t2m" dataset together with its sampler and sampling weights.
# NOTE(review): the variables are named `*_train`, but the split requested here
# is "test" — confirm which one is intended.
train_ds, sampler_train, weights_train = load_dataset_bert(
    dataset_names=["t2m"], args=cfg, split="test", weight_scale=[1]
)

100%|███████████████████████████████████████████████████████████████████████████| 4384/4384 [00:03<00:00, 1210.30it/s]

Total number of motions 4198





In [None]:
from transformers import AutoTokenizer, BertForMaskedLM, BertPreTrainedModel


In [75]:
# Wrap the dataset in the project's loader: batches of 4, original order.
dl = DATALoader(train_ds, batch_size=4, shuffle=False)

In [76]:
# Pull a single batch to experiment with. Unlike a `for ...: break` loop, this
# raises StopIteration on an empty loader instead of silently leaving a stale
# (or undefined) `batch` in the kernel namespace.
batch = next(iter(dl))

In [79]:
# Build a masked-LM BERT from the motion config; constructing it from a config
# object does not load pretrained weights.
# NOTE(review): this rebinds `model` (earlier bound to the pretrained
# bert-base-uncased model), and the config path is relative to the current
# working directory.
bcfg = BertConfig.from_pretrained("./checkpoints/bert_12_768/bert_config.json")
model = BertForMaskedLM(config=bcfg)

In [88]:
# Clone so the following steps operate on copies rather than the tensor stored
# in `batch`; the labels start out as a copy of the token ids.
input_ids = batch["input_ids"].clone()
labels = input_ids.clone()

# Tensor filled with 0.15, same shape as the labels, handed to mask_for_mlm as
# `probability_matrix` (presumably the per-token masking probability — confirm).
probability_matrix = torch.full(labels.shape, 0.15)
input_ids, labels = mask_for_mlm(
    input_ids,
    targets=labels,
    probability_matrix=probability_matrix,
)

In [83]:
# Shape of the attention mask in the sampled batch.
batch["attention_mask"].shape

torch.Size([4, 51])

In [98]:
# Inspect the token-embedding table of the current `model`.
model.bert.embeddings.word_embeddings

Embedding(1027, 768, padding_idx=1025)

In [92]:
# Run the `model.bert` encoder submodule on the masked token ids.
mlm_output = model.bert(
    input_ids=input_ids,
    attention_mask=batch["attention_mask"],
    return_dict=True,
    # Arguments left disabled in the original cell:
    # encoder_hidden_states = image_embeds,
    # encoder_attention_mask = image_atts,
    # labels = labels,
    # soft_labels = F.softmax(logits_m,dim=-1),
    # alpha = alpha
)

In [96]:
# Check only the shape of the final hidden states instead of printing the whole
# tensor; this matches the torch.Size output recorded for this cell.
mlm_output["last_hidden_state"].shape

torch.Size([4, 51, 768])

In [None]:
# Show the current working directory (relies on IPython automagic for `%pwd`);
# the relative "./checkpoints/..." path used above is resolved against it.
pwd