In [1]:
# setting device on GPU if available, else CPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

NVIDIA GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import Tensor, device, dtype, nn
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

from transformers.activations import ACT2FN
from transformers.file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from transformers.modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer, BertForMaskedLM

import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
config = BertConfig.from_pretrained("bert-base-cased")

Downloading (…)lve/main/config.json: 100%|███████████████████████████████████████████| 570/570 [00:00<00:00, 78.4kB/s]


In [22]:
tokenizer.mask_token_id

103

In [29]:
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

In [30]:
inputs["input_ids"]

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]])

In [43]:
inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [37]:
text_proj = nn.Linear(model.config.hidden_size, 768) 

In [34]:
text_output = model.bert(inputs["input_ids"], attention_mask = inputs["attention_mask"],                      
                                return_dict = True)            
text_embeds = text_output.last_hidden_state

In [38]:
text_feat = F.normalize(text_proj(text_embeds[:,0,:]),dim=-1)       

In [39]:
text_feat.shape

torch.Size([1, 768])

In [40]:
cls_token = (torch.zeros(1, 1, 768))

In [41]:
cls_tokens = cls_token.expand(4, -1, -1)

In [51]:
from transformers import T5ForSequenceClassification, T5EncoderModel, T5Tokenizer


In [49]:
t5 = T5EncoderModel.from_pretrained("google/t5-v1_1-base")

In [52]:
t5_tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [24]:
from tqdm import tqdm

In [4]:
import json
with open("/srv/datasets/coco/person_keypoints_train2017.json") as f:
    data = json.load(f)

with open("/srv/datasets/coco/captions_train2017.json") as f:
    data2 = json.load(f)

In [89]:
data["annotations"][0]

{'segmentation': [[267.03,
   243.78,
   314.59,
   154.05,
   357.84,
   136.76,
   374.05,
   104.32,
   410.81,
   110.81,
   429.19,
   131.35,
   420.54,
   165.95,
   451.89,
   209.19,
   464.86,
   240.54,
   480,
   253.51,
   484.32,
   263.24,
   496.22,
   271.89,
   484.32,
   278.38,
   438.92,
   257.84,
   401.08,
   216.76,
   370.81,
   247.03,
   414.05,
   277.3,
   433.51,
   304.32,
   443.24,
   323.78,
   400,
   362.7,
   376.22,
   375.68,
   400,
   418.92,
   394.59,
   424.32,
   337.3,
   382.16,
   337.3,
   371.35,
   388.11,
   327.03,
   341.62,
   301.08,
   311.35,
   276.22,
   304.86,
   263.24,
   294.05,
   249.19]],
 'num_keypoints': 8,
 'area': 28292.08625,
 'iscrowd': 0,
 'keypoints': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  325,
  160,
  2,
  398,
  177,
  2,
  0,
  0,
  0,
  437,
  238,
  2,
  0,
  0,
  0,
  477,
  270,
  2,
  287,
  255,
  1,
  339,
  267,
  2,
  0,
  0,
  0,
  423,
  314,
  2,
  0,
  0,
 

In [56]:
ann1 = {}
for dd in tqdm(data2["annotations"]):
    try:
        ann1[dd["image_id"]].append(dd["caption"])
    except:
        ann1[dd["image_id"]] = [dd["caption"]]

100%|█████████████████████████████████████████████████████████████████████| 591753/591753 [00:00<00:00, 805192.43it/s]


In [None]:
persons = []
for i, dat in tqdm(enumerate(data["images"])):
    d = {}
    d['image_id'] = dat["id"]
    d["file_name"] = data["images"][i]["file_name"]
    d["captions"] = ann1[dat["id"]]
    d["coco_url"] = dat["coco_url"]
    d["flickr_url"] = dat["flickr_url"]

    is_crowd = [i["iscrowd"] for i in data["annotations"] if i["image_id"] == d['image_id']]
    if is_crowd == 0:
        persons.append(d)

96992it [1:43:21, 15.45it/s]

In [79]:
len(persons)

118287

In [72]:
json_object = json.dumps(persons, indent=4)
 
# Writing to sample.json
with open("./coco_perons.json", "w") as outfile:
    outfile.write(json_object)

In [86]:
'/coc/scratch/sanisetty3/music_motion/TGM3D/coco_perons.json'

'/coc/scratch/sanisetty3/music_motion/TGM3D'

In [82]:
from PIL import Image

In [85]:
save_path = "/srv/hays-lab/scratch/sanisetty3/PyMAF-X/images/coco/"
for dd in tqdm(persons):
    img = Image.open(f"/srv/datasets/coco/train2017/{dd['file_name']}" )
    img.save(save_path + dd["file_name"])

  0%|▎                                                                         | 406/118287 [00:27<2:10:53, 15.01it/s]


KeyboardInterrupt: 

In [None]:
"/srv/datasets/coco/train2017/"