In [1]:
from dataset import create_dataset, create_sampler, create_loader
from models.model_nlvr import XVLM
from models.model_captioning import XVLM
from models import XVLMBase, build_mlp, load_pretrained

config = {'train_file': ['data/finetune/cosmos_train.json'], 'val_file': ['data/finetune/cosmos_val.json'], 
'test_file': ['data/finetune/cosmos_test.json'], 'image_root': 'images/cosmos/', 
'vision_config': 'configs/config_swinB_384.json', 'use_clip_vit': False, 'use_swin': True, 
'image_res': 384, 'patch_size': 32, 'use_roberta': False, 'text_config': 'configs/config_bert.json', 
'text_encoder': 'data/bert-base-uncased', 'num_dec_layers': 6, 'batch_size_train': 5, 
'batch_size_test': 32, 'max_tokens': 40, 'label_smoothing': 0.1, 'max_length': 20, 'min_length': 5, 
'num_beams': 3, 'prompt': 'a picture of ', 'optimizer': {'opt': 'adamW', 'lr': 1e-05, 
'weight_decay': 0.01, 'lr_mult': 2}, 'schedular': {'sched': 'linear', 'lr': 1e-05, 'epochs': 5, 
'num_warmup_steps': 0.1}, 'start_eval': 0}
train_dataset, val_dataset, test_dataset = create_dataset('cosmos', config);
datasets = [train_dataset, val_dataset, test_dataset]
# config_vqa['pad_token_id'] = train_dataset.pad_token_id
# config_vqa['eos'] = train_dataset.eos_token



In [34]:
train_loader, val_loader, test_loader = create_loader(datasets, [None, None, None], batch_size=[5,5,5],
                                                          num_workers=[4, 4, 4], is_trains=[True, False, False],
                                                          collate_fns=[None, None, None])

In [2]:
model = XVLM(config=config)
model.load_pretrained('coco_capt_cider_step_44275.th', config, is_eval=False)
model = model.to('cuda:1');

### Loading pretrained vision encoder
### Loading pretrained text encoder
load_capt_pretrain,  False
### Loading pretrained text encoder
load checkpoint from coco_capt_cider_step_44275.th
missing_keys:  []
unexpected_keys:  []


In [3]:
import json
train_data = list(
    map(json.loads, open(f"/root/thesis/ViLT/cosmos/train_data.json").readlines())
)

In [7]:
from torchvision import transforms
from dataset.randaugment import RandomAugment
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
train_transform = transforms.Compose([
        transforms.RandomResizedCrop(config['image_res'], scale=(0.5, 1.0),
                                     interpolation=Image.BICUBIC),
        transforms.RandomHorizontalFlip(),
        RandomAugment(2, 7, isPIL=True, augs=['Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness',
                                              'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']),
        transforms.ToTensor(),
        normalize,
    ])

  interpolation=Image.BICUBIC),


In [8]:
from PIL import Image
import os
image_path = os.path.join(config['image_root'], train_data[0]['img_local_path'])
image = Image.open(image_path).convert('RGB')
image = train_transform(image)

In [43]:
len(train_data)
np.array_split(train_data)

161754

In [72]:
train_data[0]['img_local_path']
train_data[1]['img_local_path']
train_data[2]['img_local_path']
train_data[3]['img_local_path']
train_data[4]['img_local_path']

'train/2.jpg'

In [9]:
import torch
from tqdm import tqdm
import numpy as np
torch.manual_seed(42)

device='cuda:1'
# model.to(device)
image_cap_dict = {}
batch_size = 10
for i in tqdm(range(0, len(train_data), batch_size)):
    # image_batch = torch.empty()
    z=False
    array=[]
    keys = []
    for data in train_data[i:i+batch_size]:
        key = data['img_local_path']
        keys.append(key)
        image_path = os.path.join(config['image_root'], key)
        image = Image.open(image_path).convert('RGB')
        image = train_transform(image)
        array.append(image)
        # if z == False:
        #     image_batch = image
        #     z=True
        # image_batch = torch.stack([image_batch,image])
    image_batch = torch.stack(array)
    image_batch = image_batch.to(device)
    
    c = model.generate(image_batch,num_return_sequences=1,sample=True, num_beams=1)
    image_cap_dict[keys[0]] = c[0][0]
    image_cap_dict[keys[1]] = c[0][1]
    image_cap_dict[keys[2]] = c[0][2]
    image_cap_dict[keys[3]] = c[0][3]
    image_cap_dict[keys[4]] = c[0][4]

# for data in tqdm(train_data):
    
#     image_path = os.path.join(config['image_root'], data['img_local_path'])
#     image = Image.open(image_path).convert('RGB')
#     image = train_transform(image)[None,:]
#     image = image.to(device)

#     c = model.generate(image,num_return_sequences=1,sample=True, num_beams=1)
#     image_cap_dict[data['img_local_path']] = c[0][0]

 14%|█▎        | 2198/16176 [16:29<1:44:54,  2.22it/s]


KeyboardInterrupt: 

In [10]:
image_cap_dict

{'train/1.jpg': 'a person holding a cell phone in front of a table',
 'train/2.jpg': 'a man standing in front of a crowd of people',
 'train/0.jpg': 'a group of people walking down a street with a statue',
 'train/8.jpg': 'a man standing in the snow with a suitcase on a street',
 'train/9.jpg': 'a man in a business suit and police officer standing with a suitcase',
 'train/10.jpg': 'a man walking a dog on a street',
 'train/11.jpg': 'a group of people holding signs in a building',
 'train/12.jpg': 'a yellow at a site with a yellow fire truck with a person',
 'train/18.jpg': 'a group of soldiers laying in front of a helicopter',
 'train/19.jpg': 'a group of people sitting in a street from a flooded street',
 'train/20.jpg': 'a man wearing a suit and tie with a person',
 'train/21.jpg': 'a group of people standing in a room',
 'train/22.jpg': 'a man in a suit and tie standing at a podium',
 'train/28.jpg': 'a group of people riding bikes down a road',
 'train/29.jpg': 'a man sitting on a

In [14]:
len(image_cap_dict)

10988

In [18]:
image_cap_dict.to_json('image_cap_dict.json')

AttributeError: 'dict' object has no attribute 'to_json'

In [19]:
with open("image_cap_dict.json", "w") as outfile:
    json.dump(image_cap_dict, outfile)