In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import io
from PIL import Image
import pyarrow as pa
from vilt.config import ex
from vilt.modules import ViLTransformerSS

from vilt.modules.objectives import cost_matrix_cosine, ipot
from vilt.transforms import pixelbert_transform
from vilt.datamodules.datamodule_base import get_pretrained_tokenizer

In [2]:
df = pa.ipc.RecordBatchFileReader(
    pa.memory_map(f"dataset_50/cosmos_test.arrow", "r")
).read_all().to_pandas()

In [3]:
_config = {'exp_name': 'vilt', 'seed': 0, 'datasets': ['coco'], 'loss_names': {'itm': 1, 'mlm': 0, 'mpp': 0, 'vqa': 0, 'nlvr2': 0, 'irtr': 0, 'cosmos': 0}, 'batch_size': 4096, 'train_transform_keys': ['pixelbert'], 'val_transform_keys': ['pixelbert'], 'image_size': 384, 'max_image_len': -1, 'patch_size': 32, 'draw_false_image': 1, 'image_only': False, 'vqav2_label_size': 3129, 'max_text_len': 40, 'tokenizer': 'bert-base-uncased', 'vocab_size': 30522, 'whole_word_masking': False, 'mlm_prob': 0.15, 'draw_false_text': 0, 'vit': 'vit_base_patch32_384', 'hidden_size': 768, 'num_heads': 12, 'num_layers': 12, 'mlp_ratio': 4, 'drop_rate': 0.1, 'optim_type': 'adamw', 'learning_rate': 0.0001, 'weight_decay': 0.01, 'decay_power': 1, 'max_epoch': 100, 'max_steps': 25000, 'warmup_steps': 2500, 'end_lr': 0, 'lr_mult': 1, 'get_recall_metric': False, 'resume_from': None, 'fast_dev_run': False, 'val_check_interval': 1.0, 'test_only': False, 'data_root': '', 'log_dir': 'result', 'per_gpu_batchsize': 0, 'num_gpus': 1, 'num_nodes': 1, 
    'load_path': 'weights/vilt_200k_mlm_itm.ckpt', 'num_workers': 8, 'precision': 16}

In [4]:
tokenizer = get_pretrained_tokenizer(_config["tokenizer"])

In [5]:
model = ViLTransformerSS(_config)
model.setup("test")
model.eval();

In [6]:
ckpt = torch.load(_config["load_path"], map_location="cpu")
state_dict = ckpt["state_dict"]
model.load_state_dict(state_dict, strict=False)

_IncompatibleKeys(missing_keys=[], unexpected_keys=['mlm_score.bias', 'mlm_score.transform.dense.weight', 'mlm_score.transform.dense.bias', 'mlm_score.transform.LayerNorm.weight', 'mlm_score.transform.LayerNorm.bias', 'mlm_score.decoder.weight'])

In [7]:
#device = "cuda:0" if _config["num_gpus"] > 0 else "cpu"
device="cuda:0"
model.to(device);

In [8]:
def get_raw_image(img_byte):
    image_bytes = io.BytesIO(img_byte)
    image_bytes.seek(0)
    return Image.open(image_bytes).convert("RGB")

In [9]:
from torch import nn
def infer(image,text,text2):
    img = pixelbert_transform(size=384)(image)
    img = img.unsqueeze(0).to(device)
    batch = {"text": [text], "image": [img]}
    batch2 = {"text": [text2], "image": [img]}
    with torch.no_grad():
        encoded = tokenizer(batch["text"],
        padding="max_length",
        truncation=True,
        max_length=40,
        return_special_tokens_mask=True)
        batch["text_ids"] = torch.tensor(encoded["input_ids"]).to(device)
        batch["text_labels"] = torch.tensor(encoded["input_ids"]).to(device)
        batch["text_masks"] = torch.tensor(encoded["attention_mask"]).to(device)

        encoded2 = tokenizer(batch2["text"],
        padding="max_length",
        truncation=True,
        max_length=40,
        return_special_tokens_mask=True)
        batch2["text_ids"] = torch.tensor(encoded2["input_ids"]).to(device)
        batch2["text_labels"] = torch.tensor(encoded2["input_ids"]).to(device)
        batch2["text_masks"] = torch.tensor(encoded2["attention_mask"]).to(device)

        infer1 = model.infer(batch)
        infer2 = model.infer(batch2)

        # cls_feats = torch.cat([infer1["cls_feats"], infer2["cls_feats"]], dim=-1)
        itm_logits1 = model.itm_score(infer1["cls_feats"])
        itm_logits2 = model.itm_score(infer2["cls_feats"])
        softmax = nn.Softmax()
        itm_logits1 = softmax(itm_logits1)
        itm_logits2 = softmax(itm_logits2)
        # cosmos_logits = model.nlvr2_classifier(cls_feats)
    # encoded = encoded["input_ids"][0][1:-1]
    # inferred_token = [tokenizer.decode(encoded)]
    # return infer1,inferred_token
    return itm_logits1[0][1].item(), itm_logits2[0][1].item()
    # return not(itm_logits1.argmax().item() and itm_logits2.argmax().item())

In [10]:
data = pd.read_csv('/root/thesis/dataset/public_test.csv')

In [11]:
def load_image(path):
   with open(path, "rb") as fp:
      return fp.read()

In [13]:
import os
tqdm.pandas()
data['image_byte'] = data['img_local_path'].progress_apply(lambda x: load_image(
    os.path.join('/root/thesis/dataset',x)
    ))

  from pandas import Panel
100%|██████████| 1000/1000 [00:00<00:00, 1645.20it/s]


In [14]:
def itm(row):
    c1 = row['caption1_modified'][0]
    c2 = row['caption2_modified'][0]
    c1_itm, c2_itm = infer(get_raw_image(row['image_byte']), c1, c2)
    row['c1_itm'] = c1_itm
    row['c2_itm'] = c2_itm
    return row

In [15]:
data = data.progress_apply(itm, axis=1)

100%|██████████| 1000/1000 [02:18<00:00,  7.23it/s]


In [16]:
data[['img_local_path','c1_itm','c2_itm']].to_csv('itm.csv', index=False)

In [93]:
data[['c1_itm','c2_itm']].loc[837]

c1_itm    0.345793
c2_itm    0.170505
Name: 837, dtype: float64

In [56]:
r = df.iloc[159]
print(r['label'])
print( r['caption_1'][0])
print( r['caption_2'][0])
# result = infer(get_raw_image(r['image']), r['caption_1'][0], r['caption_2'][0])
result = infer(get_raw_image(r['image']), 'zerba flying', 'People with dogs playing with snow')

result

[False]
On DATE zebras cross the road as a motorcyclist waits in ORG in GPE.
Zebras cross the road which has QUANTITY-long free electric-fence area for all animals as their corridor in the NORP-Tsavo ecosystem at TIME as a motorbike waits next to ORG in GPE, GPE, on DATE




(0.41710996627807617, 1.2162016901129391e-05)

In [17]:
result = pd.DataFrame({},columns=['predict','label','cap1','cap2'])
for i in tqdm(np.arange(len(df))):
    r = df.iloc[i]
    pred = infer(get_raw_image(r['image']), r['caption_1'][0], r['caption_2'][0])
    # s = util.cos_sim(model2.encode(r['caption_1'][0]),(model2.encode(r['caption_2'][0])))
    pred = pred == True
    ans = r['label'][0]
    result.loc[len(result)] = [pred, ans,r['caption_1'][0],r['caption_2'][0]]

100%|██████████| 1700/1700 [09:20<00:00,  3.03it/s]


In [18]:
confusion_matrix = pd.crosstab(result['predict'], result['label'], rownames=['Predicted'], colnames=['Actual'])
print (confusion_matrix)

Actual     False  True 
Predicted              
False        651    568
True         199    282


In [18]:
(680+351)/1700

0.6064705882352941

In [29]:
confusion_matrix = pd.crosstab(result['predict'], result['label'], rownames=['Predicted'], colnames=['Actual'])
print (confusion_matrix)

Actual     False  True 
Predicted              
False        677    493
True         173    357


In [30]:
(677 + 357) / 1700

0.6082352941176471