In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import pandas as pd
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
from PIL import Image
import requests
import albumentations as A
import torch
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

### COCO Dataset

In [2]:
# coco 2014 val dataset
annFile = 'data/coco/annotations/instances_val2014.json'
coco=COCO(annFile)

# coco qa dataset
qa_df = pd.concat([
    pd.DataFrame({
        'img_id': pd.read_fwf('data/coco_qa/train/img_ids.txt', header=None)[0],
        'question': pd.read_fwf('data/coco_qa/train/questions.txt', header=None)[0],
        'answer': pd.read_fwf('data/coco_qa/train/answers.txt', header=None)[0],
        'type': pd.read_fwf('data/coco_qa/train/types.txt', header=None)[0],     # 0 -> object, 1 -> number, 2 -> color, 3 -> location
    }),
    pd.DataFrame({
        'img_id': pd.read_fwf('data/coco_qa/test/img_ids.txt', header=None)[0],
        'question': pd.read_fwf('data/coco_qa/test/questions.txt', header=None)[0],
        'answer': pd.read_fwf('data/coco_qa/test/answers.txt', header=None)[0],
        'type': pd.read_fwf('data/coco_qa/test/types.txt', header=None)[0],     
    })
])

# get coco 2014 val data from coco qa
qa_df = qa_df[qa_df['img_id'].isin(coco.getImgIds())]
print('Data size: ', len(qa_df))
qa_df.head(5)

# OLD 
# annFile = 'data/coco/annotations/instances_val2014.json'
# coco=COCO(annFile)

# qa_df = pd.DataFrame({
#     'img_id': pd.read_fwf('data/coco_qa/test/img_ids.txt', header=None)[0],
#     'question': pd.read_fwf('data/coco_qa/test/questions.txt', header=None)[0],
#     'answer': pd.read_fwf('data/coco_qa/test/answers.txt', header=None)[0],
#     'type': pd.read_fwf('data/coco_qa/test/types.txt', header=None)[0],     # 0 -> object, 1 -> number, 2 -> color, 3 -> location
# })
# qa_df.head()


loading annotations into memory...
Done (t=3.87s)
creating index...
index created!
Data size:  38948


Unnamed: 0,img_id,question,answer,type
0,299254,what is the man holding a snowboard on top of ...,hill,0
1,6415,what are sitting on the counter in different s...,carrots,0
2,465893,what is coming down the tracks,train,0
3,53529,where does the person sit next to a top with a...,car,3
4,398005,what are standing in tall dry grass look at th...,zebras,0


### Init InstructBlip Model

In [3]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests

ib_model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
ib_processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

device = "cuda" if torch.cuda.is_available() else "cpu"
ib_model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [01:32<00:00, 23.20s/it]


InstructBlipForConditionalGeneration(
  (vision_model): InstructBlipVisionModel(
    (embeddings): InstructBlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InstructBlipEncoder(
      (layers): ModuleList(
        (0-38): 39 x InstructBlipEncoderLayer(
          (self_attn): InstructBlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): InstructBlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        

### Init Open Flamingo Model

In [4]:
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch

fla_model, fla_processor, fla_tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,
    # cache_dir="PATH/TO/CACHE/DIR"  # Defaults to ~/.cache
)
fla_tokenizer.padding_side = "left" # For generation padding tokens should be on the left

checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
fla_model.load_state_dict(torch.load(checkpoint_path), strict=False)

fla_model.to(device)

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50280. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Flamingo model initialized with 1046992944 trainable parameters


Flamingo(
  (vision_encoder): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwis

### Prediction functions for InstructBlip (ib) and OpenFlamingo (fla). Function to plot images

In [5]:
def pred_instructblip(img, prompt):
    inputs = ib_processor(images=img, text=prompt, return_tensors='pt').to(device)
    outputs = ib_model.generate(
        **inputs,
        do_sample=False,
        num_beams=5,
        max_length=256,
        min_length=1,
        top_p=0.9,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    generated_text = ib_processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    return generated_text


def pred_openflamingo(img, prompt):
    # OpenFlamingi example Q/As
    demo_image_one = coco.loadImgs([qa_df.loc[0, 'img_id']])[0]
    demo_image_one = Image.open(requests.get(demo_image_one['coco_url'], stream=True).raw)
    demo_question_one = qa_df.loc[0, 'question']
    demo_answer_one = qa_df.loc[0, 'answer']
    
    demo_image_two = coco.loadImgs([qa_df.loc[1, 'img_id']])[0]
    demo_image_two = Image.open(requests.get(demo_image_two['coco_url'], stream=True).raw)
    demo_question_two = qa_df.loc[1, 'question']
    demo_answer_two = qa_df.loc[1, 'answer']
    
    vision_x = [fla_processor(demo_image_one).unsqueeze(0), fla_processor(demo_image_two).unsqueeze(0), fla_processor(img).unsqueeze(0)]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    
    lang_x = fla_tokenizer(
        [f"<image>Question: {demo_question_one}? Answer: {demo_answer_one}<|endofchunk|><image>Question: {demo_question_two}? Answer: {demo_answer_two}<|endofchunk|><image>Question: {prompt} Answer: "],
        return_tensors="pt",
    )
    
    fla_generated_text = fla_model.generate(
        vision_x=vision_x.to(device),
        lang_x=lang_x["input_ids"].to(device),
        attention_mask=lang_x["attention_mask"].to(device),
        max_new_tokens=20,
        num_beams=3,
    )
    fla_generated_text = fla_tokenizer.decode(fla_generated_text[0])
    # get the answer without the examples
    fla_generated_text = fla_generated_text.split('Question: ')[-1]
    if len(fla_generated_text.split('Answer:')) > 1:
        fla_generated_text = fla_generated_text.split('Answer:')[-1].replace('<|endofchunk|>', '').replace('.', '')  
        
    return fla_generated_text


import math
def plot_row_images(images, titles=None):
    if not titles:
        titles = ['' for img in images]

    fig, ax = plt.subplots(1, len(images), figsize=(35,20))
    for i, (img, title) in enumerate(zip(images, titles)):
        ax[i].imshow(img)
        ax[i].set_title(title)
        ax[i].axis('off')
    
    fig.show()
                           


### Analyze model outputs when adding blur to images

In [None]:
# Get x sample images and QAs
# qa_df_sample = qa_df.sample(50, random_state=1234)
# qa_df_sample = qa_df_sample.sort_values(by='type')      # 0 -> object, 1 -> number, 2 -> color, 3 -> location
qa_df_sample = qa_df.groupby('type').sample(20, random_state=1234)
qa_df_sample.sample.head(10)

In [None]:

for i, row in qa_df_sample.iterrows():
    img = coco.loadImgs([row['img_id']])[0]
    img = io.imread(img['coco_url'])
    prompt = row['question'] + '?'

    plt_imgs, plt_titles = [], []
    
    ## Original Image
    ib_generated_text = pred_instructblip(img, prompt)
    fla_generated_text = pred_openflamingo(Image.fromarray(img), prompt)
    # fla_generated_text = 'xxxxxx'
    plt_imgs.append(img)
    plt_titles.append(f"ORIGINAL -- ID: {row['img_id']} \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_generated_text} -- OpenFlamingo: {fla_generated_text}")

    ## Blurred small
    blur1 = A.ReplayCompose([A.Blur(blur_limit=[15]*2, always_apply=True)])
    img_blur1 = blur1(image=img)['image']
    ib_blur1 = pred_instructblip(img_blur1, prompt)
    fla_blur1 = pred_openflamingo(Image.fromarray(img_blur1), prompt)
    # fla_blur1 = 'xxxxxx'
    plt_imgs.append(img_blur1)
    plt_titles.append(f"Small Blur \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_blur1} -- OpenFlamingo: {fla_blur1}")
    
    ## Blurred medium
    blur2 = A.ReplayCompose([A.Blur(blur_limit=[20]*2, always_apply=True)])
    img_blur2 = blur2(image=img)['image']
    ib_blur2 = pred_instructblip(img_blur2, prompt)
    fla_blur2 = pred_openflamingo(Image.fromarray(img_blur2), prompt)
    # fla_blur2 = 'xxxxxx'
    plt_imgs.append(img_blur2)
    plt_titles.append(f"Medium Blur \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_blur2} -- OpenFlamingo: {fla_blur2}")

    ## Blurred strong
    blur3= A.ReplayCompose([A.Blur(blur_limit=[22]*2, always_apply=True)])
    img_blur3 = blur3(image=img)['image']
    ib_blur3 = pred_instructblip(img_blur3, prompt)
    fla_blur3 = pred_openflamingo(Image.fromarray(img_blur3), prompt)
    # fla_blur3 = 'xxxxxx'
    plt_imgs.append(img_blur3)
    plt_titles.append(f"Strong Blur \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_blur3} -- OpenFlamingo: {fla_blur3}")

    ## plot
    plot_row_images(
        images=plt_imgs,
        titles=plt_titles
    )


### Analyze model output when dropping a color channel, solarizing, and elastic transform

In [None]:

for i, row in qa_df_sample.iterrows():
    img = coco.loadImgs([row['img_id']])[0]
    img = io.imread(img['coco_url'])
    prompt = row['question'] + '?'

    plt_imgs, plt_titles = [], []
    
    ## Original Image
    ib_generated_text = pred_instructblip(img, prompt)
    fla_generated_text = pred_openflamingo(Image.fromarray(img), prompt)
    # fla_generated_text = 'xxxxxx'
    plt_imgs.append(img)
    plt_titles.append(f"ORIGINAL -- ID: {row['img_id']} \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_generated_text} -- OpenFlamingo: {fla_generated_text}")

    ## Channell dropout
    chnl_drop = A.ReplayCompose([A.ChannelDropout(always_apply=True)])
    img_chnl_drop = chnl_drop(image=img)['image']
    ib_chnl_drop = pred_instructblip(img_chnl_drop, prompt)
    fla_chnl_drop = pred_openflamingo(Image.fromarray(img_chnl_drop), prompt)
    # fla_chnl_drop = 'xxxxxx'
    plt_imgs.append(img_chnl_drop)
    plt_titles.append(f"Channel Dropout {'WARNING COLOR QUESTION' if row['type'] == 2 else ''} \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_chnl_drop} -- OpenFlamingo: {fla_chnl_drop}")

    ## Solarize
    img_solar = A.augmentations.functional.solarize (img, threshold=128)
    ib_solar = pred_instructblip(img_solar, prompt)
    fla_solar = pred_openflamingo(Image.fromarray(img_solar), prompt)
    # fla_solar = 'xxxxxx'
    plt_imgs.append(img_solar)
    plt_titles.append(f"Solarize {'WARNING COLOR QUESTION' if row['type'] == 2 else ''} \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_solar} -- OpenFlamingo: {fla_solar}")

    ## Elastic transform - distorion
    elastic = A.ReplayCompose([A.ElasticTransform(alpha=5, sigma=50, always_apply=True)])
    img_elastic = elastic(image=img)['image']
    ib_elastic = pred_instructblip(img_elastic, prompt)
    fla_elastic = pred_openflamingo(Image.fromarray(img_elastic), prompt)
    # fla_elastic = 'xxxxxx'
    plt_imgs.append(img_elastic)
    plt_titles.append(f"Elastic transform \n Q: {prompt} -- A: {row['answer']} \n InstructBlip: {ib_elastic} -- OpenFlamingo: {fla_elastic}")

    ## plot
    plot_row_images(
        images=plt_imgs,
        titles=plt_titles
    )



### MLLM Hallucination with Image Augmentation Evaluation

In [6]:
qa_df

Unnamed: 0,img_id,question,answer,type
0,299254,what is the man holding a snowboard on top of ...,hill,0
1,6415,what are sitting on the counter in different s...,carrots,0
2,465893,what is coming down the tracks,train,0
3,53529,where does the person sit next to a top with a...,car,3
4,398005,what are standing in tall dry grass look at th...,zebras,0
...,...,...,...,...
38943,406426,what are there grouped together here,vegetables,0
38944,545581,what serves as the train trestle,bridge,0
38945,40404,what is the color of the plate,white,2
38946,570521,what is sleeping on the blue couch,cat,0


In [None]:

for i, row in qa_df.iterrows():
    try:
        img = coco.loadImgs([row['img_id']])[0]
        img = io.imread(img['coco_url'])
        prompt = row['question'] + '?'
        
        ## Original Image
        qa_df.at[i, 'ib_original'] = str(pred_instructblip(img, prompt))
        qa_df.at[i, 'fla_original'] = str(pred_openflamingo(Image.fromarray(img), prompt))
    
        ## Blurred small
        blur1 = A.ReplayCompose([A.Blur(blur_limit=[15]*2, always_apply=True)])
        img_blur1 = blur1(image=img)['image']
        qa_df.at[i, 'ib_blurSm'] = str(pred_instructblip(img_blur1, prompt))
        qa_df.at[i, 'fla_blurSm'] = str(pred_openflamingo(Image.fromarray(img_blur1), prompt))
        
        ## Blurred medium
        blur2 = A.ReplayCompose([A.Blur(blur_limit=[20]*2, always_apply=True)])
        img_blur2 = blur2(image=img)['image']
        qa_df.at[i, 'ib_blurMd'] = str(pred_instructblip(img_blur2, prompt))
        qa_df.at[i, 'fla_blurMd'] = str(pred_openflamingo(Image.fromarray(img_blur2), prompt))
    
        ## Blurred strong
        blur3= A.ReplayCompose([A.Blur(blur_limit=[22]*2, always_apply=True)])
        img_blur3 = blur3(image=img)['image']
        qa_df.at[i, 'ib_blurLg'] = str(pred_instructblip(img_blur3, prompt))
        qa_df.at[i, 'fla_blurLg'] = str(pred_openflamingo(Image.fromarray(img_blur3), prompt))
    
        ## Channell dropout
        chnl_drop = A.ReplayCompose([A.ChannelDropout(always_apply=True)])
        img_chnl_drop = chnl_drop(image=img)['image']
        qa_df.at[i, 'ib_chnlDrp'] = str(pred_instructblip(img_chnl_drop, prompt))
        qa_df.at[i, 'fla_chnlDrp'] = str(pred_openflamingo(Image.fromarray(img_chnl_drop), prompt))
        
        ## Solarize
        img_solar = A.augmentations.functional.solarize (img, threshold=128)
        qa_df.at[i, 'ib_solar'] = str(pred_instructblip(img_solar, prompt))
        qa_df.at[i, 'fla_solar'] = str(pred_openflamingo(Image.fromarray(img_solar), prompt))
    
        ## Elastic transform - distorion
        elastic = A.ReplayCompose([A.ElasticTransform(alpha=5, sigma=50, always_apply=True)])
        img_elastic = elastic(image=img)['image']
        qa_df.at[i, 'ib_elastic'] = str(pred_instructblip(img_elastic, prompt))
        qa_df.at[i, 'fla_elastic'] = str(pred_openflamingo(Image.fromarray(img_elastic), prompt))
    
        if i % 101 == 0:
            print(f'Progress: {i} / {len(qa_df)}')
            qa_df.to_csv('output/image_aug_analysis_cocoqa_val2014.csv', index=None)
    except Exception as e:
        print(e)
        

  qa_df.at[i, 'ib_original'] = str(pred_instructblip(img, prompt))
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
  qa_df.at[i, 'fla_original'] = str(pred_openflamingo(Image.fromarray(img), prompt))
  qa_df.at[i, 'ib_blurSm'] = str(pred_instructblip(img_blur1, prompt))
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
  qa_df.at[i, 'fla_blurSm'] = str(pred_openflamingo(Image.fromarray(img_blur1), prompt))
  qa_df.at[i, 'ib_blurMd'] = str(pred_instructblip(img_blur2, prompt))
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
  qa_df.at[i, 'fla_blurMd'] = str(pred_openflamingo(Image.fromarray(img_blur2), prompt))
  qa_df.at[i, 'ib_blurLg'] = str(pred_instructblip(img_blur3, prompt))
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
  qa_df.at[i, 'fla_blurLg'] = str(pred_openflamingo(Image.fromarray(img_blur3), prompt))
  warn(
  qa_df.at[i, 'ib_chnlDrp'] = str(pred_instructblip(img_chnl_drop, promp

Progress: 0 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
  warn(
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-

Progress: 101 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 202 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 303 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 404 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 505 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 606 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 707 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Invalid image shape. Expected either 4 or 3 dimensions, but got 2 dimensions.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Progress: 808 / 38948


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

Invalid image shape. Expected either 4 or 3 dimensions, but got 2 dimensions.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end gene

In [None]:
qa_df.to_csv('output/image_aug_analysis_cocoqa_val2014.csv', index=None)