In [1]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

#Without fine tunned
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

#With fine tunned
#model = VisionEncoderDecoderModel.from_pretrained("./image-captioning-output/checkpoint-160")

feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

2023-06-19 00:05:36.273851: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at nlpconnect/vit-gpt2-image-captioning were not used when initializing VisionEncoderDecoderModel: ['decoder.transformer.h.9.crossattention.masked_bias', 'decoder.transformer.h.9.crossattention.bias', 'decoder.transformer.h.7.crossattention.masked_bias', 'decoder.transformer.h.6.attn.bias', 'decoder.transformer.h.1.crossattention.bias', 'decoder.transformer.h.7.crossattention.bias', 'decoder.transformer.h.10.attn.bias', 'decoder.transformer.h.2.crossattention.bias', 'decoder.transformer.h.8.attn.bias', 'decoder.transformer.h.6.crossattention.masked_bias', 'decoder.transformer.h.0.crossattention.b

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"

In [3]:
def predict_step(image_paths,max_length,num_beams):
    gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        images.append(i_image)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds




In [5]:
print(" --------------- Zero Shot Evaluation --------------- ")
print("Greedy: ",(predict_step(['../dataset/my_dataset/train/0395.jpg'],24,1)[0]))
print("Beam (N=3): ",(predict_step(['../dataset/my_dataset/train/0395.jpg'],24,3)[0]))
print("Beam (N=5): ",(predict_step(['../dataset/my_dataset/train/0395.jpg'],24,5)[0]))

 --------------- Zero Shot Evaluation --------------- 
Greedy:  a large clock tower towering over a city
Beam (N=3):  a tall clock tower towering over a city
Beam (N=5):  a tall clock tower towering over a city


In [4]:
token_path = "../dataset/my_dataset/captions2.txt"
images_path = '../dataset/my_dataset/train/'

In [5]:
doc = open(token_path,'r').read()
print(doc[:410])

0001.jpg#0 an aerial view of a town with lots of buildings and trees
0001.jpg#1 a town with lots of tall buildings and trees
0001.jpg#2 an aerial view of a town with tall buildings
0001.jpg#3 a group of under construction buildings in a town with lots of trees around
0002.jpg#0 an aerial view of a town with lots of buildings and trees
0002.jpg#1 a town with many buildings and trees
0002.jpg#2 a lot of build


In [6]:
descriptions = dict()
all_image_id = []
for line in doc.split('\n'):
    tokens = line.split()
    if len(line) > 2:
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        if image_id not in all_image_id:
            all_image_id.append(image_id)
        if image_id not in descriptions:
            descriptions[image_id] = list()
        descriptions[image_id].append(image_desc)

In [7]:
print("Total images:",len(all_image_id))

Total images: 1256


In [8]:
import string
table = str.maketrans('', '', string.punctuation)
for key, desc_list in descriptions.items():
    for i in range(len(desc_list)):
        desc = desc_list[i]
        desc = desc.split()
        desc = [word.lower() for word in desc]
        desc = [w.translate(table) for w in desc]
        desc_list[i] =  ' '.join(desc)

In [9]:
vocabulary = set()
for key in descriptions.keys():
        [vocabulary.update(d.split()) for d in descriptions[key]]

print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 878


In [10]:
lines = list()
for key, desc_list in descriptions.items():
    for desc in desc_list:
        lines.append(key + ' ' + desc)
new_descriptions = '\n'.join(lines)

In [11]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(all_image_id,test_size=0.2035)
#train, val = train_test_split(train,test_size=0.177)
print(len(train))
print(len(test))
#print(len(val))

1000
256


In [12]:
train_img = []
for i in train:
    path = images_path + i + '.jpg'
    train_img.append(path)

test_img = []
for i in test:
    path = images_path + i + '.jpg'
    test_img.append(path)

In [13]:
train_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train:
        if image_id not in train_descriptions:
            train_descriptions[image_id] = list()
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        train_descriptions[image_id].append(desc)

print('Descriptions: train = %d' % len(train_descriptions))

Descriptions: train = 1000


In [14]:
test_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in test:
        if image_id not in test_descriptions:
            test_descriptions[image_id] = list()
        desc = ' '.join(image_desc)
        test_descriptions[image_id].append(desc)
    

print('Descriptions: test = %d' % len(test_descriptions))

Descriptions: test = 256


In [15]:
print(train_img[0])
print(test_img[0])

../dataset/my_dataset/train/0644.jpg
../dataset/my_dataset/train/0106.jpg


In [16]:
file_name = "test_img.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in test_img:
        # Write the element to a new line in the file
        file.write(' '.join(element) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'test_img.txt' has been created and populated with the array elements.


In [17]:
test_descriptions

{'0002': ['an aerial view of a town with lots of buildings and trees',
  'a town with many buildings and trees',
  'a lot of buildings in a town surrounded by green trees',
  'an aerial view of a town with many buildings and trees'],
 '0003': ['an aerial view of a town with lots of buildings and trees',
  'a town with lots of tall buildings around',
  'an aerial view of a city with many buildings',
  'a lot of buildings in a town with green trees in background'],
 '0006': ['an aerial view of a town with lots of buildings and trees',
  'many buildings and road surrounded by trees in a town',
  'a rotaty in the middle of town with many buildings and trees around it',
  'a view of a large area with many buildings and trees'],
 '0022': ['an aerial view of urban area with road and buildings',
  'an urban area with road and buildings',
  'an aerial view of urban area with construction site near long road and building',
  'an aerial view of buildings with road and construction sites'],
 '0029

In [18]:
#beam 5
result5 = predict_step(test_img,24,5)

In [19]:
actual = []
predicted5 = []

for i, name in enumerate(test_img):
    predicted5.append(result5[i].split(' '))
    s = name.split('/')[-1]
    num = s.split('.')[0]
    test_cap = test_descriptions[num]
    actual_captions = [caption.split() for caption in test_cap]
    actual.append(actual_captions)
    
    
print(len(predicted5))
print(len(actual))

256
256


In [20]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from nltk.translate import meteor_score

In [21]:
print("Beam 5 score:")
print("BLEU-1: %f" % corpus_bleu(actual, predicted5, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted5, weights=(0.5, 0.5, 0, 0)))
print("BLEU-3: %f" % corpus_bleu(actual, predicted5, weights=(0.33, 0.33, 0.33, 0)))
print("BLEU-4: %f" % corpus_bleu(actual, predicted5, weights=(0.25, 0.25, 0.25, 0.25)))

Beam 5 score:
BLEU-1: 0.707657
BLEU-2: 0.589750
BLEU-3: 0.492671
BLEU-4: 0.403828


In [22]:
all_r1 = []
all_r2 = []
all_rl = []
all_m = []
#s = 0
for i in range(len(actual)):
    #print(s)
    #s+=1
    
    actual_captions = actual[i]
    generated_tokens = predicted5[i]
    #print("predicted: ",generated_tokens)
    rouge = Rouge()
    
    rouge_scores = []
    m_score = []
    reference_tokens = []
    
    for actual_caption in actual_captions:
        reference_tokens = actual_caption
        #print("ref tokens: ",reference_tokens)
        scores = rouge.get_scores(' '.join(generated_tokens), ' '.join(actual_caption))
        rouge_scores.append(scores[0])
        
        meteor_score_value = meteor_score.meteor_score([reference_tokens], generated_tokens)
        m_score.append(meteor_score_value)
    
    #print("rouge:", rouge_scores)
    #print("m:", m_score)
    
    m = sum(m_score)/len(m_score)
    m_score = []
    all_m.append(m)
    
    r1 = []
    r2 = []
    rl = []
    for i, actual_caption in enumerate(actual_captions):
        #print(f"Actual Caption {i+1}: {actual_caption}")
        #print("ROUGE scores:", rouge_scores[i])
        r1.append(rouge_scores[i]['rouge-1']['f'])
        r2.append(rouge_scores[i]['rouge-2']['f'])
        rl.append(rouge_scores[i]['rouge-l']['f'])
        
    all_r1.append(sum(r1)/4)
    all_r2.append(sum(r2)/4)
    all_rl.append(sum(rl)/4)

print("On greedy:")
print("r1: ",sum(all_r1)/len(all_r1))
print("r2: ",sum(all_r2)/len(all_r2))
print("rl: ",sum(all_rl)/len(all_rl))
print("m: ",sum(all_m)/len(all_m))

On greedy:
r1:  0.44776543991327095
r2:  0.2288857440055648
rl:  0.4016774547092962
m:  0.39595949524631224


In [23]:
#beam 3
result3 = predict_step(test_img,24,3)

In [24]:

predicted3 = []

for i, name in enumerate(test_img):
    predicted3.append(result3[i].split(' '))
    
    
print(len(predicted3))
print(len(actual))

256
256


In [25]:
print("Beam 3 score:")
print("BLEU-1: %f" % corpus_bleu(actual, predicted3, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted3, weights=(0.5, 0.5, 0, 0)))
print("BLEU-3: %f" % corpus_bleu(actual, predicted3, weights=(0.33, 0.33, 0.33, 0)))
print("BLEU-4: %f" % corpus_bleu(actual, predicted3, weights=(0.25, 0.25, 0.25, 0.25)))

Beam 3 score:
BLEU-1: 0.720528
BLEU-2: 0.596423
BLEU-3: 0.492296
BLEU-4: 0.400462


In [26]:
all_r1 = []
all_r2 = []
all_rl = []
all_m = []
#s = 0
for i in range(len(actual)):
    #print(s)
    #s+=1
    
    actual_captions = actual[i]
    generated_tokens = predicted3[i]
    #print("predicted: ",generated_tokens)
    rouge = Rouge()
    
    rouge_scores = []
    m_score = []
    reference_tokens = []
    
    for actual_caption in actual_captions:
        reference_tokens = actual_caption
        #print("ref tokens: ",reference_tokens)
        scores = rouge.get_scores(' '.join(generated_tokens), ' '.join(actual_caption))
        rouge_scores.append(scores[0])
        
        meteor_score_value = meteor_score.meteor_score([reference_tokens], generated_tokens)
        m_score.append(meteor_score_value)
    
    #print("rouge:", rouge_scores)
    #print("m:", m_score)
    
    m = sum(m_score)/len(m_score)
    m_score = []
    all_m.append(m)
    
    r1 = []
    r2 = []
    rl = []
    for i, actual_caption in enumerate(actual_captions):
        #print(f"Actual Caption {i+1}: {actual_caption}")
        #print("ROUGE scores:", rouge_scores[i])
        r1.append(rouge_scores[i]['rouge-1']['f'])
        r2.append(rouge_scores[i]['rouge-2']['f'])
        rl.append(rouge_scores[i]['rouge-l']['f'])
        
    all_r1.append(sum(r1)/4)
    all_r2.append(sum(r2)/4)
    all_rl.append(sum(rl)/4)

print("On greedy:")
print("r1: ",sum(all_r1)/len(all_r1))
print("r2: ",sum(all_r2)/len(all_r2))
print("rl: ",sum(all_rl)/len(all_rl))
print("m: ",sum(all_m)/len(all_m))

On greedy:
r1:  0.44873749794374196
r2:  0.22590587618067826
rl:  0.40041263250987397
m:  0.3927816382740277


In [27]:
#beam 1
result1 = predict_step(test_img,24,1)

In [28]:
predicted1 = []

for i, name in enumerate(test_img):
    predicted1.append(result1[i].split(' '))
    
    
print(len(predicted1))
print(len(actual))

256
256


In [29]:
print("Beam 1 score:")
print("BLEU-1: %f" % corpus_bleu(actual, predicted1, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted1, weights=(0.5, 0.5, 0, 0)))
print("BLEU-3: %f" % corpus_bleu(actual, predicted1, weights=(0.33, 0.33, 0.33, 0)))
print("BLEU-4: %f" % corpus_bleu(actual, predicted1, weights=(0.25, 0.25, 0.25, 0.25)))

Beam 1 score:
BLEU-1: 0.715110
BLEU-2: 0.577044
BLEU-3: 0.466591
BLEU-4: 0.373477


In [30]:
all_r1 = []
all_r2 = []
all_rl = []
all_m = []
#s = 0
for i in range(len(actual)):
    #print(s)
    #s+=1
    
    actual_captions = actual[i]
    generated_tokens = predicted1[i]
    #print("predicted: ",generated_tokens)
    rouge = Rouge()
    
    rouge_scores = []
    m_score = []
    reference_tokens = []
    
    for actual_caption in actual_captions:
        reference_tokens = actual_caption
        #print("ref tokens: ",reference_tokens)
        scores = rouge.get_scores(' '.join(generated_tokens), ' '.join(actual_caption))
        rouge_scores.append(scores[0])
        
        meteor_score_value = meteor_score.meteor_score([reference_tokens], generated_tokens)
        m_score.append(meteor_score_value)
    
    #print("rouge:", rouge_scores)
    #print("m:", m_score)
    
    m = sum(m_score)/len(m_score)
    m_score = []
    all_m.append(m)
    
    r1 = []
    r2 = []
    rl = []
    for i, actual_caption in enumerate(actual_captions):
        #print(f"Actual Caption {i+1}: {actual_caption}")
        #print("ROUGE scores:", rouge_scores[i])
        r1.append(rouge_scores[i]['rouge-1']['f'])
        r2.append(rouge_scores[i]['rouge-2']['f'])
        rl.append(rouge_scores[i]['rouge-l']['f'])
        
    all_r1.append(sum(r1)/4)
    all_r2.append(sum(r2)/4)
    all_rl.append(sum(rl)/4)

print("On greedy:")
print("r1: ",sum(all_r1)/len(all_r1))
print("r2: ",sum(all_r2)/len(all_r2))
print("rl: ",sum(all_rl)/len(all_rl))
print("m: ",sum(all_m)/len(all_m))

On greedy:
r1:  0.4327704190950966
r2:  0.2025218452731147
rl:  0.3838776638292628
m:  0.3593393025298319


In [31]:
file_name = "greedy.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in predicted1:
        # Write the element to a new line in the file
        file.write(' '.join(element) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'greedy.txt' has been created and populated with the array elements.


In [32]:
file_name = "beam3.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in predicted3:
        # Write the element to a new line in the file
        file.write(' '.join(element) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'beam3.txt' has been created and populated with the array elements.


In [33]:
file_name = "beam5.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in predicted5:
        # Write the element to a new line in the file
        file.write(' '.join(element) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'beam5.txt' has been created and populated with the array elements.


In [34]:
file_name = "act1.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in actual:
        el = element[0]
        # Write the element to a new line in the file
        file.write(' '.join(el) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'act1.txt' has been created and populated with the array elements.


In [35]:
file_name = "act2.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in actual:
        el = element[1]
        # Write the element to a new line in the file
        file.write(' '.join(el) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'act2.txt' has been created and populated with the array elements.


In [36]:
file_name = "act3.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in actual:
        el = element[2]
        # Write the element to a new line in the file
        file.write(' '.join(el) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'act3.txt' has been created and populated with the array elements.


In [37]:
file_name = "act4.txt"  # Change this to your desired file name
file_path = "./exp6/" + file_name  # Change the path if necessary

# Open the file in write mode
with open(file_path, 'w') as file:
    # Iterate over each element in the array
    for element in actual:
        el = element[3]
        # Write the element to a new line in the file
        file.write(' '.join(el) + '\n')

# Print a message indicating the file creation
print(f"The file '{file_name}' has been created and populated with the array elements.")

The file 'act4.txt' has been created and populated with the array elements.
