In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
import torchvision.transforms as T
import warnings
warnings.filterwarnings("ignore")
device = ("cuda"if torch.cuda.is_available() else "cpu")
# Đường dẫn tới model đã lưu
save_path = "/lastest_model.pth"

# Khởi tạo model với kiến trúc đã dùng
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"
checkpoint = torch.load(save_path, map_location=device)
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    image_encoder_model, text_decode_model
)
# Khởi tạo feature extractor và tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(text_decode_model)
feature_extractor = ViTFeatureExtractor.from_pretrained(image_encoder_model)
# feature_extractor
# Load trọng số đã lưu vào model
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
print(checkpoint["epoch"])
model.config.pad_token_id = tokenizer.pad_token_id  
model.config.eos_token_id = tokenizer.eos_token_id 
model.config.decoder_start_token_id = tokenizer.bos_token_id  

In [None]:
model.eval()
from transformers import ViTFeatureExtractor, GPT2Tokenizer
from PIL import Image
# Ví dụ: ảnh test
image_path = "/kaggle/input/all-new-images/Sydney-Galaxy-homebush.jpg"
image = Image.open(image_path)
transforms = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(), # chuyển về tensor và chuẩn hóa [0,1]
])


image = transforms(image)  # Xử lý ảnh qua các transform


test_inputs = feature_extractor(image, return_tensors="pt",do_rescale = False)
pixel_values = test_inputs.pixel_values


attention_mask = (pixel_values != tokenizer.eos_token_id).long()

with torch.no_grad():
    outputs = model.generate(pixel_values= pixel_values.to(device),attention_mask=attention_mask.to(device), max_length=20, num_beams=8, early_stopping=True)
    caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
import matplotlib.pyplot as plt
plt.figure(figsize = (6,6))
plt.imshow(transforms(Image.open(image_path)).permute(1,2,0))
plt.title(caption)

In [18]:
import nltk
import torch
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import corpus_bleu
from torch.utils.data import Subset
import random
import os
import pandas as pd
from PIL import Image
tokenizer.pad_token = tokenizer.eos_token
# Link chứa dataset
caption_file = "/kaggle/input/flickr8k/captions.txt"  
root_dir = "/kaggle/input/flickr8k/Images"  

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class FlickrDataset(Dataset):
    def __init__(self, root_dir, captions_file, tokenizer, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)
        self.transform = transform
        
        # Get image and caption column from the dataframe
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        caption = self.captions[idx] # lấy caption dựa trên index
        img_name = self.imgs[idx] # lấy tên ảnh dựa trên index 

        # Tạo đường dẫn đến tệp ảnh dựa vào tên ảnh đã có và đổi về dạng RGB
        img_location = os.path.join(self.root_dir, img_name)
        img = Image.open(img_location).convert("RGB")
        if not isinstance(caption, str):
            caption = str(caption)
        if self.transform is not None:
            img = self.transform(img)    
    
        return img, caption


transforms = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
])


dataset = FlickrDataset(root_dir=root_dir, captions_file=caption_file, tokenizer=tokenizer, transform=transforms)
random_indices = random.sample(range(len(dataset)), 2000)
subset_dataset = Subset(dataset, random_indices)

dataloader = DataLoader(subset_dataset, batch_size=1, shuffle=False)

gc = [] 
test = [] 
    
for i in range(len(dataloader)):
    img, actual_caption = subset_dataset[i] 

    with torch.no_grad():
           
        test_inputs = feature_extractor(img, return_tensors="pt", do_rescale=False)
        pixel_values = test_inputs.pixel_values

        attention_mask = (pixel_values != tokenizer.eos_token_id).long()
        outputs = model.generate(pixel_values=pixel_values.to(device),
                                    attention_mask=attention_mask.to(device),
                                    max_length=20, num_beams=8, early_stopping=True,pad_token_id = tokenizer.eos_token_id)
            
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #print(caption)
    
        test.append(caption.split()) 
        gc.append([actual_caption.split()]) 
#print(test[0])
#print(gc[0])

# Tính ma trận Bleu
print("Nltk metrics")
BLEU4 = nltk.translate.bleu_score.corpus_bleu(gc, test, weights=(0.25, 0.25, 0.25, 0.25))
BLEU1 = nltk.translate.bleu_score.corpus_bleu(gc, test, weights=(1.0, 0, 0, 0))
BLEU2 = nltk.translate.bleu_score.corpus_bleu(gc, test, weights=(0.5, 0.5, 0, 0))
BLEU3 = nltk.translate.bleu_score.corpus_bleu(gc, test, weights=(0.33, 0.33, 0.33, 0))
    
print(f"BLEU-1: {BLEU1}")
print(f"BLEU-2: {BLEU2}")
print(f"BLEU-3: {BLEU3}")
print(f"BLEU-4: {BLEU4}")



['A', 'black', 'dog', 'is', 'running', 'after', 'a', 'brown', 'dog', 'on', 'the', 'beach', '.']
[['On', 'a', 'beach', ',', 'black', 'and', 'brown', 'dog', 'runs', ',', 'brown', 'dog', 'jumps', '.']]
Nltk metrics
BLEU-1: 0.5175966284859593
BLEU-2: 0.3757594101171124
BLEU-3: 0.29695747328856376
BLEU-4: 0.238930920107841
