In [22]:
from img_cap_lib import *
# imports
import torch
import torchvision
import torchtext
from torchtext.vocab import vocab, GloVe, Vectors
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import os
from PIL import Image
import string
from collections import OrderedDict, Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import pickle
import os
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import nltk

# Daten herunterladen

In [2]:
# Fetch the dataset into the "flickr8k_copy" folder using the project helper.
# NOTE(review): judging by the recorded output, the helper skips the download when
# the data is already present — confirm against img_cap_lib.
data_download("flickr8k_copy")

Data already exists at flickr8k


# Modell laden

In [5]:
# Restore the saved training artefacts and rebuild the captioning model from them.
# The checkpoint is mapped onto the CPU so it also loads on machines without a GPU.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
model_stats = torch.load(
    "models/model_stats_1667495086.9660888.pt",
    map_location=torch.device('cpu'),
)
model = load_captioning_model(model_stats)

# Preprocessing

In [7]:
# --- captions ---
# Settings handed to the caption preprocessor.
embedding_dim = 300
min_frequency = 1

captions = pd.read_csv("flickr8k_copy/captions.txt")

# Reuse the embedding matrix and vocabulary stored in the loaded model stats.
caption_preprocessor = CaptionPreprocessor(
    embedding=model_stats['embedding'].embedding_matrix,
    vocabulary=model_stats['embedding'].vocabulary,
    captions=captions,
    embedding_dim=embedding_dim,
    min_frequency=min_frequency,
)
caption_preprocessor.preprocess()

# --- images ---
img_preprocessor = ImagePreprocessor(
    normalize=False,
    image_folder_path="flickr8k_copy",
)
img_preprocessor.preprocess_images()

Shape captions: (40460, 2)
Shape captions after filtering: (40119, 3)
Removed Captions:  341 , in Percent:  0.84


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.captions.caption = self.captions.caption.apply(lambda x: x.strip("."))


transformed_images folder already exists. No preprocessing necessary.


# Datensplit und DataLoader

In [16]:
# Hold out 15 % of the preprocessed captions for testing; the fixed seed keeps
# the split identical across runs.
training_data, test_data = train_test_split(
    caption_preprocessor.captions,
    test_size=0.15,
    random_state=42,
)

# Wrap each split in a dataset that shares the loaded model's embedding.
train_dataset = FlickrDataset(
    captions=training_data,
    embedding=model.embedding,
)
test_dataset = FlickrDataset(
    captions=test_data,
    embedding=model.embedding,
)

# Build the loaders. The test loader is pinned to a batch size of 1 independently
# of `batch_size`, because the Evaluator asserts a batch size of exactly 1.
batch_size = 1
train_loader = FlickrLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
test_loader = FlickrLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
)

In [44]:
# Display the batch size the training loader was configured with.
train_loader.batch_size

1

In [46]:
class Evaluator:
    '''
    Scores a captioning model with sentence-level BLEU, one sample per step.

    For every item of the dataloader the model's output is compared against the
    item's reference caption. Special tokens are stripped from both sides before
    the score is computed.

    Params:
    -------
    model: captioning model - Must provide `eval()`, `forward(images)` and an
        `embedding` with `index_to_caption(...)`. It is switched to eval mode.
    dataloader: iterable yielding (images, captions, lengths, vectorized_captions)
        with a `batch_size` attribute equal to 1.
    device: device the input tensors are moved to before the forward pass.
        NOTE(review): the model itself is not moved here — confirm that the caller
        has placed it on the same device.
    '''

    # Marker tokens that are not part of the actual sentence.
    SPECIAL_TOKENS = ("<SOS>", "<EOS>", "<PAD>")

    def __init__(self, model, dataloader, device):
        # initiate variables
        self.model = model
        self.dataloader = dataloader
        self.device = device
        self.model.eval()
        assert self.dataloader.batch_size == 1, "Batch size must be 1 for evaluation."

    def evaluate(self):
        '''
        Runs the model over the whole dataloader and scores every sample.

        Returns:
        --------
        mean_score: float - Mean BLEU score over all samples (NaN if the
            dataloader yielded nothing).
        scores: list of floats - The per-sample BLEU scores in iteration order.
        '''
        scores = []

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for images, _captions, _lengths, vectorized_captions in self.dataloader:
                # move to device
                images = images.to(self.device)
                vectorized_captions = vectorized_captions.to(self.device)

                # forward pass; batch size is 1, so take the only sample
                output = self.model.forward(images)[0]
                candidate = self.output_to_sentence(output)
                reference = self.output_to_sentence(
                    self.model.embedding.index_to_caption(vectorized_captions).permute(1, 0)[0]
                )

                # Score through the shared helper so the reference is wrapped
                # correctly and smoothing is applied consistently.
                scores.append(self.bleu_score(reference, candidate))

        if not scores:
            # Avoid numpy's "mean of empty slice" warning; the result stays NaN.
            return float("nan"), scores
        return np.mean(scores), scores

    @staticmethod
    def output_to_sentence(output:list):
        '''
        Removes the special tokens (<SOS>, <EOS>, <PAD>) from a token sequence.

        Params:
        -------
        output: iterable of strings - Tokens of one sentence.

        Returns:
        --------
        list of strings - The tokens without special tokens, order preserved.
        '''
        return [token for token in output if token not in Evaluator.SPECIAL_TOKENS]

    @staticmethod
    def bleu_score(reference, candidate):
        '''
        Calculates the BLEU score for a single reference and candidate. Uses the SmoothingFunction for smoothing when no overlap between certain n-grams is found.

        Params:
        -------
        reference: list of strings - The reference sentence. A list of several
            tokenized reference sentences (list of lists) is accepted as well.
        candidate: list of strings - The candidate sentence.

        Returns:
        --------
        bleu_score: float - The BLEU score (0.0 for an empty candidate).
        '''
        if len(candidate) == 0:
            # Nothing was generated, so there is no n-gram overlap to measure.
            return 0.0

        # sentence_bleu expects a LIST of reference sentences. Passing a flat token
        # list would make NLTK treat every token string as its own reference.
        if len(reference) == 0 or isinstance(reference[0], str):
            references = [reference]
        else:
            references = reference

        # calculate the BLEU score
        smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
        return nltk.translate.bleu_score.sentence_bleu(references, candidate, smoothing_function=smoothing)
            
            