In [1]:
# import python file from parent folder
from img_cap_lib import *
# imports
import torch
import torchvision
import torchtext
from torchtext.vocab import vocab, GloVe, Vectors
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import os
from PIL import Image
import string
from collections import OrderedDict, Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import pickle
import os
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import nltk

In [4]:
# Make Google Drive available inside the Colab runtime; later cells read and
# write files below "drive/MyDrive" (relative to the working directory).
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Daten herunterladen

In [2]:
# Fetch the dataset through the data_download helper (presumably provided by
# the img_cap_lib star import). "flickr8k" is also the folder name that the
# preprocessing cell reads from.
FLICKR_FOLDER = "flickr8k"
data_download(FLICKR_FOLDER)

Data already exists at flickr8k


# Preprocessing

In [3]:
# --- caption preprocessing -------------------------------------------------
# Settings handed to CaptionPreprocessor; both names are kept at notebook
# level so that later cells can refer to them.
embedding_dim, min_frequency = 300, 1

captions = pd.read_csv("flickr8k/captions.txt")
caption_preprocessor = CaptionPreprocessor(
    captions=captions,
    embedding_dim=embedding_dim,
    min_frequency=min_frequency,
)
caption_preprocessor.preprocess()

# --- image preprocessing ---------------------------------------------------
img_preprocessor = ImagePreprocessor(
    normalize=True,
    image_folder_path="flickr8k",
)
img_preprocessor.preprocess_images()

Shape captions: (40460, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Shape captions after filtering: (39749, 3)
Removed Captions:  711 , in Percent:  1.76
transformed_images folder already exists. No preprocessing necessary.


# Data Split

In [4]:
batch_size = 64

# Split the preprocessed caption rows 85% / 15% with a fixed seed so the split
# itself is reproducible.
# NOTE(review): the split is made over caption rows. If several rows belong to
# the same image, that image can end up in both splits (train/test leakage) —
# consider a group-wise split by image. TODO confirm against the captions schema.
training_data, test_data = train_test_split(
    caption_preprocessor.captions,
    test_size=0.15,
    random_state=42,
)

embedding = Embedding(
    embedding_matrix=caption_preprocessor.embedding,
    vocabulary=caption_preprocessor.vocabulary,
)

# create dataset
train_dataset = FlickrDataset(captions=training_data, embedding=embedding)
test_dataset = FlickrDataset(captions=test_data, embedding=embedding)

# create dataloader
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# The held-out loader is NOT shuffled: combined with drop_last=True, shuffling
# would drop a different set of samples on every pass, so held-out metrics
# would not be comparable between runs.
# drop_last=True is kept because the model is constructed with a fixed
# batch_size — presumably it expects full batches; TODO confirm.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# Modell erstellen

In [None]:
# The encoder's output size and the decoder's input size are both 300 in this
# configuration, so the value is named once and reused.
feature_size = 300

encoder = EncoderCNN(
    net=torchvision.models.resnext50_32x4d,
    pretrained_weights=torchvision.models.ResNeXt50_32X4D_Weights.IMAGENET1K_V2,
    output_size=feature_size,
)
decoder = DecoderRNN(
    input_size=feature_size,
    hidden_size=caption_preprocessor.embedding_dim,
    num_layers=1,
    dropout=0.0,
    # vocabulary size = number of rows of the embedding matrix
    len_vocab=embedding.embedding_matrix.shape[0],
    len_subtract=0,
)

# Combine encoder, decoder and embedding into the captioning model.
model = ImageCaptioning(
    encoder=encoder,
    decoder=decoder,
    embedding=embedding,
    batch_size=batch_size,
)

In [None]:
# train model
learning_rate = 0.0001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model returns the object that is persisted below as the checkpoint.
model_stats = model.train_model(
    loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=250,
    print_every=1000,
)

# save model dict
checkpoint_path = "drive/MyDrive/without_eos.pt"
torch.save(model_stats, checkpoint_path)

Epoch: 1/250 | Batch: 1/527 | Loss: 8.97307014465332
Epoch: 1/250 | Average Epoch Loss: 7.0130262908718395
Epoch: 2/250 | Batch: 1/527 | Loss: 6.673590183258057
Epoch: 2/250 | Average Epoch Loss: 6.482943236262342
Epoch: 3/250 | Batch: 1/527 | Loss: 6.243509292602539
Epoch: 3/250 | Average Epoch Loss: 6.282781223644568
Epoch: 4/250 | Batch: 1/527 | Loss: 6.2735137939453125
Epoch: 4/250 | Average Epoch Loss: 6.127988313135657
Epoch: 5/250 | Batch: 1/527 | Loss: 6.0735249519348145
Epoch: 5/250 | Average Epoch Loss: 6.000287556331569
Epoch: 6/250 | Batch: 1/527 | Loss: 5.871740818023682
Epoch: 6/250 | Average Epoch Loss: 5.891066057179866
Epoch: 7/250 | Batch: 1/527 | Loss: 5.835323810577393
Epoch: 7/250 | Average Epoch Loss: 5.7913342436079285
Epoch: 8/250 | Batch: 1/527 | Loss: 5.660100936889648
Epoch: 8/250 | Average Epoch Loss: 5.701445124407194
Epoch: 9/250 | Batch: 1/527 | Loss: 5.768121719360352
Epoch: 9/250 | Average Epoch Loss: 5.617565630056826
Epoch: 10/250 | Batch: 1/527 | Los

# Evaluierung

In [12]:
class Evaluator:
    '''
    Scores a captioning model with sentence-level BLEU over a dataloader.

    Every generated caption is compared with the reference caption of the
    same sample; the per-sample scores and their mean are returned.
    '''

    def __init__(self, model, dataloader, device, eval_mode=False):
        '''
        Params:
        -------
        model: captioning model - Must provide `forward(images)` and a `words` lookup
            that can be indexed with the vectorized captions.
        dataloader: iterable - Yields (images, captions, lengths, vectorized_captions) batches.
        device: torch.device - Device the images are moved to before the forward pass.
        eval_mode: bool - If True, `model.eval()` is called once on construction.
            Defaults to False, which leaves the model's mode untouched.
        '''
        # initiate variables
        self.model = model
        self.dataloader = dataloader
        self.device = device
        # NOTE(review): switching to eval mode is opt-in because it is unknown
        # from here whether the model's forward pass relies on its current mode.
        if eval_mode:
            self.model.eval()

    def evaluate(self, print_every=1):
        '''
        Runs the model on every batch and calculates the BLEU score of each sample.

        Params:
        -------
        print_every: int - Print progress every `print_every` batches. 0 or None disables
            the progress output. The default of 1 prints every batch.

        Returns:
        --------
        mean_score: float - Mean of all BLEU scores (nan if the dataloader yielded no samples).
        scores: list of floats - One BLEU score per evaluated sample, in iteration order.
        '''
        scores = []

        # Scoring needs no gradients; disabling them saves memory and time.
        with torch.no_grad():
            for i, (images, captions, lengths, vectorized_captions) in enumerate(self.dataloader):
                # Only the images are needed on the device. The vectorized captions are
                # used for a CPU-side lookup, so they are not moved to the device first.
                images = images.to(self.device)

                # forward pass
                output = self.model.forward(images)
                # assumes model.words maps token indices to word strings — TODO confirm
                references = self.model.words[vectorized_captions.cpu()]

                for j in range(output.shape[0]):
                    candidate = self.output_to_sentence(output[j,:])
                    reference = self.output_to_sentence(references[j,:])
                    # Argument order must match bleu_score(reference, candidate).
                    scores.append(self.bleu_score(reference, candidate))

                if print_every and (i + 1) % print_every == 0:
                    print(f"Batch: {i+1} of {len(self.dataloader)}")

        # Guard against an empty dataloader (e.g. drop_last=True with too few samples).
        mean_score = np.mean(scores) if scores else float("nan")
        print(f"Average BLEU score: {mean_score}")
        return mean_score, scores

    @staticmethod
    def output_to_sentence(output:list):
        '''
        Removes the special tokens <SOS>, <EOS> and <PAD> from a token sequence.

        Params:
        -------
        output: iterable of strings - The tokens of one sentence.

        Returns:
        --------
        sentence: list of strings - The tokens without special tokens, order preserved.
        '''
        special_tokens = ("<SOS>", "<EOS>", "<PAD>")
        return [token for token in output if token not in special_tokens]

    @staticmethod
    def bleu_score(reference, candidate):
        '''
        Calculates the BLEU score for a single reference and candidate. Uses the SmoothingFunction for smoothing when no overlap between certain n-grams is found.

        Params:
        -------
        reference: list of strings - The reference sentence. A list of several tokenized
            reference sentences (list of lists) is accepted as well.
        candidate: list of strings - The candidate sentence.

        Returns:
        --------
        bleu_score: float - The BLEU score.
        '''
        # An empty candidate shares no n-gram with any reference.
        if len(candidate) == 0:
            return 0.0

        # sentence_bleu expects a LIST of reference sentences. Passing a flat token
        # list would make it treat every word as its own "sentence" of characters.
        if len(reference) > 0 and isinstance(reference[0], (list, tuple)):
            references = reference
        else:
            references = [reference]

        # calculate the BLEU score
        smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
        return nltk.translate.bleu_score.sentence_bleu(references, candidate, smoothing_function=smoothing)

In [13]:
# Restore the trained captioning model from the checkpoint written by the
# training cell. torch.load unpickles the file, so only load trusted checkpoints.
path = "drive/MyDrive/without_eos.pt"
model_stats = torch.load(f=path, map_location=device)  # remap storages onto the active device
model = load_captioning_model(model_stats)

In [14]:
# Batch size used for scoring only.
eval_batch_size = 32

train_dataset = FlickrDataset(captions=training_data, embedding=embedding)
test_dataset = FlickrDataset(captions=test_data, embedding=embedding)

# create dataloader
# Evaluation loaders are not shuffled: the returned per-sample score lists then
# follow the dataset's index order, and (with drop_last=True) the same samples
# are scored on every run. With shuffling, the scores could not be mapped back
# to samples and a different partial batch would be dropped each time.
# NOTE: this rebinds train_loader to an unshuffled loader — re-create a
# shuffled loader before training again.
train_loader = DataLoader(train_dataset, batch_size=eval_batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=eval_batch_size, shuffle=False, drop_last=True)

# calculate bleu scores
train_evaluator = Evaluator(model, train_loader, device)
test_evaluator = Evaluator(model, test_loader, device)

train_bleu, train_scores = train_evaluator.evaluate()
test_bleu, test_scores = test_evaluator.evaluate()

print(f"Train BLEU: {train_bleu}")
print(f"Test BLEU: {test_bleu}")

Batch: 1 of 1055
Batch: 2 of 1055
Batch: 3 of 1055
Batch: 4 of 1055
Batch: 5 of 1055
Batch: 6 of 1055
Batch: 7 of 1055
Batch: 8 of 1055
Batch: 9 of 1055
Batch: 10 of 1055
Batch: 11 of 1055
Batch: 12 of 1055
Batch: 13 of 1055
Batch: 14 of 1055
Batch: 15 of 1055
Batch: 16 of 1055
Batch: 17 of 1055
Batch: 18 of 1055
Batch: 19 of 1055
Batch: 20 of 1055
Batch: 21 of 1055
Batch: 22 of 1055
Batch: 23 of 1055
Batch: 24 of 1055
Batch: 25 of 1055
Batch: 26 of 1055
Batch: 27 of 1055
Batch: 28 of 1055
Batch: 29 of 1055
Batch: 30 of 1055
Batch: 31 of 1055
Batch: 32 of 1055
Batch: 33 of 1055
Batch: 34 of 1055
Batch: 35 of 1055
Batch: 36 of 1055
Batch: 37 of 1055
Batch: 38 of 1055
Batch: 39 of 1055
Batch: 40 of 1055
Batch: 41 of 1055
Batch: 42 of 1055
Batch: 43 of 1055
Batch: 44 of 1055
Batch: 45 of 1055
Batch: 46 of 1055
Batch: 47 of 1055
Batch: 48 of 1055
Batch: 49 of 1055
Batch: 50 of 1055
Batch: 51 of 1055
Batch: 52 of 1055
Batch: 53 of 1055
Batch: 54 of 1055
Batch: 55 of 1055
Batch: 56 of 1055
B

In [None]:
# export bleu scores
# The files go to the same "drive/MyDrive" folder that the checkpoint cells use
# (the original "drive/My Drive" spelling was inconsistent with them).
for file_name, split_scores in (
    ("train_scores_without_eos.pkl", train_scores),
    ("test_scores_without_eos.pkl", test_scores),
):
    with open(f"drive/MyDrive/{file_name}", "wb") as f:
        pickle.dump(split_scores, f)