In [1]:
import torch
import torchvision.models
import torch.nn as nn
import torchvision.transforms as transforms
from torch.autograd import Variable
import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import pandas as pd
import nltk

token = './Flickr8k_text/Flickr8k.token.txt'
captions = open(token, 'r').read().strip().split('\n')

caption_data = {}
for i, row in enumerate(captions):
    row = row.split('\t')
    row[0] = row[0][:len(row[0])-2]
    if row[0] in caption_data:
        caption_data[row[0]].append(row[1])
    else:
        caption_data[row[0]] = [row[1]]

image_path = './Flickr8k_Dataset/Flickr8k_Dataset/'
images = glob.glob(image_path+'*.jpg')

def split_data(list_images, images):
    temp_train_images = []
    for image in images:
        if image[len(image_path):] in list_images:
            temp_train_images.append(image)

    return temp_train_images

train_images_file = './Flickr8k_text/Flickr_8k.trainImages.txt'
train_images = set(open(train_images_file, 'r').read().strip().split('\n'))
train_images = split_data(train_images, images)

dev_images_file = 'Flickr8k_text/Flickr_8k.devImages.txt'
dev_images = set(open(dev_images_file, 'r').read().strip().split('\n'))
dev_images = split_data(dev_images, images)

train_images = train_images + dev_images

test_images_file = './Flickr8k_text/Flickr_8k.testImages.txt'
test_images = set(open(test_images_file, 'r').read().strip().split('\n'))
test_images = split_data(test_images, images)

def preprocess_input(x):
    x /= 255.
    x -= 0.5
    x *= 2.

    return x

def preprocess(image_path):
    image = image.load_img(image_path, target_size=(299, 299))
    x = image.img_to_array(image)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    return x

class Flatten(nn.Module):
    def forward(self, input):
        
        return input.view(input.size(0), -1)

inception_v3 = torchvision.models.inception_v3(pretrained=True)
inception_v3.fc = Flatten()

inception_v3 = inception_v3.cuda()

data_transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

# cudnn.benchmark = True
inception_v3.eval()

def encode(image_path, data_transform, inception_v3):
    image = Image.open(image_path)
    image = data_transform(image).cuda()
    image_variable = Variable(torch.unsqueeze(image, 0), volatile=True)
    output = inception_v3.forward(image_variable)
    encoded_image = torch.squeeze(output, 0)

    return encoded_image

# encoding_train = {}
# for image in tqdm(train_images):
#     inception_v3 = inception_v3.cuda()
#     encoding_train[image[len(image_path):]] = encode(image, data_transform, inception_v3)
    
# with open("encoded_images_inceptionV3.p", "wb") as encoded_pickle:
#     pickle.dump(encoding_train, encoded_pickle)

encoding_train = pickle.load(open('encoded_images_inceptionV3.p', 'rb'))

# encoding_test = {}
# for image in tqdm(test_images):
#     encoding_test[image[len(image_path):]] = encode(image, data_transform, inception_v3)
    
# with open("encoded_images_test_inceptionV3.p", "wb") as encoded_pickle:
#     pickle.dump(encoding_test, encoded_pickle)
    
encoding_test = pickle.load(open('encoded_images_test_inceptionV3.p', 'rb'))

inception_v3 = None

train_data = {}
for image in train_images:
    if image[len(image_path):] in caption_data:
        train_data[image] = caption_data[image[len(image_path):]]

test_data = {}
for image in test_images:
    if image[len(image_path):] in caption_data:
        test_data[image] = caption_data[image[len(image_path):]]

all_captions = []
for image, captions in train_data.items():
    for caption in captions:
        all_captions.append('<start> ' + caption + ' <end>')

words = [caption.split() for caption in all_captions]

# unique_words = []
# for word in words:
#     unique_words.extend(word)
    
# unique_words = list(set(unique_words))

# with open("unique_words.p", "wb") as pickle_d:
#     pickle.dump(unique_words, pickle_d)

unique_words = pickle.load(open('unique_words.p', 'rb'))
vocab_size = len(unique_words)

word2idx = {value:index for index, value in enumerate(unique_words)}
idx2word = {index:value for index, value in enumerate(unique_words)}

max_len = 0
for caption in all_captions:
    caption_words = caption.split()
    if len(caption_words) > max_len:
        max_len = len(caption_words)

# f = open('flickr8k_training_dataset.txt', 'w')
# f.write("image_id\tcaptions\n")

# for image, captions in train_data.items():
#     for caption in captions:
#         f.write(image[len(image_path):] + "\t" + "<start> " + caption +" <end>" + "\n")

# f.close()

data_file = pd.read_csv('flickr8k_training_dataset.txt', delimiter='\t')
caps = [caption for caption in data_file['captions']]
imgs = [image for image in data_file['image_id']]

samples_per_epoch = 0
for caption in caps:
    samples_per_epoch += len(caption.split()) - 1

def pad_sequences(sequences, maxlen):
    padded_output = []
    for index, sequence in enumerate(sequences):
        diff = maxlen - len(sequence)
        padded_output.append(sequence + [0] * diff)

    return padded_output

# Needs 16.3 GB storage space

# data_file = data_file.sample(frac=1)
# iter = data_file.iterrows()
# caps = []
# imgs = []
# for _ in range(data_file.shape[0]):
#     x = next(iter)
#     caps.append(x[1][1])
#     imgs.append(x[1][0])

# partial_caps = []
# next_words = []
# images = []
# samples = []
# for k in range (10):
#     for j, text in enumerate(tqdm(caps[k*3500:(k+1)*3500])):
#         current_image = encoding_train[imgs[j]]
#         for i in range(len(text.split())-1):
#             partial = [word2idx[txt] for txt in text.split()[:i+1]]
#             partial_caps.append(partial)

#             n = np.zeros(vocab_size)
#             n[word2idx[text.split()[i+1]]] = 1
#             next_words.append(Variable(torch.FloatTensor(n)))

#             images.append(current_image)

#     partial_caps = pad_sequences(partial_caps, max_len)
#     partial_caps = Variable(torch.FloatTensor(partial_caps))

#     torch.save(images, "./dataset/data_"+str(k+1)+"_images")
#     torch.save(partial_caps, "./dataset/data_"+str(k+1)+"_partial_caps")
#     torch.save(next_words, "./dataset/data_"+str(k+1)+"_next_words")

#     partial_caps = []
#     next_words = []
#     images = []

class TimeDistributed(nn.Module):
    def __init__(self, module, batch_first=False):
        super(TimeDistributed, self).__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):

        if len(x.size()) <= 2:
            return self.module(x)

        # Squash samples and timesteps into a single axis
        x_reshape = x.contiguous().view(-1, x.size(-1))  # (samples * timesteps, input_size)

        y = self.module(x_reshape)

        # We have to reshape Y
        if self.batch_first:
            y = y.contiguous().view(x.size(0), -1, y.size(-1))  # (samples, timesteps, output_size)
        else:
            y = y.view(-1, x.size(1), y.size(-1))  # (timesteps, samples, output_size)

        return y

image_embedding_size = 300
word_embedding_size = 300
lstm1_units = 256
lstm2_units = 300

import torch.optim as optim

def train(data_path, model):
    
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(final_model.parameters(), lr=0.001, momentum=0.9)

    training_loss = 0.0

    for epoch in range(1):
        for data_part in range(1, 2):
            images = torch.load(data_path+"data_"+str(data_part)+"_images")
            partial_caps = torch.load(data_path+"data_"+str(data_part)+"_partial_caps")
            next_words = torch.load(data_path+"data_"+str(data_part)+"_next_words")

            for index in tqdm(range(5)): # max(len(images), len(partial_caps), len(next_words))
                image_vector = images[index].cuda()
                caption_vector = partial_caps[index]
                next_word = next_words[index].cuda()

                optimizer.zero_grad()

                predicted_word = model(image_vector.unsqueeze(0), caption_vector.unsqueeze(0))
                loss = criterion(predicted_word, next_word)
                loss.backward()
                optimizer.step()

                training_loss += loss.data[0]

        if i % 10 == 0:
            print(epoch + 1, ":", "Loss:", training_loss)

        training_loss = 0.0

    print('Finished Training')
    
    return model

In [2]:
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.fc = nn.Linear(2048, image_embedding_size).cuda()
        self.relu = nn.ReLU().cuda()

    def forward(self, input):
        temp = self.relu(self.fc(input))
        print (temp.size())
        temp = torch.squeeze(temp, 0)
        return temp.repeat(max_len)

In [3]:
class CaptionModel(nn.Module):
    def __init__(self):
        super(CaptionModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, word_embedding_size).cuda()
        self.lstm = nn.LSTM(word_embedding_size, lstm1_units, 1, batch_first=True).cuda()
        self.td = TimeDistributed(nn.Linear(lstm1_units, word_embedding_size).cuda(), batch_first=True).cuda()

    def forward(self, input):
        embedded_output = self.embedding(torch.LongTensor(input.data).cuda()).cuda()
        output, hidden = self.lstm(embedded_output)
        return self.td(output)

In [4]:
class FinalModel(nn.Module):
    def __init__(self):
        super(FinalModel, self).__init__()
        self.image_model = ImageModel().cuda()
        self.caption_model = CaptionModel().cuda()
        input_size = image_embedding_size + word_embedding_size
        self.lstm = nn.LSTM(input_size, lstm2_units, 1, bidirectional=True, batch_first=True).cuda()
        self.fc = nn.Linear(batch_size*max_len*2*lstm2_units, vocab_size).cuda()
        self.softmax = nn.Softmax().cuda()

    def forward(self, image_vector, caption_vector):
        merged_input = torch.cat((self.image_model(image_vector), self.caption_model(caption_vector)), 1).cuda()
        output, hidden = self.lstm(merged_input.view(-1))
        return self.softmax(self.fc(output))

In [5]:
batch_size = 1
final_model = FinalModel().cuda()

In [6]:
final_model = train("./dataset/", final_model)

  0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([1, 300])





TypeError: torch.LongTensor constructor received an invalid combination of arguments - got (torch.FloatTensor), but expected one of:
 * no arguments
 * (int ...)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.LongTensor viewed_tensor)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.Size size)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.LongStorage data)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (Sequence data)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)


In [28]:
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [29]:
# im = ImageModel()
# cm = CaptionModel()
# print (count_parameters(im), count_parameters(cm), count_parameters(final_model))

614700 3322992 434031407
