In [1]:
from os import listdir
from pickle import dump
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu
from pickle import load

import matplotlib.pyplot as plt
import tensorflow as tf

import os
from PIL import Image

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding, Dropout, LSTM, add
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.applications.vgg16 import decode_predictions
from tqdm import tqdm
import cv2
import PIL
import numpy as np

Using TensorFlow backend.


In [2]:
# Load the complete VGG16 network (classifier head included) with ImageNet weights.
model = VGG16(include_top=True, weights='imagenet')

In [3]:
# Re-wire the network so that it outputs the activations of the 'fc2' layer
# instead of the class predictions. `model` is rebound to this truncated
# network, which is what extract_features() calls predict() on.
transfer_layer = model.get_layer('fc2')
model = Model(inputs=model.input, outputs=transfer_layer.output)

In [4]:
# Load the cached photo features from disk.
# NOTE(review): 'features.pkl' is produced by the feature-extraction cell
# further down (dump(features, ...)), so on a fresh checkout this cell only
# works after that cell has been run at least once.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# feature files that were produced by this notebook.
with open('features.pkl', 'rb') as feature_file:
    features = load(feature_file)

In [139]:
def extract_features(directory, max_images=5000):
    """Run the photos found in `directory` through the notebook-level `model`.

    The function relies on the global `model`; at call time it must be the
    network whose predictions should be stored (the truncated VGG16 built at
    the top of the notebook).

    Args:
        directory: Folder that contains the image files.
        max_images: Maximum number of files to process. Defaults to 5000,
            the limit that used to be hard-coded.

    Returns:
        A tuple ``(features, image_names)``: ``features`` maps an image id
        (the file name up to its first '.') to the array returned by
        ``model.predict`` for that photo, and ``image_names`` lists the
        processed file names in processing order.
    """
    features = dict()
    image_names = list()
    # listdir() returns entries in arbitrary order; sorting makes the chosen
    # subset and the order of image_names reproducible between runs.
    for i, name in enumerate(sorted(listdir(directory))):
        if i >= max_images:
            break
        # load an image from file, resized to 224x224
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add a leading batch axis of size 1 for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        image = preprocess_input(image)
        # get features
        feature = model.predict(image, verbose=0)
        # the image id is the file name without its extension
        image_id = name.split('.')[0]
        features[image_id] = feature
        image_names.append(name)
        if i % 100 == 0:
            print("completed: ", i, "/%d images" % max_images)
    return features, image_names

In [140]:
 # extract features from all images
directory = 'D:/image_cap/flickr30k-images'
features, image_names = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file; the context manager guarantees the pickle is flushed and
# the handle closed even if dump() raises
with open('features.pkl', 'wb') as feature_file:
    dump(features, feature_file)

# remember which files were processed, one file name per line
data = '\n'.join(image_names)
with open('image_names.txt', 'w') as file:
    file.write(data)

completed:  0 /5000 images
completed:  100 /5000 images
completed:  200 /5000 images
completed:  300 /5000 images
completed:  400 /5000 images
completed:  500 /5000 images
completed:  600 /5000 images
completed:  700 /5000 images
completed:  800 /5000 images
completed:  900 /5000 images
completed:  1000 /5000 images
completed:  1100 /5000 images
completed:  1200 /5000 images
completed:  1300 /5000 images
completed:  1400 /5000 images
completed:  1500 /5000 images
completed:  1600 /5000 images
completed:  1700 /5000 images
completed:  1800 /5000 images
completed:  1900 /5000 images
completed:  2000 /5000 images
completed:  2100 /5000 images
completed:  2200 /5000 images
completed:  2300 /5000 images
completed:  2400 /5000 images
completed:  2500 /5000 images
completed:  2600 /5000 images
completed:  2700 /5000 images
completed:  2800 /5000 images
completed:  2900 /5000 images
completed:  3000 /5000 images
completed:  3100 /5000 images
completed:  3200 /5000 images
completed:  3300 /5000

In [150]:
# load doc into memory
def load_doc(filename, encoding="utf8"):
    """Return the complete text content of `filename`.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file (default "utf8").

    Returns:
        The file content as a single string.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [151]:
# Raw caption file; its text is parsed line by line by load_descriptions().
filename = "D:/image_cap/results_20130124.txt"
# load descriptions
doc = load_doc(filename)

In [152]:
# extract descriptions for images
def load_descriptions(doc):
    """Parse raw caption text into a mapping of image id -> list of captions.

    Every usable line is split on whitespace. The first token is the image
    identifier (everything from its first '.' onwards is dropped); the
    remaining tokens are joined with single spaces to form the caption.

    Args:
        doc: Full text of the caption file.

    Returns:
        dict mapping each image id to the list of its captions, in the order
        in which they appear in `doc`.
    """
    mapping = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # Skip very short lines and lines made of whitespace only; the latter
        # have no tokens and used to raise IndexError on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename extension (and anything after it) from the image id
        image_id = image_id.split('.')[0]
        # create the list on first sight of the id, then store the caption
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

In [153]:
# parse descriptions into {image id: [captions]} and report the number of
# distinct image ids that were found
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 31783 


In [154]:
import string
 
def clean_descriptions(descriptions):
    """Normalise every caption stored in `descriptions`, in place.

    Each caption is split on whitespace; every word is lower-cased and
    stripped of ASCII punctuation. Words that end up with fewer than two
    characters, or that contain anything other than letters, are dropped.
    The surviving words are joined with single spaces and written back to
    the same position of the caption list.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        None; the lists inside `descriptions` are modified directly.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            words = (tok.lower().translate(strip_punct) for tok in caption.split())
            # drop one-letter leftovers (hanging 's', 'a') and tokens with digits
            captions[idx] = ' '.join(w for w in words if len(w) > 1 and w.isalpha())

In [155]:
# clean descriptions (the caption lists inside `descriptions` are rewritten in place)
clean_descriptions(descriptions)

In [156]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        A set with every word that occurs in at least one caption.
    """
    return {
        word
        for captions in descriptions.values()
        for caption in captions
        for word in caption.split()
    }

In [157]:
# summarize vocabulary: the distinct words across all cleaned captions
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 19735


In [158]:
def save_descriptions(descriptions, filename):
    """Write all captions to `filename`, one "<image id> <caption>" per line.

    Lines are separated by '\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the text file to (over)write.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]
    data = '\n'.join(lines)
    # NOTE(review): the file is written with the platform default encoding,
    # matching the default-encoding reader that loads it back later on.
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [159]:
# save the cleaned descriptions; this file is read back by load_clean_descriptions()
save_descriptions(descriptions, 'descriptions.txt')

In [5]:
# load doc into memory
# NOTE(review): this redefines the load_doc() declared earlier in the
# notebook. By default it decodes with the platform default encoding,
# whereas the earlier definition opened the file as UTF-8.
def load_doc(filename, encoding=None):
    """Return the complete text content of `filename`.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to open(); None (the default) keeps
            the platform default encoding.

    Returns:
        The file content as a single string.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# load a pre-defined list of photo identifiers
def load_set(filename):
    """Read a file of image names and return the set of their identifiers.

    The identifier of a line is the text before its first '.'; empty lines
    are ignored.

    Args:
        filename: Path of a text file with one image file name per line.

    Returns:
        set of identifier strings.
    """
    text = load_doc(filename)
    return {entry.split('.')[0] for entry in text.split('\n') if entry}

# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in `dataset`, wrapped in sequence markers.

    Every non-blank line of `filename` is split on whitespace; the first
    token is the image id and the rest is the caption. Captions of ids that
    are in `dataset` are stored as 'startseq <caption> endseq'.

    Args:
        filename: Path of the cleaned descriptions file.
        dataset: Container of image ids to keep (membership is tested with `in`).

    Returns:
        dict mapping each kept image id to the list of its wrapped captions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # A blank line (or an empty file) yields no tokens; without this
        # guard tokens[0] raises IndexError.
        if not tokens:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # wrap description in the start/end markers and store it
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, []).append(desc)
    return descriptions

# load photo features
def load_photo_features(filename, dataset):
    """Load pickled photo features and keep only the entries for `dataset`.

    Args:
        filename: Path of the pickle file holding a mapping id -> feature.
        dataset: Iterable of image ids to keep.

    Returns:
        dict with one entry per id in `dataset`.

    Raises:
        KeyError: if an id in `dataset` has no entry in the pickle file.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # open feature files that were produced by this notebook.
    # The context manager makes sure the handle is closed again.
    with open(filename, 'rb') as handle:
        all_features = load(handle)
    # filter features
    return {k: all_features[k] for k in dataset}

In [8]:
# Read the list of processed image file names; it is split into train / dev /
# test subsets by train_test_seperation(), which takes the first 50 lines for
# training, the next 10 for test and the next 10 for dev.
# NOTE(review): `dataset` holds the raw text of the file here, not a
# collection of ids.
filename = "image_names.txt"
dataset = load_doc(filename)

In [9]:
def train_test_seperation(doc, n_train=50, n_test=10, n_dev=10):
    """Split the lines of `doc` into consecutive train, test and dev chunks.

    The first `n_train` lines form the training list, the following
    `n_test` lines the test list and the `n_dev` lines after that the dev
    list; any remaining lines are ignored. A chunk is shorter (or empty)
    when `doc` does not contain enough lines.

    Args:
        doc: Text with one image file name per line, separated by '\n'.
        n_train: Number of lines for the training list (default 50).
        n_test: Number of lines for the test list (default 10).
        n_dev: Number of lines for the dev list (default 10).

    Returns:
        Tuple ``(train_images, dev_images, test_images)`` of lists of lines.
        Note the order: dev comes before test in the returned tuple.
    """
    lines = doc.split('\n')
    test_start = n_train
    dev_start = test_start + n_test
    dev_end = dev_start + n_dev
    train_images = lines[:test_start]
    test_images = lines[test_start:dev_start]
    dev_images = lines[dev_start:dev_end]
    return train_images, dev_images, test_images

In [10]:
# Split the image names; the function returns the lists in the order
# (train, dev, test).
train_images, dev_images, test_images = train_test_seperation(dataset)

In [11]:
def save_images(filename, doc):
    """Write the strings in `doc` to `filename`, one per line.

    Lines are joined with '\n' and no trailing newline is written.

    Args:
        filename: Path of the text file to (over)write.
        doc: Iterable of strings (image file names).
    """
    data = '\n'.join(doc)
    # the context manager closes the handle even if write() raises
    with open(filename, 'w') as file:
        file.write(data)

In [12]:
# Persist the three splits, one image file name per line; the files are read
# back with load_set() in the following cells.
save_images('train_images.txt', train_images)
save_images('test_images.txt', test_images)
save_images('dev_images.txt', dev_images)

In [13]:
# load training dataset (the image ids listed in train_images.txt)
filename = 'train_images.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions, each wrapped in 'startseq' / 'endseq'
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))

Dataset: 50
Descriptions: train=50
Photos: train=50


In [14]:
# load dev dataset (the image ids listed in dev_images.txt)
filename = 'dev_images.txt'
dev = load_set(filename)
print('Dataset: %d' % len(dev))
# descriptions, each wrapped in 'startseq' / 'endseq'
dev_descriptions = load_clean_descriptions('descriptions.txt', dev)
print('Descriptions: dev=%d' % len(dev_descriptions))
# photo features
dev_features = load_photo_features('features.pkl', dev)
print('Photos: dev=%d' % len(dev_features))

Dataset: 10
Descriptions: dev=10
Photos: dev=10


In [15]:
# load testing dataset (the image ids listed in test_images.txt)
filename = 'test_images.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions, each wrapped in 'startseq' / 'endseq'
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

Dataset: 10
Descriptions: test=10
Photos: test=10


In [16]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten the caption lists of `descriptions` into one list.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        A new list with every caption, grouped by image in dict order.
    """
    return [caption for captions in descriptions.values() for caption in captions]
 
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Build a Tokenizer and fit it on every caption in `descriptions`.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        The fitted Tokenizer instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))
    return caption_tokenizer

In [17]:
# prepare tokenizer (fitted on the training captions only)
tokenizer = create_tokenizer(train_descriptions)
# one more than the number of known words; this value is used as the number
# of output classes and as the Embedding input size of the model
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# persist the word -> index mapping; the context manager closes the file
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer.word_index, tokenizer_file)

Vocabulary Size: 678


In [18]:
def create_sequences(tokenizer, max_length, descriptions, photos):
    """Expand captions into (photo, caption prefix) -> next-word training samples.

    Every caption is encoded with `tokenizer`. For each position i >= 1 of
    the encoded caption one sample is produced: the photo entry, the first
    i word indices padded to `max_length`, and the one-hot encoding of the
    word at position i.

    Args:
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and
            `word_index`.
        max_length: Length to which every input prefix is padded.
        descriptions: dict mapping image id -> list of caption strings.
        photos: dict mapping image id -> feature array; element [0] of the
            stored array is used as the photo input.

    Returns:
        Tuple of numpy arrays ``(X1, X2, y)``: photo inputs, padded prefix
        inputs and one-hot targets, aligned by row.
    """
    # Derive the number of classes from the tokenizer that was passed in
    # rather than from a notebook-level `vocab_size` variable, so the function
    # does not depend on another cell having been run first.
    num_classes = len(tokenizer.word_index) + 1
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=num_classes)[0]
                # store
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [19]:
def max_length(descriptions):
    """Return the number of words in the longest caption of `descriptions`.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        The largest whitespace-separated word count over all captions.

    Raises:
        ValueError: if `descriptions` contains no caption at all.
    """
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)

In [20]:
# Expand the training captions into (photo, padded caption prefix, next word) samples.
X1train, X2train, ytrain = create_sequences(tokenizer, max_length(train_descriptions), train_descriptions, train_features)

In [21]:
# Same expansion for the test split; prefixes are padded to the longest
# *training* caption so the arrays match the model's input length.
X1test, X2test, ytest = create_sequences(tokenizer, max_length(train_descriptions), test_descriptions, test_features)

In [22]:
# Same expansion for the dev split (used as validation data during fit),
# again padded to the longest training caption.
X1dev, X2dev, ydev = create_sequences(tokenizer, max_length(train_descriptions), dev_descriptions, dev_features)

In [23]:
def define_model(vocab_size, max_length):
    """Build and compile the captioning network.

    The network has two inputs: a 4096-element photo feature vector, which
    goes through dropout and a 256-unit dense layer, and a caption prefix of
    `max_length` word indices, which goes through an embedding, dropout and
    a 256-unit LSTM. Both 256-element results are added, passed through a
    dense layer and a softmax over `vocab_size` classes.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        max_length: Length of the caption-prefix input.

    Returns:
        The compiled model (categorical cross-entropy loss, 'adam' optimizer).
    """
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [24]:
# Build the captioning network.
# NOTE: this rebinds `model`, which until here referred to the truncated
# VGG16 network that extract_features() calls predict() on.
model = define_model(vocab_size, max_length(train_descriptions))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 35, 256)      173568      input_3[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_2[0][0]                    
__________________________________________________________________________________________________
dropout_2 

In [25]:
# define checkpoint callback: keep the model whenever val_loss reaches a new minimum
filepath = 'checkpoints/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
# make sure the target folder exists so that saving a checkpoint does not
# fail on a fresh checkout
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [26]:
# fit model for 5 epochs on the training samples, using the dev samples as
# validation data; the checkpoint callback saves the best epochs
model.fit([X1train, X2train], ytrain, epochs=5, verbose=2, callbacks=[checkpoint], validation_data=([X1dev, X2dev], ydev))

Train on 2782 samples, validate on 441 samples
Epoch 1/5
 - 47s - loss: 5.7215 - val_loss: 5.0274

Epoch 00001: val_loss improved from inf to 5.02745, saving model to checkpoints/model-ep001-loss5.721-val_loss5.027.h5
Epoch 2/5
 - 43s - loss: 4.7555 - val_loss: 5.2244

Epoch 00002: val_loss did not improve
Epoch 3/5
 - 40s - loss: 4.1459 - val_loss: 5.0771

Epoch 00003: val_loss did not improve
Epoch 4/5
 - 40s - loss: 3.7140 - val_loss: 4.9898

Epoch 00004: val_loss improved from 5.02745 to 4.98978, saving model to checkpoints/model-ep004-loss3.714-val_loss4.990.h5
Epoch 5/5
 - 37s - loss: 3.4741 - val_loss: 4.9315

Epoch 00005: val_loss improved from 4.98978 to 4.93145, saving model to checkpoints/model-ep005-loss3.474-val_loss4.931.h5


<tensorflow.python.keras._impl.keras.callbacks.History at 0xd8b30dc18>

In [27]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Generate a caption for every image and print corpus BLEU-1 to BLEU-4.

    NOTE(review): this function calls generate_desc(), which is not defined
    in any cell visible in this notebook (the call below fails with
    NameError). It must be defined before this function is used.

    Args:
        model: Captioning model handed to generate_desc().
        descriptions: dict mapping image id -> list of reference captions.
        photos: dict mapping image id -> photo feature entry.
        tokenizer: Tokenizer handed to generate_desc().
        max_length: Maximum caption length handed to generate_desc().

    Returns:
        None; the four scores are printed.
    """
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted as token lists
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # The n-gram weights have to sum to 1 like in the other rows; the former
    # (0.3, 0.3, 0.3, 0) summed to 0.9 and therefore inflated the score.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [28]:
# Score the trained model on the test split.
# NOTE(review): this call failed with "NameError: name 'generate_desc' is not
# defined" - generate_desc() has to be defined before this cell can run.
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length(train_descriptions))

NameError: name 'generate_desc' is not defined

In [None]:
# Look up the stored features of one photo by its image id in the `features`
# dict loaded from features.pkl.
photo_features = features['1000092795']

In [None]:
# Generate a caption for that photo.
# NOTE(review): generate_desc() is not defined in any visible cell; this cell
# was never executed.
desc = generate_desc(model, tokenizer, photo_features, max_length(train_descriptions))

In [None]:
# display the generated caption
desc

In [None]:
# Show the photo the caption was generated for.
# NOTE(review): the path is relative to the working directory, while the
# features were extracted from 'D:/image_cap/flickr30k-images' - confirm that
# the file exists here.
image_file = '1000092795.jpg'
Image.open(image_file)