In [62]:
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
import os
import PIL.Image
from pickle import dump, load
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from keras.layers.merge import add
from tensorflow.keras.layers import Input, Dense, Flatten, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
#from nltk.translate.bleu_score import corpus_blue

In [2]:
def extract_features(path):
    model = VGG16()
    #model = model.layers.pop()
    model = Model(inputs = model.input, outputs = model.layers[-2].output)
    print(model.summary())
    features = dict()
    for name in os.listdir(path):
        img_path = path + '/' + name
        img = image.load_img(img_path, target_size = (224,224))
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis = 0)
        img = preprocess_input(img)
        feature = model.predict(img, verbose = 0)
        img_name = name.split('.')[0]
        features[img_name] = feature
    return features

In [3]:
path = r'Flicker8k_Dataset'

In [4]:
features = extract_features(path)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)      

In [5]:
print("Total number of features extracted = ", len(features))


Total number of features extracted =  8091


In [6]:
dump(features, open('features.pkl', 'wb'))


In [7]:
def load_doc(filepath):
    file = open(filepath, 'r')
    text = file.read()
    file.close()
    return text

In [8]:
def map_photo_to_desc(descriptions):
    desc_mapping = dict()
    for line in descriptions.split('\n'):
        words = line.split()
        if(len(line) < 2):
            continue
        img_id, img_desc = words[0], words[1:]
        img_id = img_id.split('.')[0]
        img_desc = ' '.join(img_desc)
        if img_id not in desc_mapping:
            desc_mapping[img_id] = list()
        desc_mapping[img_id].append(img_desc)   
    return desc_mapping        

In [9]:
import string

In [10]:
def clean_desc(desc_mapping):
    table = str.maketrans('','', string.punctuation)
    for key, descriptions in desc_mapping.items():
        for i in range(len(descriptions)):
            description = descriptions[i]
            description = description.split()
            description = [word.lower() for word in description]
            description = [word.translate(table) for word in description]
            description = [word for word in description if (len(word) > 1)] 
            description = [word for word in description if (word.isalpha())]
            descriptions[i] = ' '.join(description)
            #new
            #descriptions[key][i] = descriptions
    #return descriptions       

NameError: name 'descriptions' is not defined

In [12]:
def to_vocabulary(descriptions):
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc  

In [13]:
def save_descriptions(descriptions, filepath):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+' '+ desc)
    data = "\n".join(lines)
    file = open(filepath, 'w')
    file.write(data)
    file.close()

In [14]:
file_name = r'extras/Flickr8k.token.txt'
doc = load_doc(file_name)
descriptions = map_photo_to_desc(doc)
print("Loaded = ", len(descriptions))
clean_desc(descriptions)
vocabulary = to_vocabulary(descriptions)
print("Vocabulary length ",len(vocabulary))
save_descriptions(descriptions, "descriptions.txt")

Loaded =  8092
Vocabulary length  8763


In [15]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout

In [16]:
def load_doc(filepath):
    file = open(filepath, 'r')
    text = file.read()
    file.close()
    return text

In [17]:
def load_photo_identifiers(filepath):
    file = load_doc(filepath)
    photos = list()
    for line in file.split('\n'):
        if(len(line) < 1):
            continue
        identifier = line.split('.')[0]
        photos.append(identifier)
    return set(photos)

In [18]:
def load_clean_descriptions(filepath, photos):
    file = load_doc(filepath)
    descriptions = dict()
    for line in file.split('\n'):
        words = line.split()
        img_id, img_desc = words[0], words[1:]
        if img_id in photos:
            if img_id not in descriptions:
                descriptions[img_id] = list()
            desc = 'startseq ' + ' '.join(img_desc) + ' endseq'
            descriptions[img_id].append(desc)
    return descriptions

In [19]:
def load_photo_features(filepath, photos):
    all_features = load(open(filepath, 'rb'))
    features = {k:all_features[k] for k in photos}
    return features

In [20]:
filepath = r"extras/Flickr_8k.trainImages.txt"
train = load_photo_identifiers(filepath)
print("Dataset : ", len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print("Descriptions: train = ", len(train_descriptions))
train_features = load_photo_features('features.pkl', train)
print("Photos: train = ", len(train_features))

Dataset :  6000
Descriptions: train =  6000
Photos: train =  6000


In [21]:
def to_lines(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [22]:
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [23]:
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.pkl','wb'))
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size", vocab_size)

Vocabulary size 7579


In [24]:
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [25]:
max_length = max_length(descriptions)
max_length

32

In [None]:
#real

In [None]:
# #def data_generator(descriptions, photos, tokenizer, max_length):
#     while 1:
#         for key, description_list in descriptions.items():
#             photo = photos[key][0]
#             input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, photo)
#             yield([np.array(input_image), np.array(input_sequence)], np.array(output_word))
            
# #def create_sequences(tokenizer, max_length, desc_list, photo):
#     X1, X2, y = list(), list(), list()
#     for desc in desc_list:
#         seq = tokenizer.texts_to_sequences([desc])[0]
#         for i in range(1, len(seq)):
#             in_seq, out_seq = seq[:i], seq[i]
#             in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
#             out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
#             X1.append(photo)
#             X2.append(in_seq)
#             y.append(out_seq)
#     return np.array(X1), np.array(X2), np.array(y)

In [26]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
	# loop for ever over images
	while 1:
		for key, desc_list in descriptions.items():
			# retrieve the photo feature
			photo = photos[key][0]
			in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
			yield [in_img, in_seq], out_word

In [27]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
	X1, X2, y = list(), list(), list()
	# walk through each description for the image
	for desc in desc_list:
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return np.array(X1), np.array(X2), np.array(y)

In [28]:
generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(47, 4096)
(47, 32)
(47, 7579)


In [29]:
(a,b),c = next(data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))
#a, b, c = create_sequences(tokenizer, max_length, descriptions, photos, vocab_size)
#a.shape, b.shape, c.shape

((47, 4096), (47, 32), (47, 7579))

In [30]:
from tensorflow.keras.utils import plot_model
def define_model(vocab_size, max_length):
    #feature extractor model
    i1 = Input(shape = (4096,))
    fe1 = Dropout(0.5)(i1)
    fe2 = Dense(256, activation = 'relu')(fe1)
    #sequence model
    i2 = Input(shape = (max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero = True)(i2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    #decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation = 'relu')(decoder1)
    out = Dense(vocab_size, activation = 'softmax')(decoder2)
    #Binding them together
    model = Model(inputs = [i1, i2], outputs = out)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    #Model summary
    print(model.summary())
    plot_model(model, to_file = 'model.png', show_shapes = True)
    return model

In [32]:
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import add
from tensorflow.keras.layers import concatenate
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print("Descriptions: train = ", len(train_descriptions))
train_features = load_photo_features('features.pkl', train)
print("Photos: train = ", len(train_features))
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
    model.fit_generator(generator, epochs = 1, steps_per_epoch = steps, verbose = 1)
    model.save('model_' + str(i) + '.h5')

Descriptions: train =  6000
Photos: train =  6000
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 32, 256)      1940224     input_3[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_2[0][0]                    
_____________________________________

In [34]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length, vocab_size))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))

((47, 4096), (47, 32), (47, 7579))

# Testing the Mode

In [35]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import argparse

In [63]:
from keras.preprocessing.image import load_img

def extract_features(filename):
    model = VGG16()
    model.layers.pop()
    model = Model(model.input, outputs = model.layers[-1].output)
    image = load_img(filename, target_size = (224, 224))
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose = 0)
    return feature

In [64]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [65]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(pred)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'end':
            break
    return in_text

In [66]:
tokenizer = load(open('tokenizer.pkl', 'rb'))
max_length = 34

In [75]:
import tensorflow as tf

gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
session = tf.compat.v1.InteractiveSession(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

In [76]:
model = load_model('model_9.h5')
path = 'Check.jpg'
photo = extract_features(path)
description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)

ResourceExhaustedError: OOM when allocating tensor with shape[25088,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]