In [7]:
import string
import numpy as np
from PIL import Image
import os
import pickle
from pickle import dump, load
import numpy as np
from keras.utils import plot_model
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import img_to_array , load_img
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.layers import concatenate
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout


In [8]:
def load_document(filename):
  file = open(filename,'r')
  data = file.read()
  file.close()
  return data

In [9]:
def load_image_captions(filename):
  file = load_document(filename)
  img_captions = file.split('\n')
  del img_captions[0]
  descriptions = {}
  rearranged = []

  for img_caption in img_captions:
    index = img_caption.find(',')
    img_caption = img_caption[:index] + "\t" + img_caption[index+1:] 
    rearranged.append(img_caption)
  img_captions = rearranged
  
  for img_caption in img_captions:
    image , caption = img_caption.split('\t')
    if image not in descriptions:
      descriptions[image] = [caption]
    else:
      descriptions[image].append(caption)
  return descriptions

In [10]:
def cleaning_captions(captions):
  table = str.maketrans('','',string.punctuation)
  for image,img_captions in captions.items():
    for index, image_caption in enumerate(img_captions):
      image_caption.replace("-","")
      caption_words = image_caption.split()
      caption_words = [word.lower() for word in caption_words]
      caption_words = [word.translate(table) for word in caption_words]
      caption_words = [word for word in caption_words if len(word)>1]
      caption_words = [word for word in caption_words if word.isalpha()]
      image_caption = ' '.join(caption_words)
      captions[image][index] = image_caption
  return captions

In [11]:
def vocabulary(image_captions):
  vocabulary = set()
  for image in image_captions.keys():
    [vocabulary.update(caption.split()) for caption in image_captions[image]]
  return vocabulary

In [12]:
def save_to_file(image_captions,filename):
  image_captions_list = list()
  for image,captions in image_captions.items():
    for caption in captions:
      image_captions_list.append(image + '\t' + caption)
  data = "\n".join(image_captions_list)
  file = open(filename,'w')
  file.write(data)
  file.close()

In [21]:
filename = "/content/captions.txt"
image_caption = load_image_captions(filename)
image_captions = cleaning_captions(image_caption)
language_vocabulary = vocabulary(image_captions)
save_to_file(image_captions,'image_captions.txt')

In [14]:
def extract_features(file_location):
  model = Xception(include_top = False,pooling = 'avg')
  features = {}
  for image in os.listdir(file_location):
    img = file_location + '/' + image 
    img = Image.open(img)
    img = img.resize((299,299))
    img = np.expand_dims(img, axis=0)
    img = img/127.5
    img = img - 1.0
    
    feature = model.predict(img)
    features[image] = feature
  return features
      

file_location = "/content/Flicker8k_Dataset"
features = extract_features(file_location)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [15]:
dump(features,open("Imagefeatures.p","wb"))
len(features)

8091

In [16]:
features

{'2650620212_0586016e0d.jpg': array([[0.11478459, 0.01220256, 0.04452307, ..., 0.09385932, 0.01691082,
         0.8238552 ]], dtype=float32),
 '1998255400_0cd086908f.jpg': array([[0.4464863 , 0.1165757 , 0.05023019, ..., 0.11435186, 0.00787378,
         0.01820921]], dtype=float32),
 '3677613006_4689cb8e4e.jpg': array([[0.        , 0.34320092, 0.        , ..., 0.02530656, 0.01392863,
         0.43037793]], dtype=float32),
 '1389651420_8d95d8f6ed.jpg': array([[0.02839253, 0.2562915 , 0.        , ..., 0.        , 0.01098079,
         0.2142018 ]], dtype=float32),
 '3259883609_6a1b46919e.jpg': array([[3.8384292e-01, 0.0000000e+00, 3.6622601e-04, ..., 4.2943176e-02,
         5.0339578e-03, 1.7239831e-02]], dtype=float32),
 '3189251454_03b76c2e92.jpg': array([[0.0028904 , 0.02617667, 0.        , ..., 0.03504298, 0.01607359,
         0.        ]], dtype=float32),
 '2981702521_2459f2c1c4.jpg': array([[0.01130221, 0.01176566, 0.03118346, ..., 0.05140596, 0.19574302,
         0.        ]], dtyp

In [17]:
def load_imgNames(description):
    return list(description.keys())

In [18]:
def load_clean_captions(filename,imgNames):
    imgCaptions = load_document(filename)
    imgCaptions = imgCaptions.split('\n')
    descriptions = {}
    for imgCaption in imgCaptions:
        words = imgCaption.split()
        if len(words) < 1:
            continue
        
        imgName = words[0]
        imgCap = words[1:]
        
        if imgName in imgNames:
            if imgName not in descriptions:
                descriptions[imgName] = []
            
            imgCap = "<start> " + " ".join(imgCap) + " <end>"
            descriptions[imgName].append(imgCap)
    
    return descriptions
    

In [22]:
clean_ImgCaptions = load_clean_captions('/content/image_captions.txt',load_imgNames(image_captions)[:-1])
imgNames = load_imgNames(clean_ImgCaptions)
# load_imgFeatures(imgNames)

In [23]:
def list_captions(imgCaptions):
    imgNames = imgCaptions.keys()
    captions_list = []
    for imgName in imgNames:
        [captions_list.append(imgCaption) for imgCaption in imgCaptions[imgName]]
    return captions_list

In [24]:
def create_Tokenizer(imgCaptions):
    captions_list = list_captions(imgCaptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions_list)
    return tokenizer

In [25]:
tokenizer = create_Tokenizer(clean_ImgCaptions)
dump(tokenizer,open('tokens.p','wb'))
vocabulary_size = len(tokenizer.word_index) + 1

In [26]:
def max_length(imgCaptions):
    captions_list = list_captions(imgCaptions)
    return max(len(imgCaption.split()) for imgCaption in captions_list)

maxlength = max_length(image_caption)
maxlength

32

In [27]:
vocabulary_size

8764

In [28]:
def create_sequences(tokenizer, max_length, img_captions, feature):
    X1, X2, y = list(), list(), list()
    for img_caption in img_captions:
        
        seq = tokenizer.texts_to_sequences([img_caption])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocabulary_size)[0]
            
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

def data_generator(imgCaptions, features, tokenizer, max_length):
    while 1:
        for key, img_captions in imgCaptions.items():
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, img_captions, feature)
            yield [[input_image, input_sequence], output_word]

In [29]:
[a,b],c = next(data_generator(clean_ImgCaptions, features, tokenizer, maxlength))
a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 8764))

In [30]:
def define_model(vocabulary_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocabulary_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    decoder1 = concatenate([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocabulary_size, activation='softmax')(decoder2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [31]:
model = define_model(vocabulary_size, maxlength)
epochs = 10
steps = len(clean_ImgCaptions)
for i in range(epochs):
    generator = data_generator(clean_ImgCaptions, features, tokenizer, maxlength)
    model.fit(generator, epochs=1, steps_per_epoch= steps, verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 32, 256)      2243584     ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_2[0][0]']                
                                                                                              

KeyboardInterrupt: ignored