In [None]:
import string
import numpy as np
import tensorflow as tf
from PIL import Image
import argparse
import os
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import pickle
from pickle import dump, load
import numpy as np
from keras.utils import plot_model
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import img_to_array , load_img
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.layers import concatenate
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout


In [None]:
def load_document(filename):
  """Read a text file and return its entire contents as a single string.

  Args:
    filename: Path of the file to read (opened in text mode).

  Returns:
    The complete file contents as a str.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r') as file:
    return file.read()

In [None]:
def load_image_captions(filename):
  """Parse an "image,caption" text file into a dict of captions per image.

  The first line of the file is treated as a header and skipped. Every other
  line is split at its FIRST comma only, so commas inside the caption text are
  kept. Blank lines and lines without a comma are ignored, which means the
  last caption is kept whether or not the file ends with a newline.

  Args:
    filename: Path of the captions file.

  Returns:
    dict mapping image name -> list of caption strings, in file order.
  """
  lines = load_document(filename).split('\n')
  descriptions = {}

  for line in lines[1:]:
    if not line.strip():
      continue
    image, separator, caption = line.partition(',')
    if not separator:
      # No comma at all: there is no image/caption pair to record.
      continue
    descriptions.setdefault(image, []).append(caption)
  return descriptions

In [None]:
def cleaning_captions(captions):
  """Normalise every caption in place and return the same dict.

  Each caption is lowercased, stripped of all `string.punctuation` characters,
  and reduced to the words that are longer than one character and purely
  alphabetic. Words are re-joined with single spaces.

  Args:
    captions: dict mapping image name -> list of caption strings. The lists
      are modified in place.

  Returns:
    The same `captions` object that was passed in.
  """
  table = str.maketrans('','',string.punctuation)
  for img_captions in captions.values():
    for index, image_caption in enumerate(img_captions):
      # The previous `image_caption.replace("-","")` discarded its result and
      # so did nothing. It is dropped here: the punctuation table below
      # already removes "-" from every word, so the output is unchanged.
      words = (word.lower().translate(table) for word in image_caption.split())
      img_captions[index] = ' '.join(
          word for word in words if len(word) > 1 and word.isalpha())
  return captions

In [None]:
def vocabulary(image_captions):
  """Return the set of distinct whitespace-separated words over all captions.

  Args:
    image_captions: dict mapping image name -> list of caption strings.

  Returns:
    A set containing every word that occurs in any caption.
  """
  return {
      word
      for captions in image_captions.values()
      for caption in captions
      for word in caption.split()
  }

In [None]:
def save_to_file(image_captions,filename):
  """Write captions to a text file, one "image<TAB>caption" pair per line.

  Lines are joined with newlines; no trailing newline is written.

  Args:
    image_captions: dict mapping image name -> list of caption strings.
    filename: Path of the file to (over)write.
  """
  lines = [
      image + '\t' + caption
      for image, captions in image_captions.items()
      for caption in captions
  ]
  # The context manager flushes and closes the file even if write() fails.
  with open(filename,'w') as file:
    file.write("\n".join(lines))

In [None]:
# Build the cleaned caption corpus and persist it for the later cells.
filename = "/content/captions.txt"
image_caption = load_image_captions(filename)
# cleaning_captions modifies its argument in place and returns it, so
# `image_caption` and `image_captions` refer to the same cleaned dict from
# here on. (A bare `image_caption` expression that used to sit here was not
# the cell's last line, displayed nothing, and has been removed.)
image_captions = cleaning_captions(image_caption)
language_vocabulary = vocabulary(image_captions)
save_to_file(image_captions,'image_captions.txt')

In [None]:
def extract_features(file_location):
  """Run every image in a directory through Xception and collect the outputs.

  Each file is resized to 299x299, converted to RGB, scaled from [0, 255] to
  [-1, 1], and passed to the model as a batch of one.

  Args:
    file_location: Directory whose entries are all readable image files.

  Returns:
    dict mapping file name (as returned by os.listdir) -> the array returned
    by model.predict for that image.
  """
  model = Xception(include_top = False,pooling = 'avg')
  features = {}
  for image in os.listdir(file_location):
    # The context manager releases the file handle once the pixels are copied.
    with Image.open(os.path.join(file_location, image)) as img:
      # Forcing RGB keeps grayscale / RGBA / palette files at 3 channels;
      # images that are already RGB are unaffected.
      pixels = np.array(img.resize((299,299)).convert('RGB'))
    batch = np.expand_dims(pixels, axis=0)
    batch = batch/127.5
    batch = batch - 1.0

    features[image] = model.predict(batch)
  return features
      

# Extract features for every file in the dataset directory.
# NOTE(review): this re-runs the model over the whole directory each time the
# cell executes; consider loading "Imagefeatures.p" instead when it exists.
file_location = "/content/Flicker8k_Dataset"
features = extract_features(file_location)

In [None]:
# Persist the extracted features. The context manager closes (and therefore
# flushes) the pickle file instead of leaving the handle open.
with open("Imagefeatures.p","wb") as feature_file:
  dump(features, feature_file)
len(features)

In [None]:
# Preview a few entries instead of rendering the whole dict, which holds one
# array per image and would flood the notebook output.
dict(list(features.items())[:3])

In [None]:
def load_imgNames(description):
    """Return the keys of `description` as a list, in the mapping's own order."""
    return [imgName for imgName in description.keys()]

In [None]:
def load_clean_captions(filename,imgNames):
    """Load cleaned captions for selected images and wrap them in markers.

    Each non-empty line of the file is split on whitespace: the first token is
    the image name, the remaining tokens are the caption. Captions of images
    listed in `imgNames` are stored as "<start> ... <end>".

    Args:
        filename: Path of the file written by save_to_file.
        imgNames: Iterable of image names (hashable) to keep.

    Returns:
        dict mapping image name -> list of wrapped caption strings.
    """
    # Set membership is O(1); testing against the list made the loop
    # quadratic in the number of images.
    wanted = set(imgNames)
    descriptions = {}
    for imgCaption in load_document(filename).split('\n'):
        words = imgCaption.split()
        if not words:
            continue

        imgName, imgCap = words[0], words[1:]
        if imgName in wanted:
            descriptions.setdefault(imgName, []).append(
                "<start> " + " ".join(imgCap) + " <end>")

    return descriptions
    

In [None]:
def load_imgFeatures(imgNames):
  """Load the pickled image features and keep only the requested images.

  Args:
    imgNames: Iterable of image names to select.

  Returns:
    dict mapping each name in `imgNames` to its entry in 'Imagefeatures.p'.
    (Previously the selection was built but never returned, so callers
    always received None.)

  Raises:
    KeyError: if a requested name is missing from the pickle file.
  """
  # pickle.load can execute arbitrary code; only load a file this notebook
  # produced itself.
  with open('Imagefeatures.p','rb') as feature_file:
    img_features = load(feature_file)
  return {k:img_features[k] for k in imgNames}

In [None]:
# Select the images to train on and load their cleaned captions.
# load_image_captions already discards the trailing empty line of the captions
# file, so the former `[:-1]` slice here removed a real image from training.
imgNames = load_imgNames(image_caption)
clean_ImgCaptions = load_clean_captions('/content/image_captions.txt',imgNames)
# NOTE(review): the return value of this call is discarded; the cells below
# keep using the in-memory `features` produced by extract_features.
load_imgFeatures(imgNames)

In [None]:
def list_captions(imgCaptions):
    """Flatten a {image name: [captions]} dict into one list of captions.

    Captions keep the dict's iteration order, image by image.
    """
    return [
        caption
        for captions in imgCaptions.values()
        for caption in captions
    ]

In [None]:
def create_Tokenizer(imgCaptions):
    """Build a Tokenizer and fit it on every caption in `imgCaptions`.

    Args:
        imgCaptions: dict mapping image name -> list of caption strings.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(list_captions(imgCaptions))
    return fitted

In [None]:
tokenizer = create_Tokenizer(clean_ImgCaptions)
# Close the pickle file deterministically instead of leaking the handle.
with open('tokens.p','wb') as token_file:
    dump(tokenizer, token_file)
# +1 because the model reserves index 0 (used for padding, see mask_zero in
# define_model) in addition to the indices in word_index.
vocabulary_size = len(tokenizer.word_index) + 1

In [None]:
# Display the vocabulary size (len(tokenizer.word_index) + 1) computed above.
vocabulary_size

In [None]:
def max_length(imgCaptions):
    """Return the word count of the longest caption in `imgCaptions`.

    Words are counted by whitespace splitting. Raises ValueError when there
    are no captions at all.
    """
    word_counts = [len(caption.split()) for caption in list_captions(imgCaptions)]
    return max(word_counts)

# Measure the captions that are actually fed to the model: clean_ImgCaptions
# carries the extra <start>/<end> markers, whereas `image_caption` does not.
# Using the shorter length made pad_sequences truncate the longest training
# prefixes (dropping their leading start token).
maxlength = max_length(clean_ImgCaptions)
maxlength

In [None]:
def create_sequences(tokenizer, max_length, img_captions, feature, vocab_size=None):
    """Expand one image's captions into (feature, prefix, next-word) samples.

    Every caption is encoded with `tokenizer`; for each position i >= 1 one
    sample is produced whose input is the first i tokens (padded to
    `max_length`) and whose target is token i, one-hot encoded.

    Args:
        tokenizer: Fitted tokenizer providing texts_to_sequences/word_index.
        max_length: Length the input prefixes are padded to.
        img_captions: List of caption strings for a single image.
        feature: Feature vector of that image, repeated for every sample.
        vocab_size: Number of classes for the one-hot targets. Defaults to
            len(tokenizer.word_index) + 1, so the function no longer depends
            on a notebook-level `vocabulary_size` variable being defined.

    Returns:
        Tuple of three numpy arrays: image features, padded input sequences,
        one-hot target words (one row per sample).
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()
    for img_caption in img_captions:
        seq = tokenizer.texts_to_sequences([img_caption])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

def data_generator(imgCaptions, features, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Args:
        imgCaptions: dict mapping image name -> list of caption strings.
        features: dict mapping image name -> feature array; element [0] of
            each entry is used as the image vector.
        tokenizer: Fitted tokenizer passed on to create_sequences.
        max_length: Padding length passed on to create_sequences.

    Yields:
        ([image_features, input_sequences], target_words) for one image. The
        outer container is a tuple because Keras `fit` expects generators to
        produce (inputs, targets) tuples; a list can be misread as inputs only.
    """
    while True:
        for key, img_captions in imgCaptions.items():
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, img_captions, feature)
            yield ([input_image, input_sequence], output_word)

In [None]:
# Sanity check: pull one batch from the generator and show the shapes of the
# image-feature array (a), the padded input sequences (b) and the targets (c).
[a,b],c = next(data_generator(clean_ImgCaptions, features, tokenizer, maxlength))
a.shape, b.shape, c.shape

In [None]:
def define_model(vocabulary_size, max_length):
    """Build and compile the two-input captioning model.

    One branch processes a 2048-wide image feature vector, the other embeds a
    token sequence of length `max_length` and runs it through an LSTM. The
    branches are concatenated and decoded into a softmax over the vocabulary.
    The summary is printed and a diagram is written to 'model.png'.

    Args:
        vocabulary_size: Size of the embedding input and of the output layer.
        max_length: Length of the padded input sequences.

    Returns:
        The compiled keras Model.
    """
    # Image feature branch.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    fe3 = Dense(128, activation='relu')(fe2)

    # Caption prefix branch; mask_zero makes the layers skip padding (id 0).
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocabulary_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.4)(se1)
    se3 = LSTM(256)(se2)

    # Decoder over the merged branches.
    decoder1 = concatenate([fe3, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    decoder3 = Dense(128, activation='relu')(decoder2)
    outputs = Dense(vocabulary_size, activation='softmax')(decoder3)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword and
    # ensures the 0.002 value is actually applied.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.002))
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (which added a stray "None" line).
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
# Build the model and train it for `epochs` passes over the captions.
model = define_model(vocabulary_size, maxlength)
epochs = 50
# data_generator yields one batch per image, so one pass over the data takes
# as many steps as there are images with captions.
steps = len(clean_ImgCaptions)
for i in range(epochs):
    # A fresh generator per pass makes every fit() call start at the first image.
    generator = data_generator(clean_ImgCaptions, features, tokenizer, maxlength)
    model.fit(generator, epochs=1, steps_per_epoch= steps, verbose=1)

In [None]:
def word_for_id(integer, tokenizer):
  """Return the first word whose index in tokenizer.word_index equals `integer`.

  Returns None when no word has that index.
  """
  matches = (word for word, index in tokenizer.word_index.items() if index == integer)
  return next(matches, None)

In [None]:
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for one image.

    Starting from the text 'start', the current text is encoded and padded,
    the model predicts the next word, and the word with the highest score is
    appended. Generation stops after `max_length` steps, when the predicted
    id has no word, or once 'end' has been appended.

    Args:
        model: Trained captioning model taking [photo, sequence].
        tokenizer: Fitted tokenizer used for encoding and id lookup.
        photo: Image feature input passed to the model unchanged.
        max_length: Padding length and maximum number of generated words.

    Returns:
        The generated text, including the leading 'start'.
    """
    words = ['start']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo,padded], verbose=0)
        next_word = word_for_id(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'end':
            break
    return ' '.join(words)

In [None]:
# Reload the tokenizer saved earlier. pickle.load can execute arbitrary code,
# so only load a file this notebook wrote itself. The context manager closes
# the handle instead of leaking it.
with open("tokens.p","rb") as token_file:
    tokenizer = load(token_file)

In [None]:
def extract_features(filename, model):
    """Compute the feature array of a single image file with `model`.

    The image is resized to 299x299, converted to RGB, scaled from [0, 255]
    to [-1, 1] and passed to model.predict as a batch of one.

    NOTE(review): this definition reuses the name of the earlier
    extract_features(file_location) and replaces it once this cell has run.

    Args:
        filename: Path of the image file.
        model: Feature extractor exposing predict().

    Returns:
        The array returned by model.predict for the image.

    Raises:
        OSError: if the file is missing or cannot be read as an image. The
            hint is still printed first; previously execution continued and
            failed later with an unrelated unbound-variable error.
    """
    try:
        source = Image.open(filename)
    except OSError:
        print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        raise
    with source:
        # convert('RGB') drops the alpha channel of 4-channel images and also
        # expands grayscale/palette images, which used to crash on shape[2].
        image = np.array(source.resize((299,299)).convert('RGB'))
    image = np.expand_dims(image, axis=0)
    image = image/127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature

In [None]:
# Feature extractor used at inference time, built with the same arguments
# (include_top=False, pooling="avg") as the one used for the training features.
xception_model = Xception(include_top=False, pooling="avg")

In [None]:
# Caption up to the first 100 files of the dataset directory and show each
# image together with its generated description.
filePath = '/content/Flicker8k_Dataset'
# The directory is listed once instead of on every iteration, and slicing
# copes with directories that hold fewer than 100 files.
for image_name in os.listdir(filePath)[:100]:
  filename = os.path.join(filePath, image_name)
  img_feature = extract_features(filename,xception_model)

  img_description = generate_desc(model,tokenizer,img_feature,maxlength)
  # Close the image file once it has been drawn.
  with Image.open(filename) as image:
    plt.imshow(image)
    plt.show()
  print(img_description)