In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from pickle import dump, load
import string
import os
from os import listdir
import glob
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import load_img, img_to_array
from keras.utils import plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import add
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Model, load_model
from tqdm.notebook import tqdm
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from nltk.translate.bleu_score import corpus_bleu

In [20]:
def readFile (path):
    with open(path, encoding="utf8") as file:
        data = file.read()
    return data

#mapping image with the captions
def mapping(captions):
  description={}
  for caption in captions.split('\n'):
    if len(caption)<2:
      continue
    splitted_words = caption.split()
    image_id, image_caption = splitted_words[0], splitted_words[1:]
    #getting image id without extension
    image_id = image_id.split('.')[0]
    image_caption = ' '.join(image_caption)

    if image_id not in description:
        description[image_id] = []

    description[image_id].append(image_caption)

  return description
  

#cleaning the text file such as removing punctuation, coverting every words to lowercase
def clean_data(captions):
   table = str.maketrans('','', string.punctuation)
   for image,caption in captions.items():
     for i,img_caption in enumerate(caption):
       
       img_caption.replace("-"," ")
       description = img_caption.split()

       description = [word.lower() for word in description]
       description = [word.translate(table) for word in description]
       description = [word for word in description if(len(word)>1)]  
       description = [word for word in description if(word.isalpha())]
       caption = ' '.join(description)
       captions[image][i]= caption
       
   return captions
   
#building descriptions into vocabulary of words
def to_vocab(descriptions):
    
    all_desc = set()
    
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    
    return all_desc

#saving all the descriptions in one file
def save_data(descriptions, filename):
    lines = list()
    for key, description_list in descriptions.items():
        for description in description_list:
            lines.append(key + '\t' + description )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [23]:
data = readFile ("Flickr30k_Dataset/data/30k_captions.txt")
#loading the caption data

# text_doc = load_doc(dataset_text)

#mapping captions to their image
captions = mapping(data)
print('Description: %d ' % len(captions))

#cleaning the descriptions
processed_description = clean_data(captions)
print('Processed Description: %d ' % len(processed_description))
#building vocabulary
vocabs = to_vocab(processed_description)
print('Vocabulary: %d' % len(vocabs))

#saving description to file
save_data(processed_description, "Flickr30k_Dataset/data/descriptions.txt")


Description: 31783 
Processed Description: 31783 
Vocabulary: 19735


In [24]:
total_words = []

for key, description_list in processed_description.items():
  for description in description_list:
    for i in description.split():
      total_words.append(i)

print("Total Words = %d" %len(total_words))

Total Words = 1672394


In [25]:
def extract_features(directory):
  model = Xception( include_top=False, pooling='avg' )
  #model.summary()
  features = {}
  for image_path in listdir(directory):
     filename = directory  + '/' + image_path
     image = Image.open(filename)
     image = image.resize((299,299))
     image = np.expand_dims(image, axis=0)
     #image = preprocess_input(image)
     image = image/127.5
     image = image - 1.0
     feature = model.predict(image,verbose=0)
     image_id = image_path.split('.')[0]

     # Storing the feature mapping to the image
     features[image_id] = feature
     
  return features

In [16]:
directory = "flickr30k_images"
features = extract_features(directory)
print('Extracted Features: ', len(features))
#dump features in pickle file
dump(features, open("features.pkl","wb"))

In [28]:
features = load(open("Flickr30k_Dataset/features.pkl","rb"))

In [30]:
# load the file 
def load_file(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# loading list of photos
def load_photo_identifiers(filename):
    
    # Loading the file containing the list of photos
    file = load_file(filename)
  # Creating a list for storing the photos
    photos = list()

    for line in file.split('\n'):
        if len(line) < 1:
            continue

        identifier = line.split('.')[0]
        photos.append(identifier)
        
    # Returning the set of photos created
    return set(photos)

def load_clean_descriptions(filename, photos): 
    file = load_file(filename)
    descriptions = {}

    for line in file.split("\n"):
        words = line.split()

        if len(words)<1 :
            continue

        image, image_caption = words[0], words[1:]

        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            description = 'startseq ' + " ".join(image_caption) + ' endseq'
            descriptions[image].append(description)

    return descriptions

def load_features(photos):
    #loading all features
    all_features = load(open("Flickr30k_Dataset/features.pkl","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features


filename = "Flickr30k_Dataset/data/flickr30k_train.txt"

#train = loading_data(filename)
train_imgs = load_photo_identifiers(filename)
train_descriptions = load_clean_descriptions("Flickr30k_Dataset/data/descriptions.txt", train_imgs)
print(len(train_descriptions))
train_features = load_features(train_imgs)

29000


In [31]:
# converting dictionary of cleaned descriptions to the list 
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc


def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open("Flickr30k_Dataset/data/tokenizer.pkl", 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

19063

In [32]:
def maximum_length(descriptions):
    lines = dict_to_list(descriptions)
    return max(len(d.split()) for d in lines)

max_length = maximum_length(train_descriptions)
max_length

74

In [33]:
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ([input_image, input_sequence], output_word)

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape

((52, 2048), (52, 74), (52, 19063))

In [34]:
def define_model(vocab_size, max_length):
    
    # feature extractor model
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
     # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    
    return model

In [35]:
epochs = 5
steps = len(train_descriptions)
model = define_model(vocab_size, max_length)
for i in range(epochs):
    # create data generator
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("Flickr30k_Dataset" + "/model.h5")

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 74)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 74, 256)      4880128     ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_1[0][0]']                
                                                                                              

KeyboardInterrupt: 

In [None]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        #padding the input
        sequence = pad_sequences([sequence], maxlen=max_length)
        prob = model.predict([photo,sequence], verbose=0)
        prob = np.argmax(prob)
        word = word_for_id(prob, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text


def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    for key, desc_list in descriptions.items():
        prediction = generate_desc(model, tokenizer, photos[key], max_length)
        actual_desc = [d.split() for d in desc_list]
        actual.append(actual_desc)
        predicted.append(prediction.split())

    print('BLEU-1: ', corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: ', corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: ', corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: ', corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [None]:
filename = "Flickr30k_Dataset/data/flickr30k_test.txt"
test = load_photo_identifiers(filename)
print('Dataset: ', len(test))
test_descriptions = load_clean_descriptions("Flickr30k_Dataset/data/descriptions.txt", test)
print('Descriptions: test=', len(test_descriptions))
test_features = load_features(test)
print('Photos: test=', len(test_features))


In [None]:
filename = '/content/drive/MyDrive/Image-Caption-Generator/model.h5'
model = load_model(filename)

In [None]:
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

In [None]:
tokenizer = load(open("Flickr30k_Dataset/data/tokenizer.pkl", 'rb'))
max_length = maximum_length(train_descriptions)
max_length

In [None]:
def extract_features(filename, model):
        try:
            image = Image.open(filename)
            
        except:
            print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        image = image.resize((299,299))
        image = np.array(image)
        # for images that has 4 channels, we convert them into 3 channels
        if image.shape[2] == 4: 
            image = image[..., :3]
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        feature = model.predict(image)
        return feature

def word_for_id(integer, tokenizer):
 for word, index in tokenizer.word_index.items():
     if index == integer:
         return word
 return None


def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo,sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

In [None]:
model = load_model('/content/drive/MyDrive/Image-Caption-Generator/model.h5')
path = 'Flickr30k_Dataset/flickr30k_images/301246.jpg'
xception_model = Xception(include_top=False, pooling="avg")
photo = extract_features(path, xception_model)
img = Image.open(path)
plt.imshow(img)
description = generate_desc(model, tokenizer, photo, max_length)
print(description)
