# Importing the packages

In [1]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import tensorflow as tf

In [2]:
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers.merge import add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [3]:
# small library for seeing the progress of loops
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


# Getting and Cleaning the data

In [4]:
# Loading a text file into memory
def load_doc(filename):
    # opening the file as read only
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [5]:
# Get all images with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [6]:
# Data Cleaning - lower casing, removing punctuations and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace("-", " ")
            desc = img_caption.split()
            
            # converts to lowercase
            desc = [word.lower() for word in desc]
            
            # remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            
            # remove hanging 's and a
            desc = [word for word in desc if (len(word)>1)]
            
            # remove tokens with numbers in them
            desc = [word for word in desc if (word.isalpha())]
            
            # convert back to string
            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
        return captions

In [7]:
# Creating bag of words
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()
    
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    
    return vocab

In [8]:
# All descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key+'\t'+desc)
    data = "\n".join(lines)
    file = open(filename, "w")
    file.write(data)
    file.close()

In [9]:
# Set these path according to project folder in your system
dataset_text = "C:\\Users\\Tushar\\Desktop\\DL Projects\\ImageCaptionGenerator\\Flickr8k_text"
dataset_images = "C:\\Users\\Tushar\\Desktop\\DL Projects\\ImageCaptionGenerator\\Flicker8k_Dataset"

# Prepare text data
filename = dataset_text+"/"+"Flickr8k.token.txt"
# loading the file that contains all data

In [10]:
# mapping them into description dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =",len(descriptions))

Length of descriptions = 8092


In [None]:
# cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

In [None]:
# building vocabulary (Bag of words)
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary =",len(vocabulary))

In [None]:
# save each description to file
save_descriptions(clean_descriptions, "descriptions.txt")

# Extracting the feature vector from all images

In [None]:
def extract_features(directory):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = directory+"/"+img
        image = Image.open(filename)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        # image = preprocess_input(image)
        image = image/127.5
        image = image - 1.0
        
        feature = model.predict(image)
        features[img] = feature
    return features

In [None]:
# 2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features.p", "wb"))

In [34]:
features = load(open("features.p", "rb"))

# Loading the dataset for Training the model 

In [11]:
# load the data
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

In [12]:
def load_clean_descriptions(filename, photos):
    # loading clean descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words)<1:
            continue
        
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> '+" ".join(image_caption)+" <end>"
            descriptions[image].append(desc)
    return descriptions

In [13]:
def load_features(photos):
    # loading all features
    all_features = load(open("features.p", "rb"))
    # selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [14]:
filename = dataset_text+"/"+"Flickr_8k.trainImages.txt"

In [15]:
# train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

# Tokenizing the vocabulary

In [16]:
# converting dictionary to clean the list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [17]:
# creating tokenizer class
# this will vectorise text corpus
# each integer will represent token in dictionary

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [18]:
# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1

In [19]:
vocab_size

7577

In [20]:
# caluclate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

38

# Create data generator

In [21]:
# create input-output sequene pairs from the image description

# data generator, used by model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            # retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield[[input_image, input_sequence], output_word]

In [28]:
def create_sequences(tokenizer, max_length, desc_list, feature):
    x1, x2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple x, y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_Seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
            # store
            x1.append(feature)
            x2.append(in_seq)
            y.append(out_seq)
    return np.array(x1), np.array(x2), np.array(y)

# Defining the CNN - RNN model

In [23]:
from tensorflow.keras.utils import plot_model

In [29]:
def define_model(vocab_size, max_length):
    
    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.)(se1)
    se3 = LSTM(256)(se2)
    
    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    
    return model

# Training the model

In [35]:
print('Dataset :',len(train_imgs))
print('Descriptions : train=', len(train_descriptions))
print('Photos : train=', len(train_features))
print('Vocabulary Size :',vocab_size)
print('Description Length :',max_length)

# train our model
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

# making a directory models to save our models
os.mkdir("models")
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("models/model_"+str(i)+".h5")

Dataset : 6000
Descriptions : train= 6000
Photos : train= 6000
Vocabulary Size : 7577
Description Length : 38
Model: "functional_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           [(None, 38)]         0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 38, 256)      1939712     input_14[0][0]                   
__________________________________________________________________________________________________
dropout_12 (Dropout)            (None, 2048)         0           input_13[0

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

# Testing the model

In [37]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

In [38]:
def extract_features(filename, model):
    try:
        image = Image.open(filename)
    except:
        print("Error occurred!")
    image = image.resize((299, 299))
    image = np.array(image)
    # if image has 4 channels, convert them to 3 channels
    if image.shape[2]==4:
        image = image[..., :3]
    image = np.expand_dims(image, axis=0)
    image = image/127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature

In [39]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [40]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen = max_length)
        pred = model.predict([photo, sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += ' '+word
        if word=='end':
            break
        return in_text

In [45]:
max_length = 38
tokenizer = load(open("tokenizer.p", "rb"))
model = load_model('model.h5')
xception_model = Xception(include_top = False, pooling="avg")

In [46]:
img_path = 'C:\\Users\\Tushar\\Desktop\\DL Projects\\ImageCaptionGenerator\\test_data\\3.png'
photo = extract_features(img_path, xception_model)
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

start two
