In [40]:
# Import library

import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import pandas as pd

from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [41]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [42]:
# Small library for seeing the progress of loops

from tqdm.notebook import tqdm
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [43]:
# Data Cleaning

# Loading a text file into memory
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

In [44]:
# Get all images with their captions

def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [45]:
# Data cleaning - lower casing, removing puntuations, and words containing numbers

def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc = img_caption.split()
            #converts to lowercase
            desc = [word.lower() for word in desc]
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            #convert back to string
            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

In [46]:
# Build vocabulary of all unique words

def text_vocabulary(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

In [47]:
#All descriptions in one file 

def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [49]:
# Set the path according to project folder

dataset_text = r"C:\Users\ASUS\Anaconda3\envs\FAIEnv\python-project-image-caption-generator\Flickr8k_text"
dataset_images = r"C:\Users\ASUS\Anaconda3\envs\FAIEnv\python-project-image-caption-generator\Flicker8k_Dataset"

In [50]:
# Prepare our text data

filename = dataset_text + "/" + "Flickr8k.token.txt"

In [60]:
descriptions = all_img_captions(filename)
print("Length of descriptions =",len(descriptions))

Length of descriptions = 8092


In [63]:
# Cleaning the descriptions

clean_descriptions = cleaning_text(descriptions)

In [64]:
# Building vocabulary

vocabulary = text_vocabulary(clean_descriptions)

In [65]:
print("Length of vocabulary =",len(vocabulary))

Length of vocabulary = 10908


In [66]:
# Saving each description to file

save_descriptions(clean_descriptions,"my_descriptions.txt")

In [67]:
# Extracting the feature vector from all images

def extract_features(directory):
        model = Xception( include_top=False, pooling='avg' )
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            #image = preprocess_input(image)
            image = image/127.5
            image = image - 1.0
            feature = model.predict(image)
            features[img] = feature
        return features

In [68]:
#2048 feature vector

features = extract_features(dataset_images)

HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))




In [69]:
dump(features, open("features.p","wb"))

In [70]:
features = load(open("features.p","rb"))

In [71]:
# Loading dataset for training the model

def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

def load_clean_descriptions(filename, photos): 
    #loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words)<1 :
            continue
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)
    return descriptions

def load_features(photos):
    #loading all features
    all_features = load(open("features.p","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [72]:
filename2 = dataset_text + "/" + "Flickr_8k.trainImages.txt"

In [73]:
train_imgs = load_photos(filename2)

In [75]:
train_descriptions = load_clean_descriptions("my_descriptions.txt", train_imgs)

In [76]:
train_features = load_features(train_imgs)

In [77]:
# Keras library for tokenizer function

from keras.preprocessing.text import Tokenizer

In [78]:
# converting dictionary to clean list of descriptions

def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [80]:
# creating tokenizer class 
# this will vectorise text corpus
# each integer will represent token in dictionary

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [81]:
# give each word an index, and store that into my_tokenizer.p pickle file

tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('my_tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1

In [82]:
vocab_size

9501

In [83]:
# calculate maximum length of descriptions

def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)
    
max_length = max_length(descriptions)

In [84]:
max_length

34

In [85]:
# Create Data Generator

# create input-output sequence pairs from the image description

# data generator, used by model.fit_generator()

def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield [[input_image, input_sequence], output_word]
            
def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [86]:
# You can check the shape of the input and output for your model

[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))

In [87]:
a.shape, b.shape, c.shape

((51, 2048), (51, 34), (51, 9501))

In [88]:
# Defining the CNN-RNN model

from keras.utils import plot_model

# Define the captioning model
def define_model(vocab_size, max_length):
    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summarize model
    print(model.summary())
    plot_model(model, to_file='my_model.png', show_shapes=True)
    return model

In [89]:
# Train the model

print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 9501
Description Length:  34


In [90]:
!pip install pydot



In [91]:
import keras
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
keras.utils.vis_utils.pydot = pyd

In [92]:
!pip install graphviz



In [93]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils.vis_utils import plot_model

In [94]:
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 256)      2432256     input_3[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 2048)         0           input_2[0][0]                    
____________________________________________________________________________________________

In [None]:
# Making a directory models to save our models

os.mkdir("new_models")
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("new_models/model_" + str(i) + ".h5")

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
 563/6000 [=>............................] - ETA: 1:48:41 - loss: 9.16 - ETA: 1:12:52 - loss: 9.14 - ETA: 1:04:05 - loss: 9.11 - ETA: 57:28 - loss: 9.0832 - ETA: 54:12 - loss: 9.06 - ETA: 51:44 - loss: 9.00 - ETA: 49:35 - loss: 8.94 - ETA: 48:13 - loss: 8.85 - ETA: 47:50 - loss: 8.72 - ETA: 46:35 - loss: 8.60 - ETA: 45:33 - loss: 8.49 - ETA: 45:08 - loss: 8.42 - ETA: 44:26 - loss: 8.54 - ETA: 43:38 - loss: 8.46 - ETA: 43:21 - loss: 8.35 - ETA: 42:54 - loss: 8.23 - ETA: 42:38 - loss: 8.22 - ETA: 42:12 - loss: 8.10 - ETA: 41:46 - loss: 8.02 - ETA: 41:26 - loss: 7.90 - ETA: 41:14 - loss: 7.86 - ETA: 41:08 - loss: 7.76 - ETA: 40:56 - loss: 7.70 - ETA: 40:53 - loss: 7.63 - ETA: 40:52 - loss: 7.57 - ETA: 40:44 - loss: 7.53 - ETA: 40:40 - loss: 7.50 - ETA: 40:28 - loss: 7.47 - ETA: 40:25 - loss: 7.44 - ETA: 40:35 - loss: 7.42 - ETA: 40:23 - loss: 7.35 - ETA: 40:18 - loss: 7.34 - ETA: 40:07 - loss: 7.31 - ETA: 40:02 - loss: 7.29 - ETA: 39:44 - loss: 7.29 - ETA: 39:39 - loss: 7.27 - E