In [1]:
import os

In [2]:
def load_doc(filename):

    # loading the file and returing the data of file 

    file = open(filename,'r')
    text = file.read()
    text = text.split("\n")
    file.close()
    return text 

In [3]:
text = load_doc("./Flickr8k_text/Flickr8k.token.txt")
text[:5]

['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse .',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a wooden cabin .']

In [4]:
def caption_grouping(captions):

    # returns a dictionary 
    # key contains image name 
    # values contains 5 captions of the img respectively 
 
    img_captions = dict()
    captions = captions[:-1]
    temp = []
    for cap in captions:
        if len(cap)<20:
            continue
        else:
            temp_key = cap.split('#')
            key = temp_key[0]
            temp_caption = temp_key[-1].split('\t')
            if temp_caption[0]=='4':
                temp.append(temp_caption[-1])
                img_captions[key]=temp
                temp=[]
            else:
                temp.append(temp_caption[-1])
    return img_captions

In [5]:
img_captions = caption_grouping(text)
list(img_captions.items())[:2]

[('1000268201_693b08cb0e.jpg',
  ['A child in a pink dress is climbing up a set of stairs in an entry way .',
   'A girl going into a wooden building .',
   'A little girl climbing into a wooden playhouse .',
   'A little girl climbing the stairs to her playhouse .',
   'A little girl in a pink dress going into a wooden cabin .']),
 ('1001773457_577c3a7d70.jpg',
  ['A black dog and a spotted dog are fighting',
   'A black dog and a tri-colored dog playing with each other on the road .',
   'A black dog and a white dog with brown spots are staring at each other in the street .',
   'Two dogs of different breeds looking at each other on the road .',
   'Two dogs on pavement moving toward each other .'])]

In [6]:
import re 

In [7]:
def caption_preprocessing(img_captions):

    # caption in the dictionary are processed and updated dictionay is returned 

    maxlen = 0
    for key,val in img_captions.items():
        temp=[]
        for sent in val:
            prepro = sent.lower()
            prepro = re.sub('[^a-z0-9 ]','',prepro)
            prepro = [ word for word in prepro.split() if len(word)>1 ]
            prepro = [ word for word in prepro if (word.isalpha())]
            if len(prepro)>maxlen:
                maxlen = len(prepro)
            prepro = ' '.join(prepro)
            temp.append(prepro)
            
        img_captions[key] = temp
    return (img_captions,maxlen)


In [8]:
description , max_length= caption_preprocessing(img_captions)
max_length


32

In [9]:
def text_vocabulary(description):

    # returns the vocabulary of captions of all images 

    vocab = set()
    for key in description.keys():
        for d in description[key]:
            vocab.update(d.split())
    return vocab

In [10]:
vocab = text_vocabulary(description)
vocab_size = len(vocab)
vocab_size

8763

In [11]:
from pickle import dump, load

In [12]:
def save_description(description,filename):

    # saves the processed caption in a txt document 

    lines=list()
    for key , val in description.items():
        for cap in val:
            lines.append(key+"\t"+cap)
    data = '\n'.join(lines)
    file = open("./"+filename,"w")
    file.write(data)
    file.close()

In [13]:
save_description(description,"caption_description.txt")

In [14]:
from tensorflow.keras.applications.xception import Xception , preprocess_input
from PIL import Image
import numpy as np
from tqdm import tqdm 
from pickle import dump , load

In [15]:
img_dataset_dir = "./Flickr8k_Dataset/Flicker8k_Dataset"

In [16]:
def extract_features(directory):
    model = Xception(include_top=False, pooling = 'avg')
    features = dict()
    for img_name in tqdm(os.listdir(directory)):
        img = Image.open(directory+'/'+img_name)
        img = img.resize((299,299))
        img = np.expand_dims(img, axis=0)
        img = img/127.5
        img = img-1

        feature = model.predict(img)
        features[img_name]=feature
    return features

In [186]:
features = extract_features(img_dataset_dir)
dump(features,open("features.p",'wb'))

100%|██████████| 8091/8091 [15:45<00:00,  8.55it/s]


In [17]:
features = load(open("features.p","rb"))

In [18]:
def load_photos(filename):
    file = load_doc(filename)
    photos = file[:-1]
    return photos

In [19]:
train_images_path = "./Flickr8k_text/Flickr_8k.trainImages.txt"
description_path = "./caption_description.txt"



In [20]:

train_images = load_photos(train_images_path)
train_images[:5]

['2513260012_03d33305cf.jpg',
 '2903617548_d3e38d7f88.jpg',
 '3338291921_fe7ae0c8f8.jpg',
 '488416045_1c6d903fe0.jpg',
 '2644326817_8f45080b87.jpg']

In [21]:
def load_cleaned_descriptions(path,photos):
    file = open(path,'r')
    text = file.read()
    file.close()
    text = text.split("\n")
    descriptions={}
    for sent in text:
        sent = sent.split('\t')
        img = sent[0]
        if img in photos:
            if img not in descriptions:
                descriptions[img] = []
            des = "<start> "+sent[-1]+" <end>"
            descriptions[img].append(des)
    return descriptions

In [22]:
train_img_desc = load_cleaned_descriptions(description_path,train_images)
print(list(train_img_desc.items())[:3])

# these are the images which are present in training images but not present in descriptions  
for key in train_images:
    if key not in train_img_desc.keys():
        print(key)

[('1000268201_693b08cb0e.jpg', ['<start> child in pink dress is climbing up set of stairs in an entry way <end>', '<start> girl going into wooden building <end>', '<start> little girl climbing into wooden playhouse <end>', '<start> little girl climbing the stairs to her playhouse <end>', '<start> little girl in pink dress going into wooden cabin <end>']), ('1001773457_577c3a7d70.jpg', ['<start> black dog and spotted dog are fighting <end>', '<start> black dog and tricolored dog playing with each other on the road <end>', '<start> black dog and white dog with brown spots are staring at each other in the street <end>', '<start> two dogs of different breeds looking at each other on the road <end>', '<start> two dogs on pavement moving toward each other <end>']), ('1002674143_1b742ab4b8.jpg', ['<start> little girl covered in paint sits in front of painted rainbow with her hands in bowl <end>', '<start> little girl is sitting in front of large painted rainbow <end>', '<start> small girl in 

In [23]:
def load_training_features(path,photos):
    features = load(open(path, 'rb'))
    required_features = {k:features[k] for k in photos if k not in ["2837799692_2f1c50722a.jpg", "3273625566_2454f1556b.jpg","2833582518_074bef3ed6.jpg","2924483864_cfdb900a13.jpg"]}
    return required_features

In [24]:
p_filepath = "./features.p"
training_features = load_training_features(p_filepath,train_images)
print(len(training_features))
list(training_features.items())[0]

5996


('2513260012_03d33305cf.jpg',
 array([[0.        , 0.44815865, 0.        , ..., 0.1775745 , 0.00191514,
         0.00869677]], dtype=float32))

In [25]:
def dict_to_list(dic):
    ans = []
    for i in dic.keys():
        [ans.append(cap) for cap in dic[i]]
    return ans

def creat_token(desc):
    desc_list = dict_to_list(desc)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
dump(tokenizer, open("tokenizer.p",'wb'))
tokenizer = creat_token(train_img_desc)
token_length = len(tokenizer.word_index) + 1
print(token_length)

7576


In [27]:
def data_generator(descriptions, features, tokenizer, max_length ):
    while 1:
        for key , desc_list in descriptions.items():
            feature = features[key][0]
            input_img, input_seq, output_word = creat_sequences(tokenizer , max_length, desc_list, feature)
            yield ([input_img,input_seq], output_word)

def creat_sequences(tokenizer, max_length,desc_list,feature):
    x1 , x2 , y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1,len(seq)):
            in_seq ,out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq],maxlen=max_length)[0]
            out_seq = to_categorical([out_seq],num_classes=vocab_size)[0]
            x1.append(feature)
            x2.append(in_seq)
            y.append(out_seq)
    return np.array(x1),np.array(x2),np.array(y)
   

In [28]:
[[a, b], c] = next(data_generator(train_img_desc, training_features, tokenizer, max_length))

In [29]:
a.shape,b.shape,c.shape

((47, 2048), (47, 32), (47, 8763))

In [32]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Input
from tensorflow.keras.utils import plot_model
from keras.layers.merge import add
from tensorflow.keras.models import Model, load_model

# define the captioning model
def define_model(vocab_size, max_length):

    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    print(inputs1,"-----",fe1,"------",fe2)

    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    

    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model

    plot_model(model, to_file='./model.png', show_shapes=True)

    return model

In [33]:
print('Dataset: ', len(train_images))
print('Descriptions: train=', len(train_img_desc))
print('Photos: train=', len(training_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

Dataset:  6000
Descriptions: train= 5996
Photos: train= 5996
Vocabulary Size: 8763
Description Length:  32


In [34]:
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_img_desc)
# making a directory models to save our models
os.mkdir("models")
for i in range(epochs):
    generator = data_generator(train_img_desc, training_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")

KerasTensor(type_spec=TensorSpec(shape=(None, 2048), dtype=tf.float32, name='input_1'), name='input_1', description="created by layer 'input_1'") ----- KerasTensor(type_spec=TensorSpec(shape=(None, 2048), dtype=tf.float32, name=None), name='dropout/Identity:0', description="created by layer 'dropout'") ------ KerasTensor(type_spec=TensorSpec(shape=(None, 256), dtype=tf.float32, name=None), name='dense/Relu:0', description="created by layer 'dense'")
KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name='input_2'), name='input_2', description="created by layer 'input_2'") ------- KerasTensor(type_spec=TensorSpec(shape=(None, 32, 256), dtype=tf.float32, name=None), name='embedding/embedding_lookup/Identity_1:0', description="created by layer 'embedding'") ------ KerasTensor(type_spec=TensorSpec(shape=(None, 32, 256), dtype=tf.float32, name=None), name='dropout_1/Identity:0', description="created by layer 'dropout_1'") ----- KerasTensor(type_spec=TensorSpec(shape=(None







