In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load data

In [None]:
def load_data(filename):
    """Return the complete contents of the text file at ``filename``.

    The file is opened in text mode with the platform default encoding and is
    closed again before the function returns.
    """
    with open(filename) as handle:
        return handle.read()

# Read the raw Flickr8k caption file into one string.
data = load_data('../input/flicker8k-image-captioning/Flickr8k_text/Flickr8k.token.txt')
# Preview the first ten raw lines (shown as the cell output).
data.split("\n")[:10]

**Extract description**

In [None]:
def extract_desc(data):
    """Group captions by image id.

    Every usable line of ``data`` consists of at least two TAB-separated
    fields. The image id is the part of the first field before its first
    ``.``; the caption is the second field.

    Lines without a TAB (blank lines, a trailing newline at the end of the
    file, ...) are skipped. Previously such lines were swallowed by a bare
    ``except`` and the values parsed from the preceding line were appended a
    second time, or a ``NameError`` was raised when the very first line was
    malformed.

    Args:
        data: Full text of the caption file.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    mapping = {}
    for line in data.split('\n'):
        fields = line.split('\t')
        if len(fields) < 2:
            # Not an "<image>TAB<caption>" record; nothing to extract.
            continue
        name = fields[0].split('.')[0]
        mapping.setdefault(name, []).append(fields[1])
    return mapping
    
# Try the parser on the first ten lines of the caption file only.
descriptions = extract_desc("\n".join(data.split("\n")[:10]))
# Display the resulting id -> captions mapping.
descriptions

**clean descriptions**

In [None]:
def clean_desc(descriptions):
    """Normalise every caption of ``descriptions`` in place.

    For each caption: drop every character that is not an ASCII letter, a
    digit or a space, lower-case the rest, discard words of one character or
    fewer, and re-join the remaining words with single spaces.

    The caption lists inside ``descriptions`` are modified directly; the
    function itself returns ``None``.
    """
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            stripped = re.sub("[^a-zA-Z0-9 ]", "", caption).lower()
            kept = [token for token in stripped.split() if len(token) > 1]
            captions[idx] = " ".join(kept)

# Re-extract the ten-line sample, then clean it. clean_desc edits the caption
# lists in place and returns nothing, so `descriptions` itself is displayed.
descriptions = extract_desc("\n".join(data.split("\n")[:10]))
clean_desc(descriptions)
descriptions

**create vocabulary**

In [None]:
def create_vocabulary(descriptions):
    """Build the word set and a fitted tokenizer for all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        A ``(vocab, tokenizer)`` pair: ``vocab`` is the set of
        whitespace-separated words found in the captions, ``tokenizer`` is a
        ``Tokenizer`` fitted on the list of all captions.
    """
    lines = [caption for captions in descriptions.values() for caption in captions]
    vocab = {word for caption in lines for word in caption.split()}
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return vocab, tokenizer
    
# Build the vocabulary and tokenizer from the cleaned ten-line sample.
vocabulary, tokenizer = create_vocabulary(descriptions)
print(vocabulary)

In [None]:
# Show the integer encoding the fitted tokenizer produces for one sample caption.
tokenizer.texts_to_sequences(['child in pink dress is climbing up set of stairs in an entry way'])[0]

In [None]:
def reduce_desc(descriptions, max_captions=2):
    """Keep only the first ``max_captions`` captions of every image.

    The dictionary is updated in place (each value is replaced by a truncated
    copy of its list) and also returned for convenience.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        max_captions: Number of leading captions to keep per image. Defaults
            to 2, the value that used to be hard-coded.

    Returns:
        The same ``descriptions`` object that was passed in.
    """
    for key in descriptions:
        # Only existing keys are reassigned, so iterating while assigning is safe.
        descriptions[key] = descriptions[key][:max_captions]
    return descriptions
# Hand-written sample with five captions per image, used to check that
# reduce_desc truncates each list (the result is displayed as the cell output).
tmp = {'1000268201_693b08cb0e': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building',
  'little girl climbing into wooden playhouse',
  'little girl climbing the stairs to her playhouse',
  'little girl in pink dress going into wooden cabin'],
 '1001773457_577c3a7d70': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road',
  'black dog and white dog with brown spots are staring at each other in the street',
  'two dogs of different breeds looking at each other on the road',
  'two dogs on pavement moving toward each other']}
reduce_desc(tmp)

In [None]:
def encodeStartEnd(desc):
    """Wrap every caption in the ``startseq`` / ``endseq`` marker words.

    The caption lists inside ``desc`` are rewritten in place; the same
    dictionary is returned.
    """
    for captions in desc.values():
        for pos, caption in enumerate(captions):
            captions[pos] = 'startseq ' + caption + ' endseq'
    return desc
# Small hand-written sample used to check that encodeStartEnd adds the
# start/end markers (the result is displayed as the cell output).
tmp = {'1000268201_693b08cb0e': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building'],
 '1001773457_577c3a7d70': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road']}
encodeStartEnd(tmp)

# preprocess

In [None]:
def preprocess(filename):
    """Run the full caption pipeline on the caption file at ``filename``.

    Steps: load the file, group captions by image id, clean them, keep the
    leading captions of each image, add the start/end markers, then build
    the vocabulary and tokenizer from the result.

    Returns:
        ``(desc, vocab, tokenizer)`` — the processed id -> captions mapping,
        the word set and the fitted tokenizer.
    """
    desc = extract_desc(load_data(filename))
    # Both helpers below modify `desc` in place.
    clean_desc(desc)
    reduce_desc(desc)
    desc = encodeStartEnd(desc)
    vocab, tokenizer = create_vocabulary(desc)
    return desc, vocab, tokenizer

In [None]:
# Run the whole caption pipeline on the full caption file. This rebinds
# `vocabulary` and `tokenizer`, which earlier cells built from a ten-line sample.
path = "../input/flicker8k-image-captioning/Flickr8k_text/Flickr8k.token.txt"
preprocessed_data, vocabulary, tokenizer = preprocess(path)

In [None]:
# Sanity check: report the vocabulary size and show one image's processed captions.
print("Vocabulary size",len(vocabulary))
print("Processed description for image 1001773457_577c3a7d70")
preprocessed_data["1001773457_577c3a7d70"]

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
def create_record_of_images(data):
    """Return the distinct image ids of the caption file, in first-seen order.

    The image id of a line is the part of its first TAB-separated field
    before the first ``.``.

    Blank lines (for example the one produced by a trailing newline) are
    skipped; previously they added an empty id ``''`` to the result. A set
    tracks the ids already seen so membership checks do not rescan the list.
    The former ``try/except`` was removed because the split expression cannot
    raise for string input.

    Args:
        data: Full text of the caption file.

    Returns:
        list of image id strings without duplicates.
    """
    seen = set()
    dataset = []
    for line in data.split('\n'):
        if not line.strip():
            continue
        name = line.split('\t')[0].split('.')[0]
        if name not in seen:
            seen.add(name)
            dataset.append(name)
    return dataset

# Collect the distinct image ids from the raw caption text and preview ten of them.
dataset = create_record_of_images(data)
dataset[:10]

In [None]:
# Number of images that have processed captions.
len(preprocessed_data)

create text data for training

In [None]:
# Number of image ids collected from the caption file.
len(dataset)

In [None]:
# Inspect the processed captions of one image.
preprocessed_data['1001773457_577c3a7d70']

prepare dataset for training

In [None]:
# Number of distinct words in the processed captions. The model-building and
# generator cells pass vocab_size+1 rather than this value directly.
vocab_size = len(vocabulary)

In [None]:
def max_len(descriptions):
    """Return the length, in words, of the longest caption.

    The result is used as the padding length for word-index sequences, so it
    has to count words. The previous implementation used ``len(desc)``, i.e.
    the number of characters, which made every padded sequence several times
    longer than necessary.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        The largest whitespace-separated word count over all captions, or 0
        when there are no captions at all.
    """
    return max(
        (len(desc.split()) for desc_list in descriptions.values() for desc in desc_list),
        default=0,
    )

# Longest caption length as reported by max_len; used below as the padding
# length and as the text-input size of the model.
max_length = max_len(preprocessed_data)

In [None]:
def create_sequences(tokenizer, max_length, descriptions, image_features, vocab_size):
    """Expand every caption into (image, partial caption) -> next-word samples.

    For a caption encoded as ``[w0, w1, ..., wn]`` one sample is produced for
    each ``i`` in ``1..n``: the image feature, the prefix ``[w0..w(i-1)]``
    post-padded to ``max_length``, and the one-hot encoding of ``wi``.

    The captions are read from the ``descriptions`` argument. Previously the
    function ignored that argument and read the global ``preprocessed_data``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every input sequence is padded to.
        descriptions: dict mapping image id -> list of caption strings; must
            contain every key of ``image_features``.
        image_features: dict mapping image id -> feature vector.
        vocab_size: Number of classes of the one-hot target.

    Returns:
        Three numpy arrays ``(X1, X2, y)``: image features, padded input
        sequences and one-hot next-word targets, aligned by sample.
    """
    X1, X2, y = list(), list(), list()
    for key in image_features.keys():
        for desc in descriptions[key]:
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                X1.append(image_features[key])
                X2.append(pad_sequences([seq[:i]], maxlen=max_length, padding='post')[0])
                y.append(to_categorical([seq[i]], num_classes=vocab_size)[0])
    return np.array(X1), np.array(X2), np.array(y)

In [None]:
#X1_train, X2_train, y_train = create_sequences(tokenizer, max_length, preprocessed_data, image_features, vocab_size+1)

In [None]:
#X1_train.shape, X2_train.shape, y_train.shape

In [None]:
#X1_train

In [None]:
#X2_train[:5]

In [None]:
#list(y_train[0]).index(1)

In [None]:
from keras.applications.inception_v3 import InceptionV3,preprocess_input
from keras.layers import Dense,BatchNormalization,Dropout,Embedding,RepeatVector
from keras.preprocessing.image import load_img, img_to_array

from keras.models import Sequential
from keras.models import Model

In [None]:
# Instantiate InceptionV3 with its ImageNet weights as the image feature extractor.
inception = InceptionV3(weights='imagenet')

In [None]:
# Attempt to drop the last entry of the layer list, then freeze every layer so
# the network is not trained further.
# NOTE(review): in some Keras versions `model.layers` returns a copy, which would
# make this pop() a no-op — confirm. create_model expects 1000-dimensional image
# features, which suggests the final prediction output is what ends up being used.
inception.layers.pop()
for layer in inception.layers:
    layer.trainable = False
# Display the output tensor of the second-to-last layer in the list.
inception.layers[-2].output

In [None]:
# Feature extractor: maps the InceptionV3 input to the output of the last layer
# in `inception.layers`.
final_model = Model(inception.input,inception.layers[-1].output)

In [None]:
# Display the collected image ids.
# NOTE(review): this prints the entire list; consider a slice such as dataset[:10].
dataset

In [None]:
TARGET_SIZE = (299,299)
image_features = dict()

# Extract one feature vector per image id in `dataset`.
# Errors are handled per image: an unreadable or missing file is reported and
# skipped. Previously a single try/except wrapped the whole loop, so the first
# failure silently stopped feature extraction for every remaining image.
for el in dataset:
    tokens = el.split(".")
    image_id = tokens[0]
    try:
        img = load_img("../input/flicker8k-image-captioning/Flickr8k_Dataset/Flicker8k_Dataset/{}.jpg".format(image_id),target_size=TARGET_SIZE)
        # Converting image to array
        img_array = img_to_array(img)
        nimage = preprocess_input(img_array)
        # Adding one more dimension (batch axis of size 1)
        nimage = np.expand_dims(nimage, axis=0)
        fea_vec = final_model.predict(nimage, verbose=0)
        # Drop the batch axis so each image maps to a flat feature vector.
        image_features[image_id] = np.reshape(fea_vec, fea_vec.shape[1])
    except Exception as e:
        # Best effort: report which image failed and continue with the next one.
        print("Exception got :- \n", image_id, e)

In [None]:
# Check the shape of one extracted feature vector.
image_features['1000268201_693b08cb0e'].shape

create model

In [None]:
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [None]:
def create_model(vocab_size, max_length, feature_size=1000):
    """Build and compile the caption model.

    The model has two inputs: an image feature vector and a padded sequence
    of word indices. The image branch applies dropout and a dense layer; the
    text branch applies an embedding (index 0 is masked), dropout and an
    LSTM. Both branches are added and fed through a dense layer and a softmax
    over the vocabulary.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        max_length: Length of the padded input sequences.
        feature_size: Length of the image feature vector. Defaults to 1000,
            the value that used to be hard-coded.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    # for image part
    inputs1 = Input(shape=(feature_size,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # for text part
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # combine both
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # create model
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() emitted an extra "None" line.
    model.summary()
    return model

In [None]:
# Build the caption model; vocab_size+1 is passed as the vocabulary dimension.
model = create_model(vocab_size+1, max_length)

In [None]:
# Checkpoint callback that saves the model only when val_loss improves.
# NOTE(review): both the file name template and `monitor` rely on val_loss, but
# the fit_generator call in this notebook passes no validation data and does not
# use this callback — confirm whether it is still needed.
model_path = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [None]:
#model.fit([X1_train, X2_train], y_train, epochs=2, verbose=1, callbacks=[checkpoint])

Data generator (yields the training samples of one image at a time)

In [None]:
def create_sequences_for_datagen(tokenizer, max_length, desc_list, image_feature, vocab_size):
    """Build the training samples for a single image.

    Every caption in ``desc_list`` is integer-encoded and split at each
    position after the first token: the tokens before the split (post-padded
    to ``max_length``) form the input sequence, the token at the split is the
    one-hot target, and ``image_feature`` is repeated for each sample.

    Returns:
        Three numpy arrays: image features, padded input sequences and
        one-hot targets, aligned by sample.
    """
    img_inputs, seq_inputs, targets = [], [], []
    for caption in desc_list:
        encoded = tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1, len(encoded)):
            prefix, next_word = encoded[:split_at], encoded[split_at]
            img_inputs.append(image_feature)
            seq_inputs.append(pad_sequences([prefix], maxlen=max_length, padding='post')[0])
            targets.append(to_categorical([next_word], num_classes=vocab_size)[0])
    return np.array(img_inputs), np.array(seq_inputs), np.array(targets)

In [None]:
def data_generator(preprocessed_data, image_features, tokenizer, max_length, vocab_size):
    """Endlessly yield the training samples of one image at a time.

    The generator cycles over the keys of ``image_features`` forever. For
    each image it looks up the captions in ``preprocessed_data`` and yields
    ``([image_inputs, sequence_inputs], targets)`` as built by
    ``create_sequences_for_datagen``.
    """
    while True:
        for image_id in image_features.keys():
            captions = preprocessed_data[image_id]
            feature = image_features[image_id]
            samples = create_sequences_for_datagen(tokenizer, max_length, captions, feature, vocab_size)
            in_img, in_seq, out_word = samples
            yield [in_img, in_seq], out_word

In [None]:
# Build a fresh, untrained caption model for generator-based training.
model = create_model(vocab_size+1, max_length)

In [None]:
# The generator yields one batch per image in `image_features`, so one epoch is
# one pass over the images that actually have extracted features. Using
# len(preprocessed_data) overshoots whenever some images have no features.
steps = len(image_features)
generator = data_generator(preprocessed_data, image_features, tokenizer, max_length, vocab_size+1)
model.fit_generator(generator, epochs=20, steps_per_epoch=steps, verbose=1)
# Persist the trained model.
model.save('model_20_epochs.h5')

Prediction (generate a caption for an image with the trained model)

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The mapping is scanned in order and the first match is returned; ``None``
    is returned when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
def generate_desc(model, tokenizer, image_feature, max_length):
    """Generate a caption for one image, one word at a time.

    Starting from ``'startseq'``, the text so far is encoded, post-padded to
    ``max_length`` and passed to the model together with ``image_feature``.
    The highest-scoring index is mapped back to a word and appended.
    Generation stops after ``max_length`` steps, when the index has no word,
    or once ``'endseq'`` has been appended.

    Returns:
        The generated text, including the leading ``startseq`` and, if it was
        predicted, the trailing ``endseq``.
    """
    caption = 'startseq'
    for _ in range(max_length):
        # Encode and pad the text generated so far.
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], padding='post', maxlen=max_length)
        # Pick the most probable next word index and map it back to a word.
        probabilities = model.predict([image_feature, padded], verbose=0)
        next_word = word_for_id(np.argmax(probabilities), tokenizer)
        if next_word is None:
            # The predicted index has no word; nothing more can be appended.
            break
        caption += ' ' + next_word
        if next_word == 'endseq':
            break
    return caption

In [None]:
# Generate a caption for one image. Despite its name, `image_desc` holds the
# image's feature vector; it is reshaped to a batch of one row before prediction.
image_desc = image_features['1015118661_980735411b']
image_desc = image_desc.reshape((1,image_desc.shape[0]))
description = generate_desc(model, tokenizer, image_desc, max_length)
print(description)

In [None]:
#tokenizer.word_index

In [None]:
# Ad-hoc inspection: first component of one image's feature vector.
image_features['2916586390_664f0139ea'][0]

In [None]:
# Ad-hoc inspection: processed captions of one image.
preprocessed_data['997338199_7343367d7f']

In [None]:
# Ad-hoc inspection: the image id that follows '997338199_7343367d7f' in `dataset`.
dataset[dataset.index('997338199_7343367d7f')+1]

In [None]:
# Ad-hoc inspection: processed captions of another image.
preprocessed_data['997722733_0cb5439472']

In [None]:
# Display the vocabulary size computed earlier.
vocab_size

In [None]:
# Ad-hoc inspection: the word the tokenizer stores under index 6046.
tokenizer.index_word[6046]

In [None]:
# Scratch check of pad_sequences: pad a three-element sequence to length 5
# using the function's default padding side.
a = [1,2,3]
pad_sequences([a], maxlen=5)[0]

In [None]:
# Scratch check of to_categorical: one-hot encode class 2 out of 5 classes.
b = 2
to_categorical([b], num_classes=5)[0]

In [None]:
pre