# Import everything

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt

# regex
import re

# merge paths
import os

import numpy as np 

# create datasets
import pandas as pd 

# to get file names in a folder 
from glob import glob

# The Image module provides a class with the same name which is used to represent a PIL image. The module also provides a number of factory functions, including functions to load images from files, and to create new images.
from PIL import Image

# to save a model as a file 
from pickle import load, dump

# one-hot encoding 
from keras.utils import to_categorical

# It takes as input a list of tensors, all of the same shape, and returns a single tensor (also of the same shape).
from keras.layers.merge import add

# Loads a model saved via model.save(), Model is the actual model 
from keras.models import Model, load_model

# Dense: dense layers perform classification on the features extracted by the convolutional layers and down-sampled by the pooling layers. 
# LSTM: RNN may suffer from the vanishing gradient problem - LSTM solves this problem, LSTM knows what to store and what to throw away. 
# Dropout: The Dropout layer randomly sets input units to 0 with a frequency of rate at each step during training time, which helps prevent overfitting. Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over all inputs is unchanged.
# Embedding: Use to create our own word embeddings using a tokenizer 
# Input: is used to instantiate a Keras tensor.
from keras.layers import Dense, LSTM, Dropout, Embedding, Input

# Pads sequences to the same length.
from keras.preprocessing.sequence import pad_sequences

# Define global variables

In [None]:
# marker tokens wrapped around every caption (see fetchData) and used as
# stop conditions when generating a caption (see generate_caption)
start = "<start>"
end = "<end>"

# location of the image files and of the '|'-delimited caption CSV
pathToImageFolder = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/flickr30k_images/'
pathToImageCaptionCSV = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/results.csv'

# training params
# NOTE: fetchData passes this value to DataFrame.head(), so it limits the
# number of caption rows read from the CSV, not the number of distinct pictures
pictures_used_when_training = 10000

# In terms of artificial neural networks, an epoch refers to one cycle through the full training dataset
epochs = 10

# dimensions (in pixels) every picture is resized to before feature extraction
xdim = 299
ydim = 299

# Define functions used when preprocessing data

We use the pretrained CNN model "Xception": https://keras.io/api/applications/xception/

Quote from the website:
Optionally loads weights pre-trained on ImageNet.
The default input image size for this model is 299x299.

In [None]:
# returns a list with the names of all images and a dictionary with the name as key and the captions as value
def fetchData():
    """Read the caption CSV and group the captions by image file name.

    Only the first ``pictures_used_when_training`` rows of the CSV are used.
    That limit counts caption rows, not distinct pictures, so an image with
    several captions uses up several rows of the budget.

    Each caption has its periods removed, is stripped of surrounding
    whitespace and is wrapped in the ``start`` / ``end`` marker tokens.
    Rows whose caption cell is not a string (e.g. a malformed CSV line that
    pandas parses as NaN) are skipped instead of crashing the whole load.

    Returns:
        A tuple ``(all_img_name_vector, all_captions)``:
        ``all_img_name_vector`` is the list of distinct image file names in
        order of first appearance, and ``all_captions`` maps each file name to
        the list of its wrapped captions.
    """
    # read the captions into a pandas DataFrame and give the columns stable names
    captions = pd.read_csv(pathToImageCaptionCSV, delimiter='|')
    captions.columns = ['imageName', 'captionNumber', 'caption']

    # captions grouped per image; dicts keep insertion order, so the keys
    # double as a de-duplicated, deterministically ordered list of names
    all_captions = {}

    subset = captions.head(n=pictures_used_when_training)
    for im_ID, raw_caption in zip(subset['imageName'], subset['caption']):
        if not isinstance(raw_caption, str):
            # missing / malformed caption cell: nothing usable on this row
            continue
        caption = start + " " + raw_caption.replace(".", "").strip() + " " + end
        all_captions.setdefault(im_ID, []).append(caption)

    all_img_name_vector = list(all_captions)
    return all_img_name_vector, all_captions

# creates the feature dictionary (feature representation of every picture)
def createFeatures(img_name_vector):
    """Return the CNN feature representation of every requested image.

    Features are cached in the pickle file ``features.p`` in the working
    directory. Images that are not in the cache yet (including the case where
    the cache file does not exist) are run through the pre-trained Xception
    network and the cache file is rewritten with the new entries.

    Args:
        img_name_vector: iterable of image file names (relative to
            ``pathToImageFolder``).

    Returns:
        dict mapping each name in ``img_name_vector`` to the array returned by
        ``model.predict`` for that single image (presumably shape (1, 2048);
        confirm against the Xception documentation).
    """
    cache_path = 'features.p'

    # NOTE: pickle must only be used on files you created yourself; loading an
    # untrusted pickle can execute arbitrary code.
    features = {}
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as cache_file:
            features = load(cache_file)

    # names without a cached feature, de-duplicated while keeping their order
    missing = [im_ID for im_ID in dict.fromkeys(img_name_vector) if im_ID not in features]

    if missing:
        # load the pretrained Xception model
        # pooling='avg' averages each feature map of the last convolutional block into one value
        # include_top=False drops the ImageNet classification layer
        model = tf.keras.applications.Xception(pooling = 'avg', include_top = False)

        # represent each missing picture as a vector using the CNN
        for im_ID in missing:
            features[im_ID] = model.predict(loadPicture(im_ID))

        # store the feature representation of every picture in features.p
        with open(cache_path, "wb") as cache_file:
            dump(features, cache_file)

    # only return the images that are in the name vector
    return {im_ID: features[im_ID] for im_ID in img_name_vector}

# transform picture to the standard dimension (xdim x ydim)
def loadPicture(im_ID):
    """Load one image as a normalised single-image batch.

    The file ``im_ID`` inside ``pathToImageFolder`` is opened, converted to
    RGB (so grayscale / RGBA files also yield three channels), resized to
    ``(xdim, ydim)`` and given a leading batch axis. The pixel values are
    then rescaled with ``normalize``.

    Args:
        im_ID: image file name relative to ``pathToImageFolder``.

    Returns:
        Array of shape (1, ydim, xdim, 3) as produced by ``normalize``.
    """
    imagePath = os.path.join(pathToImageFolder, im_ID)

    # the context manager closes the underlying file; convert() and resize()
    # return independent in-memory images, so the result stays valid afterwards
    with Image.open(imagePath) as raw_image:
        resized = raw_image.convert('RGB').resize((xdim, ydim))

    batch = np.expand_dims(np.asarray(resized), axis=0)
    return normalize(batch)

# rescale pixel values so that 0 maps to -1 and 255 maps to 1
def normalize(image):
    """Return ``image`` divided by 127.5 and shifted down by 1.0.

    Works on anything that supports ``/`` and ``-`` with floats (plain
    numbers or numpy arrays); the input is not modified.
    """
    half_range = 127.5
    scaled = image / half_range
    return scaled - 1.0

# create and return tokenizer
# This class allows to vectorize a text corpus, by turning each text into a sequence of integers
def extractTokens(all_captions, num_words):
    """Fit a Keras ``Tokenizer`` on the captions and pickle it to ``tokenizer.p``.

    Args:
        all_captions: list of caption strings used to build the vocabulary.
        num_words: passed to ``Tokenizer(num_words=...)``.

    Returns:
        The fitted tokenizer (also written to ``tokenizer.p`` in the working
        directory).
    """
    # The filter string deliberately contains neither '<' nor '>', so marker
    # tokens such as "<start>" and "<end>" are not stripped of their brackets.
    # The backslash is escaped explicitly: '\]' is an invalid escape sequence.
    tkz = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ',
    )

    # Updates internal vocabulary based on a list of texts.
    tkz.fit_on_texts(all_captions)

    with open('tokenizer.p', 'wb') as tokenizer_file:
        dump(tkz, tokenizer_file)
    return tkz

# flatten dictionary
def flatten(dictionary):
    """Concatenate all values of a dict of lists into one new flat list.

    Args:
        dictionary: mapping whose values are iterables (e.g. image name ->
            list of captions).

    Returns:
        A new list with the elements of every value, in dictionary order.
        The input mapping is not modified.
    """
    return [element for values in dictionary.values() for element in values]

# Preprocess data

In [None]:
# image names and per-image captions read from the caption CSV
all_img_name, all_captions = fetchData()

# feature representation of the images
features = createFeatures(all_img_name)

# tokenizer, fitted on every caption
flat_all_captions = flatten(all_captions)
tkz = extractTokens(flat_all_captions, 5000)
# +1 because the vocabulary size is used as the one-hot width and the
# padding value 0 needs a slot as well
# NOTE(review): word_index presumably holds every word seen, not only the
# num_words most frequent ones - confirm against the Tokenizer documentation
vocabSize = len(tkz.word_index) + 1

# max length used for the caption length
# NOTE(review): measured in whitespace-separated words, while the tokenizer
# also splits on the characters in its filter string (e.g. '-'), so a
# tokenized caption may be longer than this value - confirm
caption_max_length = max(len(t.split()) for t in flat_all_captions)

In [None]:
# debugging aid: dump the whole image-name -> feature dictionary
print(features)

In [None]:
# debugging aid: inspect the tokenizer configuration (shown as the cell output)
tkz.get_config()

# Define functions used when building and training

In [None]:
# Build the RNN
def buildRNN(vocabSize, caption_max_length, feature_dim=2048, units=256, dropout_rate=0.5):
    """Build and compile the caption model (image branch + caption branch).

    The model takes two inputs, an image feature vector and a padded sequence
    of word indexes, and outputs a softmax distribution over the vocabulary
    for the next word.

    Args:
        vocabSize: number of classes of the output layer and number of rows
            of the embedding.
        caption_max_length: length of the padded caption input.
        feature_dim: width of the image feature vector. The default 2048 is
            the value this file has always used for the features produced by
            the Xception extractor.
        units: width of the hidden layers. Both branches must use the same
            width because their outputs are summed.
        dropout_rate: fraction of units dropped in each Dropout layer.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy loss, Adam
        optimizer).
    """
    # image branch: dropout against overfitting, then project the feature
    # vector down to `units` dimensions
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(dropout_rate)(inputs1)
    fe2 = Dense(units, activation='relu')(fe1)

    # caption branch: embed the word indexes (index 0 is masked as padding),
    # apply dropout, then summarise the sequence with an LSTM
    inputs2 = Input(shape=(caption_max_length,))
    se1 = Embedding(vocabSize, units, mask_zero=True)(inputs2)
    se2 = Dropout(dropout_rate)(se1)
    se3 = LSTM(units)(se2)

    # merge the two branches by element-wise addition and predict the next word
    decoder1 = add([fe2, se3])
    decoder2 = Dense(units, activation='relu')(decoder1)
    outputs = Dense(vocabSize, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    RNN_model = Model(inputs=(inputs1, inputs2), outputs=outputs)
    RNN_model.compile(loss='categorical_crossentropy', optimizer='adam')

    return RNN_model

# endless generator: feeds the model the samples of one image at a time so
# that the whole training set never has to be held in memory at once
def data_generator(all_captions, features, tkz, caption_max_length):
    """Yield ``([image_batch, sequence_batch], target_batch)`` forever.

    Each yielded item holds all training samples that ``createSequences``
    derives from the captions of a single image. After the last image the
    iteration starts over from the first one.
    """
    while True:
        for image_name, image_captions in all_captions.items():
            # features[...] holds a single-image batch; take its only row
            image_vector = features[image_name][0]

            image_batch, sequence_batch, target_batch = createSequences(
                tkz, caption_max_length, image_captions, image_vector
            )

            yield ([image_batch, sequence_batch], target_batch)
            
def createSequences(tkz, caption_max_length, captions, feature, vocab_size=None):
    """Turn the captions of one image into next-word training samples.

    Every caption is tokenized; for each position ``i >= 1`` one sample is
    produced whose input is the first ``i`` tokens (padded to
    ``caption_max_length``) and whose target is token ``i``, one-hot encoded.

    Args:
        tkz: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        caption_max_length: length the input sequences are padded to.
        captions: list of caption strings belonging to one image.
        feature: feature vector of that image; repeated once per sample.
        vocab_size: width of the one-hot targets. Defaults to
            ``len(tkz.word_index) + 1`` so that the function no longer
            depends on a module-level variable.

    Returns:
        Tuple ``(X1, X2, y)`` of numpy arrays: image features, padded input
        sequences and one-hot targets. All three are empty arrays when no
        caption has at least two tokens.
    """
    if vocab_size is None:
        vocab_size = len(tkz.word_index) + 1

    prefixes = []   # all the words of a caption before position i
    targets = []    # the word index at position i

    # transform text to vectors using the tokenizer
    for seq in tkz.texts_to_sequences(captions):
        for i in range(1, len(seq)):
            prefixes.append(seq[:i])
            targets.append(seq[i])

    if not prefixes:
        return np.array([]), np.array([]), np.array([])

    # pad / one-hot encode all samples in one call each instead of one call
    # per sample
    X1 = np.array([feature] * len(prefixes))
    X2 = pad_sequences(prefixes, maxlen = caption_max_length)
    y = to_categorical(targets, num_classes=vocab_size)
    return X1, X2, y

# Build and train the model

In [None]:
model = buildRNN(vocabSize, caption_max_length)

# the generator yields one item per image, so one pass over the data is
# len(all_captions) steps
steps_per_epoch = len(all_captions)

for i in range(epochs):
    filename = "model_" + str(i) + ".h5"

    # fresh generator so every epoch starts at the first image
    gen = data_generator(all_captions, features, tkz, caption_max_length)

    # train exactly ONE epoch per loop iteration; passing epochs=epochs here
    # would run epochs * epochs passes in total
    # NOTE(review): fit_generator is deprecated in recent Keras releases in
    # favour of model.fit - switch once the installed version is confirmed
    model.fit_generator(gen, epochs = 1, steps_per_epoch = steps_per_epoch, verbose = 1)

    # checkpoint after every epoch
    model.save(filename)

# Load pre-saved model (so we dont have to train it every time)

In [None]:
# NOTE: only unpickle files you trust - loading a pickle can execute arbitrary code
# NOTE(review): vocabSize and caption_max_length are not recomputed here; they
# are assumed to match the values the saved model was trained with - confirm
with open("../input/longtraining/tokenizer.p", "rb") as tokenizer_file:
    tkz = load(tokenizer_file)
model = load_model('../input/longtraining/model_8.h5')

# Define functions used when testing the model

In [None]:
# reverse lookup: word index -> word
def word_for_id(integer, tokenizer):
    """Return the first word in ``tokenizer.word_index`` mapped to ``integer``.

    Returns ``None`` when no word has that index.
    """
    matching_words = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matching_words, None)
    
def generate_caption(model, tkz, img_feature, caption_max_length):
    """Greedily generate a caption for one image feature.

    Starting from the ``start`` marker, the model is asked for the most
    probable next word given the image feature and the caption so far. The
    word is appended and the process repeats until ``caption_max_length``
    words were added, the predicted index has no word in the tokenizer, or
    the predicted word is the ``end`` or ``start`` marker (which is still
    appended).

    Returns:
        The generated caption string, beginning with the ``start`` marker.
    """
    caption = start
    stop_words = (end, start)

    for _ in range(caption_max_length):
        # encode the caption so far and pad it to the model's input length
        token_ids = tkz.texts_to_sequences([caption])[0]
        padded = pad_sequences([token_ids], maxlen = caption_max_length)

        # probability of every word given the picture and the previous words
        probabilities = model.predict([img_feature, padded], verbose=0)

        # most probable index, translated back into its word
        word = word_for_id(np.argmax(probabilities), tkz)

        # no word for this index (e.g. a tokenizer that does not match the model)
        if word is None:
            break

        caption += ' ' + word

        # stop on the end marker, or if the model wants to emit start again
        if word in stop_words:
            break

    return caption

def _get_xception_model():
    """Build the Xception feature extractor once and reuse it on later calls."""
    if not hasattr(_get_xception_model, "cached_model"):
        _get_xception_model.cached_model = tf.keras.applications.Xception(pooling = 'avg', include_top = False)
    return _get_xception_model.cached_model


def test_model(model, tkz, im_ID):
    """Print a generated caption for one image and display the image.

    Args:
        model: trained caption model passed on to ``generate_caption``.
        tkz: tokenizer matching ``model``.
        im_ID: image file name relative to ``pathToImageFolder``.

    The caption length limit is read from the module-level
    ``caption_max_length``.
    """
    # CNN model (cached: building it and loading its weights on every call is slow)
    xc_model = _get_xception_model()

    # create the vector representation of the image
    image = loadPicture(im_ID)
    feature = xc_model.predict(image)

    # generate caption
    print(generate_caption(model, tkz, feature, caption_max_length))

    # display image; the context manager closes the file once it has been drawn
    with Image.open(os.path.join(pathToImageFolder, im_ID)) as img:
        plt.imshow(img)

# Test the model

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1001633352.jpg')

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1005216151.jpg')
# reference captions for comparison; raises KeyError if the image was not
# among the rows loaded by fetchData
all_captions['1005216151.jpg']

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1009434119.jpg')
# reference captions for comparison; raises KeyError if the image was not
# among the rows loaded by fetchData
all_captions['1009434119.jpg']

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1021293940.jpg')
# reference captions for comparison; raises KeyError if the image was not
# among the rows loaded by fetchData
all_captions['1021293940.jpg']

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1022975728.jpg')

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1032122270.jpg')

In [None]:
# list the image names of all CSV rows that lie beyond the rows fetchData
# reads for training, to pick images the model has not been trained on
# (a name is printed once per caption row, so names can repeat)
captions = pd.read_csv(pathToImageCaptionCSV, delimiter='|')
for cap in captions.image_name.iloc[pictures_used_when_training:]:
    print(cap)

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1714937792.jpg')

In [None]:
# print the generated caption for this image and display the image
test_model(model, tkz, '1891331926.jpg')