In [None]:
from tqdm.notebook import tqdm
import time
import requests
import os
import numpy as np
import io
import imageio
import tensorflow as tf
import tensorflow_datasets as tfds
import cv2
import matplotlib.pyplot as plt
import numpy
import h5py
import pickle
import pandas as pd
from urllib.request import urlopen

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import Input, layers
from array import array
from keras.layers.merge import add
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Embedding, Multiply,  Concatenate, TimeDistributed, Dense, RepeatVector, Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization

# Parent of the current working directory; base for the two local image folders below.
WORKING_DIRECTORY = os.path.dirname(os.getcwd())
# HDF5 files with the training / test data.
TRAIN_DATA_FILENAME = "../input/project-data/eee443_project_dataset_train.h5"
TEST_DATA_FILENAME = "../input/project-data/eee443_project_dataset_test.h5"
# Image folders below WORKING_DIRECTORY.
# NOTE(review): neither is referenced in the visible part of the notebook.
TRAIN_IMAGES_DIR = WORKING_DIRECTORY + "/data/train_images/"
TEST_IMAGES_DIR = WORKING_DIRECTORY + "/data/test_images/"
# Image folder that is actually passed to the caption-dictionary and feature-extraction steps.
TID = "../input/images-for-train/train_images/"
# Pickle cache files (despite the _DIR suffix these are used as file paths):
# the caption dictionary and the extracted image features.
CAPS_DICT_DIR = "../input/caps-dict/caps_dict"
PICKLE_FEATURE_DIR = "../input/features/feature_data"

In [None]:
# Read every array of the training HDF5 file into memory. The context manager
# closes the file handle as soon as the arrays have been copied out; `[()]`
# materialises each dataset as an in-memory array, so nothing below needs the
# file to stay open.
with h5py.File(TRAIN_DATA_FILENAME, "r") as f:

    # Report the shape of each stored dataset (metadata only, no data is read).
    for key in f.keys():
        print(key, ":", f[key].shape)

    train_cap = f["train_cap"][()]
    train_imid = f["train_imid"][()]
    train_ims = f["train_ims"][()]
    train_url = f["train_url"][()]
    word_code = f["word_code"][()]


# Build the vocabulary: order the columns of `word_code` by the value stored in
# row 0, so that position i of `words` is the column with the i-th smallest value.
# NOTE(review): assumes word_code has one named field per word holding that
# word's integer code - confirm against the dataset description.
df = pd.DataFrame(word_code)
df = df.sort_values(0, axis=1)
words = np.asarray(df.columns)

# Reverse lookup: word -> position in `words`.
wordtoix = {word: i for i, word in enumerate(words)}

print("Vocab Size =", len(words))


def caption_array_to_str(caption_array):
    """Turn a sequence of caption words into one space-separated string.

    The special tokens 'x_NULL_', 'x_START_' and 'x_END_' are dropped. Every
    kept word is followed by a single space, so a non-empty result ends with a
    trailing space; an input without any regular word gives "".
    """
    special_tokens = ('x_NULL_', 'x_START_', 'x_END_')
    kept = [token for token in caption_array if token not in special_tokens]
    return "".join(token + " " for token in kept)

def tidy_caps(url, imid, cap, image_directory=None):
    """Group caption rows by image file name.

    Parameters
    ----------
    url : array of bytes
        Image URLs. The file name is the part after the last "/"; the URL at
        position i is matched against image id i + 1.
    imid : numpy.ndarray
        Image id of every row of `cap`.
    cap : numpy.ndarray
        Caption rows, aligned with `imid`.
    image_directory : str, optional
        When given, only the ".jpg" / ".jpeg" files found in this directory are
        included. Files that do not correspond to any URL are skipped because
        there are no captions for them.

    Returns
    -------
    dict
        File name -> array with the caption rows of that image.
    """
    caps_dict = {}

    if image_directory is not None:

        # Build the name -> image id lookup once instead of scanning the whole
        # URL list for every file. setdefault keeps the first occurrence of a
        # duplicated name, which is what list.index() returned before.
        name_to_id = {}
        for position, u in enumerate(np.char.decode(url).tolist()):
            name_to_id.setdefault(u.split("/")[-1].strip(), position + 1)

        for filename in tqdm(os.listdir(image_directory)):

            if not filename.endswith((".jpg", ".jpeg")):
                continue

            ind = name_to_id.get(filename)
            if ind is None:
                # Image is not part of the dataset: nothing to attach to it.
                continue

            caps_dict[filename] = cap[np.where(imid == ind)]

    else:

        for i in range(len(url)):

            name = url[i].decode().split("/")[-1]
            caps_dict[name] = cap[np.where(imid == (i+1))]

    return caps_dict




def create_pre_processed_set(image_directory, shuffle=False):
    """Build a tf.data pipeline of (image, file name) pairs.

    Every "*.jpg" file matching `image_directory` + "*.jpg" is read, decoded
    to 3 channels, resized to 299x299 and passed through the InceptionV3
    `preprocess_input`. The second element of each pair is the part of the
    path after the last "/".

    `shuffle` is forwarded to `tf.data.Dataset.list_files`.
    """

    def load_image(path):
        # Decode -> resize -> InceptionV3 input scaling.
        raw = tf.io.read_file(path)
        decoded = tf.image.decode_jpeg(raw, channels=3)
        resized = tf.image.resize(decoded, (299, 299))
        return tf.keras.applications.inception_v3.preprocess_input(resized)

    def file_name(path):
        # Runs inside tf.py_function, so the path tensor can be read with .numpy().
        return path.numpy().decode().split("/")[-1]

    def to_pair(path):
        return (
            tf.py_function(load_image, [path], tf.float32),
            tf.py_function(file_name, [path], tf.string),
        )

    pattern = str(image_directory) + "*.jpg"
    return tf.data.Dataset.list_files(pattern, shuffle=shuffle).map(to_pair)





def create_features_pickled(pickle_file_dir, dataset_images, captions=None):
    """Extract InceptionV3 features for every image and append them to a pickle file.

    Parameters
    ----------
    pickle_file_dir : str
        Path of the output file (opened in "wb" mode, so it is overwritten).
    dataset_images : tf.data.Dataset
        Yields (image, file name) pairs; it is batched with batch size 1 here.
    captions : dict, optional
        File name -> captions. Defaults to the notebook-level `caps_dict`,
        which is what this function always used before the parameter existed.

    One tuple ``(flattened feature, captions[name], tf.constant(name))`` is
    pickled per image, back to back in the same file.
    """
    if captions is None:
        # Backward-compatible default: the global caption dictionary.
        captions = caps_dict

    # Feature extractor: InceptionV3 cut off before its last layer.
    model = tf.keras.applications.InceptionV3(weights='imagenet')
    inception = tf.keras.Model(model.input, model.layers[-2].output)

    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(pickle_file_dir, "wb") as outfile:

        for data in tqdm(dataset_images.batch(1)):

            image = data[0]
            key = data[1].numpy().astype(str)[0]

            feature = inception(image)
            feature = tf.reshape(feature, -1).numpy()

            super_tuple = (feature, captions[key], tf.constant(key))
            pickle.dump(super_tuple, outfile)
    
    
    
def loadpickle(filename):
    """Yield, in order, every object that was pickled back to back into `filename`.

    The generator stops when the end of the file is reached. Objects are
    restored with pickle.load, so the file must come from a trusted source.
    """
    with open(filename, "rb") as stream:
        try:
            while True:
                yield pickle.load(stream)
        except EOFError:
            # No more pickled objects in the file.
            return
                

                
def create_dataset(image_directory, feature_pickle_directory):
    """Return a tf.data.Dataset that streams the pickled feature tuples.

    If `feature_pickle_directory` is not an existing file, the images in
    `image_directory` are pre-processed and their features are written to that
    file first. The dataset elements are declared as
    (float32, int32, string) and are produced by `loadpickle`.
    """
    cache_exists = os.path.isfile(feature_pickle_directory)

    if not cache_exists:
        # First run: compute the features and write the cache file.
        create_features_pickled(
            feature_pickle_directory,
            create_pre_processed_set(image_directory),
        )

    element_types = (np.float32, np.int32, tf.string)
    return tf.data.Dataset.from_generator(
        loadpickle,
        args=[feature_pickle_directory],
        output_types=element_types,
    )


def create_caption_dictionary(train_url, train_imid, train_cap, image_directory, caption_pickle_directory):
    """Load the caption dictionary from its pickle cache, building it if needed.

    Parameters
    ----------
    train_url, train_imid, train_cap
        Forwarded unchanged to `tidy_caps` when the cache file does not exist.
    image_directory : str
        Forwarded to `tidy_caps` as the image folder.
    caption_pickle_directory : str
        Path of the pickle cache file.

    Returns
    -------
    (dict, int)
        The caption dictionary and its number of entries.
    """
    if os.path.isfile(caption_pickle_directory):
        # NOTE: pickle.load can execute arbitrary code; only point this at a
        # cache file produced by this notebook.
        with open(caption_pickle_directory, "rb") as infile:
            caps_dict = pickle.load(infile)

    else:
        # tidy_caps returns the dictionary only; unpacking it into two names
        # (as was done before) raises ValueError for any realistic dictionary.
        caps_dict = tidy_caps(train_url, train_imid, train_cap, image_directory)

        with open(caption_pickle_directory, "wb") as outfile:
            pickle.dump(caps_dict, outfile)

    return caps_dict, len(caps_dict)



# Build (or load from the pickle cache) the file name -> captions dictionary for the images in TID.
caps_dict, data_length = create_caption_dictionary(train_url, train_imid, train_cap, TID, CAPS_DICT_DIR)

# Report how many of the dataset URLs have an entry in the caption dictionary.
print( "{} of {} retrieved. {:.1f}% of data is clean.".format(data_length, len(train_url), 100 * data_length/len(train_url) ) )



# Dataset of (feature, captions, file name) tuples, streamed from the feature pickle cache.
feature_dataset = create_dataset(TID, PICKLE_FEATURE_DIR)

# Visual sanity check on the first two elements: show the image and print its captions.
for d in feature_dataset.take(2):
    features = d[0]
    captions = d[1].numpy()
    image_name= d[2].numpy().decode()
    
    # OpenCV loads images as BGR; convert to RGB before handing the array to matplotlib.
    im = cv2.imread(TID + image_name)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    plt.imshow(im)
    plt.show()
    # Index the vocabulary with the caption codes to get the words of every caption.
    cap = words[captions]
    
    for c in cap:
        c = caption_array_to_str(c)
        print(c)
        
    print(features.shape, captions.shape, TID + image_name)

In [None]:
def data_generator(dataset, max_length, num_photos_per_batch, vocab_size):
    """Endlessly yield training batches built from (feature, captions, ...) elements.

    For every caption of every photo, each split position m >= 1 produces one
    sample: the photo feature, the caption prefix seq[:m] padded to
    `max_length` with `pad_sequences`, and the one-hot encoding (over
    `vocab_size` classes) of seq[m].

    A batch ``([features, prefixes], targets)`` is yielded each time
    `num_photos_per_batch` photos have been consumed. The dataset is iterated
    again and again; samples of an incomplete batch at the end of one pass are
    kept and completed during the next pass.
    """
    features_batch, prefixes_batch, targets_batch = [], [], []
    photos_seen = 0

    while True:

        for element in dataset:

            photos_seen += 1
            photo_feature = element[0].numpy()
            caption_rows = element[1].numpy()

            for caption in caption_rows:
                for split_at in range(1, caption.shape[0]):
                    # Input: padded prefix; target: one-hot of the next word.
                    prefix = pad_sequences([caption[:split_at]], maxlen=max_length)[0]
                    target = to_categorical([caption[split_at]], num_classes=vocab_size)[0]

                    features_batch.append(photo_feature)
                    prefixes_batch.append(prefix)
                    targets_batch.append(target)

            if photos_seen != num_photos_per_batch:
                continue

            yield [np.array(features_batch), np.array(prefixes_batch)], np.array(targets_batch)

            # Start collecting the next batch.
            features_batch, prefixes_batch, targets_batch = [], [], []
            photos_seen = 0

In [None]:
def create_embedding(wordtoix, glove_path='../input/glove6b200d/glove.6B.200d.txt', embedding_dim=200):
    """Build the embedding matrix for the vocabulary from a GloVe text file.

    Parameters
    ----------
    wordtoix : dict
        Word -> row index in the embedding matrix.
    glove_path : str, optional
        GloVe file with one "word v1 v2 ... vN" entry per line.
    embedding_dim : int, optional
        Number of values per word vector in `glove_path`.

    Returns
    -------
    numpy.ndarray
        Array of shape (vocab_size, embedding_dim), where `vocab_size` is the
        notebook-level variable of that name. Rows of words without a GloVe
        vector stay all-zero; each such word is printed.
    """
    # word -> vector, read line by line; the with-block guarantees the file is
    # closed even if a malformed line raises.
    embeddings_index = {}
    with open(glove_path, encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in wordtoix.items():

        # The unknown-word token is looked up under GloVe's 'unk' entry. The
        # other special tokens in this notebook are spelled 'x_NULL_',
        # 'x_START_' and 'x_END_', so the underscore spelling is accepted too.
        # NOTE(review): confirm the exact unknown-word token of the dataset.
        if word in ('xUNK', 'x_UNK_'):
            word = 'unk'

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # No GloVe vector: report the word and leave its row at zero.
            print(word)
            continue

        embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# Notebook-level hyper-parameters read by the model builders below:
# length of the caption input, number of vocabulary entries, size of a word vector.
max_length = 17
vocab_size = 1004
embedding_dim = 200
# create_embedding reads the global vocab_size, so it must be defined before this call.
embedding_matrix = create_embedding(wordtoix)

In [None]:
def create_merge_model(embedding_matrix):
    """Build the "merge" captioning model.

    The image branch (2048-d input -> Dropout -> Dense(256)) and the text
    branch (Embedding -> Dropout -> LSTM(256)) are summed element-wise and fed
    to Dense(256) and a softmax over the vocabulary.

    Reads the notebook-level `max_length`, `vocab_size` and `embedding_dim`.
    `embedding_matrix` initialises the Embedding layer, which stays trainable.
    """
    # Image branch.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text branch. Keep a handle on the Embedding layer so its weights can be
    # set directly instead of guessing its position in model.layers.
    inputs2 = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
    se1 = embedding_layer(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # The two 256-d branch outputs are added (not concatenated).
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # Initialise the embedding with the pre-computed matrix; keep it trainable.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = True

    return model

In [None]:
# Build the merge architecture, initialised with the embedding matrix, and print its layer summary.
model = create_merge_model(embedding_matrix)
model.summary()

In [None]:
def create_init_inject_model(embedding_matrix):
    """Build the "init-inject" captioning model.

    The image branch (2048-d input -> Dropout -> Dense(256)) is used as the
    initial state of a GRU(256) that reads the embedded caption prefix; the GRU
    output goes through Dense(256) and a softmax over the vocabulary.

    Reads the notebook-level `max_length`, `vocab_size` and `embedding_dim`.
    `embedding_matrix` initialises the Embedding layer, which stays trainable.
    """
    # Image branch.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text branch. Keep a handle on the Embedding layer so its weights can be
    # set directly instead of guessing its position in model.layers.
    inputs2 = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
    se1 = embedding_layer(inputs2)
    se2 = Dropout(0.5)(se1)

    # The image representation is the GRU's initial state; the returned final
    # state is not used.
    se3, _ = GRU(256, return_state=True)(se2, initial_state=fe2)

    decoder2 = Dense(256, activation='relu')(se3)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # Initialise the embedding with the pre-computed matrix; keep it trainable.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = True

    return model

In [None]:
def create_pre_inject_model(embedding_matrix):
    """Build the "pre-inject" captioning model.

    The image branch (2048-d input -> Dropout -> Dense(embedding_dim)) is
    reshaped to a one-step sequence and run through a first GRU(256). The
    final state of that GRU initialises a second GRU(256) that reads the
    embedded caption prefix; its output goes through Dense(256) and a softmax
    over the vocabulary.

    Reads the notebook-level `max_length`, `vocab_size` and `embedding_dim`.
    `embedding_matrix` initialises the Embedding layer, which stays trainable.
    """
    # Image branch, shaped as a sequence of length 1.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)
    fe2_reshaped = Reshape((1, embedding_dim), input_shape=(embedding_dim,))(fe2)

    # Text branch. Keep a handle on the Embedding layer so its weights can be
    # set directly instead of guessing its position in model.layers.
    inputs2 = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
    se1 = embedding_layer(inputs2)
    se2 = Dropout(0.5)(se1)

    # First GRU consumes the image step; only its final state is used.
    _, state3 = GRU(256, return_state=True)(fe2_reshaped)
    # Second GRU reads the words, starting from the image-conditioned state.
    se4, _ = GRU(256, return_state=True)(se2, initial_state=state3)

    decoder2 = Dense(256, activation='relu')(se4)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # Initialise the embedding with the pre-computed matrix; keep it trainable.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = True

    return model

# Experiment 1 ("concatenate trial"): instead of injecting the image into an RNN,
# the image vector is reshaped to a one-step sequence and concatenated with the
# embedded caption along axis 1 before the dense layers.
# This runs at cell execution time and needs max_length, vocab_size,
# embedding_dim and embedding_matrix to be defined already.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(200, activation='relu')(fe1)
fe2 = Reshape((1, embedding_dim), input_shape=(embedding_dim,))(fe2)   
x = Model(inputs=inputs1, outputs=fe2)

inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)        
y = Model(inputs=inputs2, outputs=se2)

combined = Concatenate(axis = 1)([x.output, y.output])
decoder2 = Dense(256, activation='relu')(combined)    
outputs = Dense(vocab_size, activation='softmax')(decoder2)    

model = Model(inputs=[inputs1, inputs2], outputs=outputs)    

# NOTE(review): assumes the Embedding layer sits at index 4 of model.layers - confirm with model.summary().
model.layers[4].set_weights([embedding_matrix])
model.layers[4].trainable = True


# Experiment 2: a small, unrelated two-input network (32-d and 128-d inputs,
# concatenated, single linear output). It rebinds x, y, combined and model,
# so the model built by experiment 1 is no longer reachable afterwards.
inputA = Input(shape=(32,))
inputB = Input(shape=(128,))

x = Dense(8, activation="relu")(inputA)
x = Dense(4, activation="relu")(x)
x = Model(inputs=inputA, outputs=x)

y = Dense(64, activation="relu")(inputB)
y = Dense(32, activation="relu")(y)
y = Dense(4, activation="relu")(y)
y = Model(inputs=inputB, outputs=y)

combined = concatenate([x.output, y.output])

z = Dense(2, activation="relu")(combined)
z = Dense(1, activation="linear")(z)

model = Model(inputs=[x.input, y.input], outputs=z)

In [None]:
def create_par_inject_model(embedding_matrix):
    """Build the "par-inject" captioning model.

    The image branch (2048-d input -> Dropout -> Dense(embedding_dim)) is
    multiplied element-wise with the embedded caption prefix; the product is
    read by an LSTM(256) followed by Dense(256) and a softmax over the
    vocabulary.

    Unlike the other builders, this one uses its own local `max_length`,
    `vocab_size` and `embedding_dim` rather than the notebook-level values.
    `embedding_matrix` initialises the Embedding layer, which stays trainable.
    """
    max_length = 17
    vocab_size = 1004
    embedding_dim = 200

    # Image branch: projected to the embedding size so it can be multiplied
    # with the word embeddings.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)

    # Text branch. Keep a handle on the Embedding layer so its weights can be
    # set directly instead of guessing its position in model.layers.
    inputs2 = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
    se1 = embedding_layer(inputs2)
    se2 = Dropout(0.5)(se1)

    # Combine the image vector with the embedded words.
    de = Multiply()([fe2, se2])

    se3 = LSTM(256)(de)
    decoder2 = Dense(256, activation='relu')(se3)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # Initialise the embedding with the pre-computed matrix; keep it trainable.
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = True

    return model

In [None]:
# Build the embedding matrix for the vocabulary.
# NOTE(review): create_embedding(wordtoix) is also called in an earlier cell, so a
# top-to-bottom run parses the GloVe file twice.
print("Creating embedding matrix...")
embedding_matrix = create_embedding(wordtoix)
print('Embedding matrix is ready!')

In [None]:
# Hyper-parameters used by the model builders and the batch generator.
max_length = 17
vocab_size = 1004
embedding_dim = 200

print("Creating par-inject model...")
model = create_par_inject_model(embedding_matrix)

# Alternative architectures; every builder takes the embedding matrix.
#print("Creating pre-inject model...")
#model = create_pre_inject_model(embedding_matrix)

#print("Creating init-inject model...")
#model = create_init_inject_model(embedding_matrix)

#print("Creating merge model...")
#model = create_merge_model(embedding_matrix)
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')
print("Model compiled...")

# lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate=1e-2,
#     decay_steps=10000,
#     decay_rate=0.9)
# opt = tf.keras.optimizers.RMSprop(learning_rate=lr_schedule, clipvalue=5)
# model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["mae", "acc"])

epochs = 1
num_photos_per_batch = 8

# Work on the first `data_length` photos only.
data_length = 1000
feature_dataset = feature_dataset.take(data_length)

val_length = round(data_length * 0.15)
train_length = data_length - val_length

print(train_length)

train_step = train_length//num_photos_per_batch
val_step = val_length//num_photos_per_batch

# Disjoint split: the first `val_length` photos validate, the rest train.
# (Training on the whole `feature_dataset` would include the validation photos
# and make the validation loss meaningless.)
val_dataset = feature_dataset.take(val_length)
train_dataset = feature_dataset.skip(val_length)

print("Starting training...")

for i in range(epochs):

    # Fresh generators every epoch so each epoch starts at the beginning of its split.
    train_generator = data_generator(train_dataset, max_length, num_photos_per_batch, vocab_size)
    val_generator = data_generator(val_dataset, max_length, num_photos_per_batch, vocab_size)

    # NOTE(review): fit_generator is deprecated in recent TensorFlow/Keras
    # releases in favour of model.fit; kept because this notebook targets the
    # older Keras API (see the keras.layers.merge import).
    history=model.fit_generator(train_generator,
                                steps_per_epoch = train_step,
                                epochs = 1,
                                validation_data = val_generator,
                                validation_steps = val_step,
                                shuffle=False, verbose=1)

In [None]:
def print_formatted(ls):
    """Print the words of `ls` separated by spaces, without special tokens.

    The tokens "x_NULL_", "x_START_" and "x_END_" are skipped. Every printed
    word is followed by one space.
    """
    # Membership test by value. The previous `word is not ("x_NULL_" or ...)`
    # reduced to an identity check against the single literal "x_NULL_", so
    # special tokens were not reliably filtered out.
    special_tokens = ("x_NULL_", "x_START_", "x_END_")

    caption = ""
    for word in ls:
        if word not in special_tokens:
            caption += word + " "
    print(caption)

# Smoke test of the nlg-eval metrics on a toy example (two references, one hypothesis).
# NOTE(review): the only visible `pip install` of nlg-eval is in the last cell of the
# notebook, so on a fresh kernel run top-to-bottom this import executes before the install.
from nlgeval import compute_individual_metrics
references = ["i am running fast", "cow is running"]
hypothesis = "i am running" 
metrics_dict = compute_individual_metrics(references, hypothesis)

In [None]:
def predict_caption(model, feature_model, img_path, max_length=17, start_token=1):
    """Greedily decode a caption for one image and display the image.

    Parameters
    ----------
    model
        Captioning model; called as ``model.predict([feature, sequence])``.
    feature_model
        Callable that maps the pre-processed image batch to its feature tensor.
    img_path : str
        Path of a JPEG image.
    max_length : int, optional
        Length of the word-index sequence fed to `model` (default 17, the
        value that used to be hard-coded here).
    start_token : int, optional
        Word index written to the first position (default 1).
        NOTE(review): presumably the index of 'x_START_' - confirm.

    Returns
    -------
    numpy.ndarray
        1-D array of `max_length` word indices, starting with `start_token`.
    """
    # Same pre-processing as used for feature extraction: decode, resize to
    # 299x299, InceptionV3 input scaling, then add the batch axis.
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    img = np.expand_dims(img, axis=0)
    img = feature_model(img)
    img = img.numpy().reshape(1, -1)
    print(img.shape)

    # Sequence starts as [start_token, 0, 0, ...] and is filled left to right.
    seq = np.zeros((1, max_length), dtype=int)
    seq[:, 0] = start_token

    for i in range(max_length - 1):
        # Greedy decoding: take the most probable next word at every step.
        pred = model.predict([img, seq])
        seq[:, i+1] = np.argmax(pred)

    seq = seq.reshape(-1)

    # OpenCV loads BGR; convert to RGB for matplotlib.
    im = cv2.imread(img_path)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    plt.imshow(im)
    plt.show()
    return seq

from nltk.translate.bleu_score import sentence_bleu

image_name = "../input/images-for-train/train_images/10004450986_d3c8220117_z.jpg"

# Feature extractor: InceptionV3 cut off before its last layer.
inception_model = tf.keras.applications.InceptionV3(weights='imagenet')
inception = tf.keras.Model(inception_model.input, inception_model.layers[-2].output)

seq = predict_caption(model, inception, image_name)

# Map word indices back to words; the reference captions are looked up by the
# file name derived from `image_name` instead of repeating it as a literal.
pred_seq = words[seq]
actual = words[caps_dict[os.path.basename(image_name)]]

# BLEU is computed on the real words only. Leaving the start/end/padding
# tokens in would count them as matching n-grams between the prediction and
# the references.
special_tokens = ("x_NULL_", "x_START_", "x_END_")
hypothesis_tokens = [w for w in pred_seq if w not in special_tokens]
reference_tokens = [[w for w in ref if w not in special_tokens] for ref in actual]

score = sentence_bleu(reference_tokens, hypothesis_tokens)
print(score)
print_formatted(pred_seq)
print(actual)



In [None]:
# Toy BLEU check: one 4-token prediction against three 3-token references.
prediction = ['pos','lkl','lkd','aaa']
references = [['aaa','aaa','aaa'],['ooo', 'ooo', 'ooo'],['ooo', 'ooo', 'ooo']]

from nltk.translate.bleu_score import sentence_bleu
score = sentence_bleu(references, prediction)
print(score)

# Install nlg-eval from the master branch of its GitHub repository and run its setup step.
# NOTE(review): an earlier cell already imports `nlgeval`; on a fresh kernel these two
# commands have to run before that cell.
!pip install git+https://github.com/Maluuba/nlg-eval.git@master
!nlg-eval --setup