In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
import numpy as np
import PIL.Image

from os import listdir

from numpy import array
from numpy import argmax
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16 ,preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from pickle import dump, load

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout

from nltk.translate.bleu_score import corpus_bleu

In [None]:
# Flickr30k captions table, '|'-separated.
CSV_PATH = "../input/flickr-image-dataset/flickr30k_images/results.csv"
try:
    # pandas >= 1.3 spells "skip malformed rows, but warn" as on_bad_lines="warn";
    # the old `error_bad_lines` flag was removed in pandas 2.0.
    results = pd.read_csv(CSV_PATH, sep="|", on_bad_lines="warn")
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy flag.
    results = pd.read_csv(CSV_PATH, sep="|", error_bad_lines=False)

In [None]:
# Trim surrounding whitespace from every column name so columns can be
# addressed by their bare names.
results.columns = [column_name.strip() for column_name in results.columns]

In [None]:
# Write the image-file-name column alone (one name per row, no header, no
# index); './flickr30kimages.txt' is read back by load_photo_identifiers.
df = results[['image_name']]
df.to_csv('flickr30kimages.txt', index=False, header=None, sep=' ')

In [None]:
# Pair every caption with its image id: the file name minus its last four
# characters (presumably the ".jpg" suffix — confirm for other datasets).
df = results[['image_name', 'comment']].copy()
df['image_name'] = df['image_name'].str.slice(stop=-4)
df.head()

In [None]:
# Persist the (image id, caption) rows space-separated, without header or
# index; this file is read back by load_clean_descriptions.
df.to_csv('flickr30kdescribtion.txt', index=False, header=None, sep=' ')

# File loader

In [None]:
def load_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Every line of the file as produced by ``readlines`` (line
        terminators are kept).
    """
    # The context manager closes the handle even when reading raises.
    with open(filename, 'r') as file:
        return file.readlines()

# Description preprocessing

In [None]:
def load_clean_descriptions(filename, photos):
    """Load the captions of the selected images, wrapped in start/end markers.

    Each line of ``filename`` is split on whitespace: the first token is the
    image id and the remaining tokens are the caption.

    Args:
        filename: Path of the caption file, one "<image id> <caption>" per line.
        photos: Container of image ids to keep (membership is tested with ``in``).

    Returns:
        dict: Maps each kept image id to a list of strings of the form
        ``'startseq <caption tokens> endseq'``.
    """
    descriptions = dict()
    for line in load_file(filename):
        words = line.split()
        if not words:
            # Blank or whitespace-only line: there is no image id to read, and
            # indexing words[0] would raise IndexError.
            continue

        image_id, image_description = words[0], words[1:]
        if image_id not in photos:
            continue

        # The first and last caption tokens are dropped on purpose — presumably
        # the quote tokens left by the quoting applied when the file was written
        # with sep=' ' (TODO confirm against the generated file).
        desc = 'startseq ' + ' '.join(image_description[1:-1]) + ' endseq'
        descriptions.setdefault(image_id, []).append(desc)
    return descriptions


# Image name handler

In [None]:
def load_photo_identifiers(filename):
    """Collect the unique image ids listed in a file of image file names.

    Args:
        filename: Path of a text file with one image file name per line.

    Returns:
        set[str]: For every non-blank line, the text before its first ``.``.
    """
    photos = set()
    for line in load_file(filename):
        # Lines from readlines() keep their newline, so a length check never
        # detects a blank line; strip first, then skip what is left empty.
        line = line.strip()
        if not line:
            continue
        photos.add(line.split('.')[0])
    return photos

# Prepare Tokenizer

In [None]:
def to_lines(descriptions):
    """Flatten a mapping of image id -> captions into one list of captions.

    Args:
        descriptions: dict whose values are lists of caption strings.

    Returns:
        list: All captions, in the mapping's iteration order.
    """
    # Build the list directly rather than using a comprehension for its
    # side effects.
    return [caption for captions in descriptions.values() for caption in captions]
def create_tokenizer(descriptions):
    """Fit a ``Tokenizer`` on every caption contained in ``descriptions``.

    Args:
        descriptions: dict mapping image ids to lists of caption strings.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))
    return caption_tokenizer

In [None]:
filename = './flickr30kimages.txt'
train = load_photo_identifiers(filename)  # set of training image ids
print('Dataset: ', len(train))
train_descriptions = load_clean_descriptions('./flickr30kdescribtion.txt', train)
print('Descriptions: train=', len(train_descriptions))
tokenizer = create_tokenizer(train_descriptions)
# Write through a context manager so the pickle file is flushed and closed
# instead of being left to the garbage collector.
with open('tokenizer30k.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Model Preparation

In [None]:
def extract_features_resnet(directory):
    """Encode every image file in ``directory`` with a truncated VGG16 network.

    Despite the ``resnet`` in its name, this function builds ``VGG16`` and
    reads the output of its second-to-last layer.
    NOTE(review): the decoder in this notebook declares a 2048-wide image
    input by default — verify that the feature width produced here matches it.

    Args:
        directory: Folder holding the image files. Sub-directories and files
            without a ``.jpg``/``.jpeg``/``.png`` extension (for example the
            nested ``flickr30k_images`` folder and ``results.csv``) are
            ignored.

    Returns:
        dict: Maps the image id (file name up to its first ``.``) to the
        array returned by ``model.predict`` for that single image.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

    features = dict()
    for name in listdir(directory):
        filename = os.path.join(directory, name)
        # Filter by type and extension instead of removing hard-coded names
        # from the listing, which raised ValueError when one was absent.
        if not os.path.isfile(filename) or not name.lower().endswith(image_extensions):
            continue

        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        # Add the leading batch axis expected by the network.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # preprocess_input is the complete VGG16 input normalisation. The
        # former extra division by 255 is dropped so that features match
        # encoder_resnet, which does not rescale.
        image = preprocess_input(image)
        feature = model.predict(image, verbose=2)

        image_id = name.split('.')[0]
        features[image_id] = feature
    return features
features = extract_features_resnet("../input/flickr-image-dataset/flickr30k_images/flickr30k_images/")

print('Extracted Features: ', len(features))

# Write through a context manager so the pickle file is flushed and closed
# instead of being left to the garbage collector.
with open('features30k.pkl', 'wb') as features_file:
    dump(features, features_file)

# Decoder Layers

In [None]:
# Save the full model at the end of every epoch.
# The target must be a file: the former './' pointed at the working directory
# itself, which recent Keras releases reject (the path has to end in `.keras`
# or `.h5`). The same file is overwritten on each save.
# NOTE(review): `monitor`/`mode` presumably only matter with
# save_best_only=True, which is not set here — confirm the intent.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='decoder_resnet_30k.h5',
    monitor='accuracy',
    mode='auto',
    save_freq="epoch")
def define_model_resnet(vocab_size, max_length, feature_dim=2048):
    """Build and compile the caption decoder (image features + partial caption -> next word).

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the (padded) input word-id sequences.
        feature_dim: Width of the image feature vector. Defaults to 2048, the
            value that used to be hard-coded; pass the actual width when the
            features come from a different encoder.

    Returns:
        A compiled two-input ``Model`` (inputs: image features, word-id
        sequence) using categorical cross-entropy, Adam and an accuracy metric.
    """
    # Image branch: dropout, then projection to 256 units.
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text branch: embedding (id 0 is masked), dropout, LSTM summary of the prefix.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder: element-wise sum of both 256-d branches, then a softmax over the vocabulary.
    decoder1 = tf.keras.layers.add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Generators

In [None]:
from itertools import islice
def data_generator(descriptions, photos, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Args:
        descriptions: dict mapping image ids to their caption lists.
        photos: dict mapping image ids to feature arrays; element ``[0]`` of
            each entry is used as the image vector.
        tokenizer: Tokenizer forwarded to ``create_sequences``.
        max_length: Padded sequence length forwarded to ``create_sequences``.

    Yields:
        tuple: ``([image_batch, sequence_batch], next_word_batch)`` as built by
        ``create_sequences`` for a single image; after the last image the
        iteration restarts from the first one.
    """
    while True:
        for image_id in descriptions:
            captions = descriptions[image_id]
            image_vector = photos[image_id][0]
            image_batch, sequence_batch, next_word_batch = create_sequences(
                tokenizer, max_length, captions, image_vector)
            yield [image_batch, sequence_batch], next_word_batch

def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size=None):
    """Expand the captions of one image into (image, prefix, next word) samples.

    Every caption is encoded to word ids; for each position ``i >= 1`` one
    sample is emitted whose input is the first ``i`` ids (padded to
    ``max_length``) and whose target is the one-hot encoding of id ``i``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the input prefixes are padded to.
        desc_list: Captions of a single image.
        photo: Feature vector of that image, repeated for every sample.
        vocab_size: Number of classes of the one-hot targets. When omitted it
            is derived as ``len(tokenizer.word_index) + 1`` so the function no
            longer depends on a module-level ``vocab_size`` variable.

    Returns:
        tuple: ``(X1, X2, y)`` arrays of image vectors, padded prefixes and
        one-hot next-word targets.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            # Prefix seq[:i] predicts word seq[i].
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

In [None]:
def load_photo_features(filename, photos):
    """Load pickled image features and keep only the requested images.

    Args:
        filename: Path of a pickle file holding a dict of image id -> features.
        photos: Iterable of image ids to keep.

    Returns:
        dict: ``{image_id: features}`` for every id in ``photos``.

    Raises:
        KeyError: If an id in ``photos`` is missing from the pickled dict.
    """
    # SECURITY: unpickling executes arbitrary code; only load files produced
    # by this notebook or another trusted source.
    # The context manager closes the file handle once loading is done.
    with open(filename, 'rb') as feature_file:
        all_features = load(feature_file)

    return {k: all_features[k] for k in photos}

In [None]:
# NOTE(review): this reads 'features30k_resnet.pkl', whereas the extraction
# cell of this notebook writes 'features30k.pkl' — confirm that the intended
# feature file exists when running from a fresh kernel.
train_features = load_photo_features('features30k_resnet.pkl', train)

# One extra slot on top of the tokenizer's words: the decoder's Embedding uses
# mask_zero=True, so id 0 is the padding id (word ids presumably start at 1 —
# confirm).
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption, counted in whitespace-separated tokens.
lines = to_lines(train_descriptions)
max_length = max(len(tokens) for tokens in map(str.split, lines))

In [None]:
model = define_model_resnet(vocab_size, max_length)
epochs = 20
steps = len(train_descriptions)  # the generator yields one batch per image
generator = data_generator(train_descriptions, train_features, tokenizer, max_length)

# Train one epoch per fit() call; the generator is endless, so every call
# consumes exactly `steps` batches from where the previous call stopped.
for i in range(epochs):
    model.fit(
        generator,
        steps_per_epoch=steps,
        epochs=1,
        callbacks=[model_checkpoint_callback],
        verbose=1,
    )
    #model.save('decoder_resnet_30k_' + str(i) + '.h5')

# Predictions

In [None]:
def encoder_resnet(filename):
    """Encode a single image file with a truncated, ImageNet-weighted VGG16.

    Despite the ``resnet`` in its name, the network built here is ``VGG16``;
    the output of its second-to-last layer is returned. A new network is
    built on every call.

    Args:
        filename: Path of the image to encode; it is loaded at 224x224.

    Returns:
        The array returned by ``predict`` for the one-image batch.
    """
    vgg = VGG16(weights="imagenet")
    extractor = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)

    pixels = img_to_array(load_img(filename, target_size=(224, 224)))
    # Prepend the batch axis, then apply the VGG16 input normalisation.
    batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
    return extractor.predict(batch, verbose=0)

In [None]:
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for one image.

    Starting from ``'startseq'``, the current text is encoded and padded, the
    model predicts the next word, and the highest-scoring word id is appended.
    Generation stops after ``max_length`` words, when the predicted id has no
    word in the tokenizer, or once ``'endseq'`` has been appended.

    Args:
        model: Trained decoder exposing ``predict([photo, sequence])``.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        photo: Image features passed to the model as its first input.
        max_length: Padded sequence length and maximum number of generated words.

    Returns:
        str: The generated text, including the leading ``'startseq'`` and, if
        it was produced, the trailing ``'endseq'``.
    """
    # Reverse vocabulary built once from the tokenizer. The original code
    # called a `word_for_id` helper that is not defined in this notebook and
    # therefore failed with NameError.
    id_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        # Greedy choice: id of the most probable word.
        word = id_to_word.get(int(argmax(yhat)))
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

In [None]:
# model = load_model('decoder_vgg.h5')
# path = './WIN_20210109_20_48_40_Pro.jpg'
# photo = encoder_resnet(path)
# photo.shape
# description = generate_desc(model, tokenizer, photo, max_length)
# print(description)

# Evaluations

In [None]:
# def evaluate_model(model, descriptions, photos, tokenizer, max_length):
#     actual, predicted = list(), list()
#     for key, desc_list in descriptions.items():
#         prediction = generate_desc(model, tokenizer, photos[key], max_length)
#         actual_desc = [d.split() for d in desc_list]
#         actual.append(actual_desc)
#         predicted.append(prediction.split())

#     print('BLEU-1: ', corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
#     print('BLEU-2: ', corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
#     print('BLEU-3: ', corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
#     print('BLEU-4: ', corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# filename = 'decoder_vgg.h5'
# model = load_model(filename)
# evaluate_model(model, train_descriptions, train_features, tokenizer, max_length)