In [1]:
from cnnattn_model import *
import tensorflow as tf
import json

image_folder_dir = "../input/foodrecipe/"
tf.__version__

'2.4.1'

In [2]:
image_path = json.load(open("../input/foodrecipe/image_path.json", "r"))
recipe_string = json.load(open("../input/foodrecipe/data_strings_local.json", "r"))

# Preprocess the data

In [3]:
import re

recipe_joined = {}

for recipe, steps in recipe_string.items():
    steps_joined = "<START> "
    for i in range(len(steps)):
        step_i = steps[i].strip()
        if step_i[-1] == ".": 
            step_i = step_i[:-1] + " <SEP> "
        steps_joined += step_i
    
        
    recipe_joined[recipe] = steps_joined[:-7] + " <END>"

In [4]:
text = []
images = []
menu_names =[]
not_having_image = []

for menu, steps in recipe_joined.items():
    
    try:
        paths = image_path[menu]
    except KeyError as e:
        not_having_image.append(menu)
        continue
    
    for path in paths:
        menu_names.append(menu)
        text.append(steps)
        images.append(image_folder_dir+path)

In [5]:
assert len(images) == len(text)
not_having_image

['Eggless_Strawberry_Cupcakes_with_Pink_Strawberry_Buttercream_Frosting',
 'Homemade_Lychee_Kulfi',
 'Stenciled_Christmas_Sugar_Cookies_Video_Tutorial',
 'Chicken_and_Broccoli_Zucchini_Noodle_Stir_Fry']

In [6]:
print(menu_names[0])
print(text[0])

Cream_Cheese_Strawberry_Braided_Pastry
<START> Preheat the oven at 200 C/ 390 F <SEP> Cream cheese spread - combine all ingredients in a bowl - set aside <SEP> Roll puff pastry to a rectangle - 12 x 6-inches wide <SEP> Transfer to a parchment paper so it's easier to move to the baking tray <SEP> Mark the pastry at 1 ½" from both edges lengthways (see video) <SEP> Cut off two corners (triangles) from the top and bottom - see video <SEP> Spread a generous amount of cream cheese down the center (I prefer to use a piping bag) <SEP> Then, spread a generous amount of strawberry jam over the cream cheese mixture <SEP> Using a sharp knife or pizza wheel, cut slanting strips along both sides of the pastry <SEP> Fold both ends at the top and bottom over the filling (this will prevent the filling coming out when baking) <SEP> Braid the pastry by crossing the strips over the filling overlapping each other (see video) <SEP> When you reach the end, you can overlap any excess strips or cut them off <

# 1. Tokenization & Pad sequences
`Vocab size = 1000`

In [5]:
vocab_size = 300
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, 
                                                  filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                                                  oov_token="<unk>")
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'
tokenizer.word_index['<sep>'] = 1
tokenizer.index_word[1] = '<sep>'
tokenizer.fit_on_texts(text)

In [6]:
text_index = tokenizer.texts_to_sequences(text)
text_index = tf.keras.preprocessing.sequence.pad_sequences(text_index, maxlen=vocab_size, padding='post', value=0.0)

In [7]:
text_index

array([[ 56, 117,   2, ...,   0,   0,   0],
       [ 56, 117,   2, ...,   0,   0,   0],
       [ 56, 117,   2, ...,   0,   0,   0],
       ...,
       [  4, 210,   5, ...,   1,   7,  55],
       [  4, 210,   5, ...,   1,   7,  55],
       [  4, 210,   5, ...,   1,   7,  55]], dtype=int32)

# 2. Create a tf.data dataset for training

Let's first preprocess the images and save them back to disk.

In [8]:
import numpy as np

def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (128, 128))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [9]:
# Feel free to change batch_size according to your system configuration
image_dataset = tf.data.Dataset.from_tensor_slices(images)
image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.AUTOTUNE).batch(16)

for img, path in image_dataset:
    batch_features = image_features_extract_model(img)
    batch_features = tf.reshape(batch_features,
                              (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        p = p.numpy().decode("utf-8")
        p = p.replace("../input/foodrecipe/images/","")
        p = p.replace(".jpg","")
        np.save(p, bf.numpy())

In [10]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 128
units = 256

# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [11]:
images_path = []

for p in images:
    p = p.replace("../input/foodrecipe/images/","")
    p = p.replace(".jpg","")
    images_path.append(p)
images_path

['Cream_Cheese_Strawberry_Braided_Pastry_0',
 'Cream_Cheese_Strawberry_Braided_Pastry_1',
 'Cream_Cheese_Strawberry_Braided_Pastry_2',
 'The_BEST_Thumbprint_Cookies_Eggless_Jam_Cookies_Recipe_0',
 'The_BEST_Thumbprint_Cookies_Eggless_Jam_Cookies_Recipe_1',
 'Bean_Paste_Recipe_for_Sugar_Flowers_or_Korean_Bean_Paste_0',
 'Bean_Paste_Recipe_for_Sugar_Flowers_or_Korean_Bean_Paste_1',
 'Bean_Paste_Recipe_for_Sugar_Flowers_or_Korean_Bean_Paste_2',
 'Bean_Paste_Recipe_for_Sugar_Flowers_or_Korean_Bean_Paste_3',
 'Bean_Paste_Recipe_for_Sugar_Flowers_or_Korean_Bean_Paste_4',
 'Slow_Cooker_Shredded_Beef_0',
 'Slow_Cooker_Shredded_Beef_1',
 'Slow_Cooker_Shredded_Beef_2',
 'Slow_Cooker_Shredded_Beef_3',
 'Ground_Beef_Soup_0',
 'Moist_Strawberry_Chocolate_Brownie_Recipe_0',
 'Cherry_Cheesecake___Baked_0',
 'Cherry_Cheesecake___Baked_1',
 'Cherry_Cheesecake___Baked_2',
 'Cherry_Cheesecake___Baked_3',
 'Crock_Pot_or_Slow_Cooker_Lamb_Recipe_0',
 'Crock_Pot_or_Slow_Cooker_Lamb_Recipe_1',
 'Crock_Pot_or_

In [12]:
# Load the numpy files
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8')+'.npy')
    return img_tensor, cap

dataset = tf.data.Dataset.from_tensor_slices((images_path, text_index))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int32]),
          num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [13]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        attention_hidden_layer = (tf.nn.tanh(self.W1(features) +
                                             self.W2(hidden_with_time_axis)))

        score = self.V(attention_hidden_layer)
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class CNN_Encoder(tf.keras.Model):
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [14]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [15]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [52]:
tf.saved_model.save(encoder, './model/encoder')

In [54]:
tf.saved_model.save(decoder, './model/decoder')

TypeError: call() missing 2 required positional arguments: 'features' and 'hidden'

In [17]:
@tf.function
def train_step(img_tensor, target):
    loss = 0

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=target.shape[0])

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)
    
    print('finish Forward pass')
    total_loss = (loss / int(target.shape[1]))

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))
    print('finish batch')
    return loss, total_loss

In [29]:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [28]:
import time

EPOCHS = 50
num_steps = 2880/BATCH_SIZE
for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        print(f'batch: {batch}')
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 10 == 0:
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')

    if epoch % 5 == 0:
        ckpt_manager.save()
    print(f'------> Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

batch: 0
Epoch 1 Batch 0 Loss 1.0016
batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
Epoch 1 Batch 10 Loss 0.9842
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
Epoch 1 Batch 20 Loss 1.0472
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
Epoch 1 Batch 30 Loss 0.8520
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
Epoch 1 Batch 40 Loss 0.9534
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
------> Epoch 1 Loss 1.005439
Time taken for 1 epoch 22.32 sec

batch: 0
Epoch 2 Batch 0 Loss 0.9882
batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
Epoch 2 Batch 10 Loss 0.9460
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
Epoch 2 Batch 20 Loss 1.0146
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch:

In [49]:
def evaluate(image):
    attention_plot = np.zeros((max_length, 4))

    hidden = decoder.reset_state(batch_size=1)

    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0],
                                                 -1,
                                                 img_tensor_val.shape[3]))

    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input,
                                                         features,
                                                         hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

def plot_attention(image, result, attention_plot):
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    for i in range(len_result):
        temp_att = np.resize(attention_plot[i], (8, 8))
        grid_size = max(np.ceil(len_result/2), 2)
        ax = fig.add_subplot(grid_size, grid_size, i+1)
        ax.set_title(result[i])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.show()

In [55]:
# captions on the validation set
from PIL import Image
import matplotlib.pyplot as plt

max_length = 500
rid = np.random.randint(0, len(images))
image = images[rid]
real_caption = ' '.join([tokenizer.index_word[i]
                        for i in text_index[rid] if i not in [0]])
result, attention_plot = evaluate(image)

print('Real Caption:', real_caption)
print('\nPrediction Caption:', ' '.join(result))

Real Caption: if necessary so the layers <unk> <unk> <sep> spread more chocolate <unk> around the cake and spread with a spatula <sep> use a <unk> <unk> to smooth the sides of the cake and off set spatula for the top <sep> place in the fridge for at least 2 to 4 hours so the <unk> is set <sep> place the cake on a <unk> rack with a large <unk> or clean baking tray to <unk> the <unk> <sep> make sure your <unk> is <unk> consistency if not warm it in the microwave for 10 to 20 <unk> <sep> tip – do not <unk> the <unk> you want it to be <unk> consistency but not hot or it will melt the frosting on the cake pour the <unk> on the cake <unk> <unk> in the <unk> and then to the sides making sure you pour <unk> <unk> to cover the <unk> cake <sep> if you have any <unk> <unk> use a spatula to <unk> smooth the sides <sep> <unk> do not use the spatula too much as you will leave <unk> on the cake <sep> the <unk> you <unk> on this at this <unk> the <unk> the <unk> <sep> place the cake in the fridge and 

In [51]:
del real_caption, result, attention_plot, image

In [12]:
import numpy as np

saved_images = np.load("../input/saved-images/saved_images (1).npy")
dataset = tf.data.Dataset.from_tensor_slices({"image":saved_images, "text":text_index}).batch(128)

In [13]:
for data in dataset.take(1):
    print(data['text'].shape)
    print(data['image'].shape)

(128, 1000)
(128, 128, 128, 3)


In [14]:
model = CNNattn(
        vocab_size = vocab_size,
        emb_dim = 64,
        gru_units = 16,
        w_units = 64,
        head_denses_units = [100],
        w = 64,
        h = 64,
        channel = 3,
        n_cov = 2,
        filters_list =[16, 16],
        kernel_sizes = [(8,8), (8,8)],
        n_pool = 2
    )

Try model forward pass

In [19]:
for data in dataset.take(1):
    print('image.shape',data['image'].shape)
    print('text.shape',data['text'].shape)

    image = data['image']
    text  = data['text']

    o = model(image, text[:,1], verbose=1)
    break

image.shape (128, 128, 128, 3)
text.shape (128, 1000)
input image: (128, 128, 128, 3)
conv_0 output: (128, 121, 121, 16)
pool_0 output: (128, 60, 60, 16)
conv_1 output: (128, 53, 53, 16)
pool_1 output: (128, 26, 26, 16)
final output: (128, 676, 16)
embed output: (128, 1, 64)
GRU output: (128, 1, 16)
attention output: (128, 1, 64)
head_denses output: (128, 1, 100)
output: (128, 1, 1000)
output: (128, 1000)


# 3. Train model

In [16]:
model.compile(optimizer='adam',loss='categorical_crossentropy')

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [17]:
@tf.function
def train_step(image, text, verbose):
    loss = 0
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * text.shape[0], 1)

    with tf.GradientTape() as tape:
        print(text.shape[1])
        for i in range(1, text.shape[1]):
            
            print(".", end='')
            # passing the features through the decoder
            predictions = model(image, dec_input, verbose)

            loss += loss_function(text[:, i], predictions)

            # using teacher forcing
            dec_input = text[:, i]

    total_loss = (loss / int(text.shape[1]))

    trainable_variables = model.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [None]:
import time

EPOCHS = 5

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, dataset) in enumerate(dataset):
        print(type(dataset))
        image = dataset['image']
        text  = dataset['text']
        batch_loss, t_loss = train_step(image, text, verbose=0)
        total_loss += t_loss

        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy()/int(text.shape[1])
            print(f'\nEpoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')


    print(f'\nLoss: {total_loss}')

    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

<class 'dict'>
1000
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [23]:
np.sum([np.prod(v.get_shape().as_list()) for v in model.trainable_variables])

197101.0

In [30]:
for i in model.trainable_variables:
    print(i.get_shape().as_list())

[8, 8, 3, 16]
[16]
[8, 8, 16, 16]
[16]
[1000, 64]
[64, 48]
[16, 48]
[2, 48]
[16, 64]
[64]
[16, 64]
[64]
[]
[64, 100]
[100]
[100, 1000]
[1000]


In [27]:
np.prod((8, 8, 3, 16))

3072

3072

In [33]:
i=0
for data in dataset:
    print(data['text'])
    i+=1
    print("="*30)
    if i==5:break

tf.Tensor(
[[ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 ...
 [ 56 451   1 ...   0   0   0]
 [ 56 169 532 ...   0   0   0]
 [ 56 117  46 ...   0   0   0]], shape=(16, 1000), dtype=int32)
tf.Tensor(
[[ 56 929  41 ...   0   0   0]
 [ 56 929  41 ...   0   0   0]
 [ 56 929  41 ...   0   0   0]
 ...
 [ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]], shape=(16, 1000), dtype=int32)
tf.Tensor(
[[ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 ...
 [ 56 107   2 ...   0   0   0]
 [ 56 107   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]], shape=(16, 1000), dtype=int32)
tf.Tensor(
[[ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 [ 56 117   2 ...   0   0   0]
 ...
 [ 56 117  46 ...   0   0   0]
 [ 56 242   7 ...   0   0   0]
 [ 56 242   7 ...   0   0   0]], shape=(16, 1000), dtype=int32)
tf.Tensor(
[[ 56 242   7 ...   0   0   0]
 [ 56 117   2 ... 

In [22]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1
