In [1]:
from google.colab import drive
drive.mount("/content/drive")
# cd to a folder in your Drive - in my case is this route
%cd '/content/drive/Othercomputers/Mi portátil/gastroml'

Mounted at /content/drive
/content/drive/Othercomputers/Mi portátil/gastroml


In [2]:
import numpy as np
# data processing, CSV file I / O (e.g. pd.read_csv)
import pandas as pd
import os
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.layers import add
# layers to consider transformers

from keras.utils import to_categorical, plot_model
from keras.applications.inception_v3 import InceptionV3, preprocess_input

from keras.applications.resnet import ResNet50, ResNet101, ResNet152
from keras.applications.resnet import preprocess_input as resnet_preprocess_input

from keras.applications.efficientnet import EfficientNetB7
from keras.applications.efficientnet import preprocess_input as efficientnet_preprocess_input

from keras.applications.convnext import ConvNeXtSmall
from keras.applications.convnext import preprocess_input as convnext_preprocess_input

import matplotlib.pyplot as plt  # for plotting data
import cv2
import string

In [3]:
# beging code taken from https://keras.io/examples/nlp/text_classification_with_transformer/
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [4]:
df = pd.read_csv('Kaggle data/final_data.csv')
df.head()

Unnamed: 0,id,title,ingredients,instructions,image_name,cleaned_ingredients,ingredients_rawmats,partition
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3Â½â€“4-lb.) whole chicken', '2Â¾ tsp. ko...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3Â½â€“4-lb.) whole chicken', '2Â¾ tsp. ko...","['squash', 'oil', 'apples', 'onion', 'bread', ...",train
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400Â°F and line a rimmed bakin...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...","['pepper', 'salt', 'egg', 'potatoes']",train
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...","['onion', 'garlic', 'pepper', 'cheese', 'chedd...",train
3,3,Italian Sausage and Bread Stuffing,"['1 (Â¾- to 1-pound) round Italian loaf, cut i...",Preheat oven to 350Â°F with rack in middle. Ge...,italian-sausage-and-bread-stuffing-240559,"['1 (Â¾- to 1-pound) round Italian loaf, cut i...","['oil', 'garlic', 'turkey', 'butter', 'eggs', ...",train
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...","['cinnamon', 'bourbon', 'apple', 'juice', 'but...",train


In [5]:
def load_description(data):
	mapping = dict()
	for index, row in data.iterrows():
		mapping[row['image_name']] = row['title']
	return mapping

descriptions = load_description(df)
print(descriptions['miso-butter-roast-chicken-acorn-squash-panzanella'])


Miso-Butter Roast Chicken With Acorn Squash Panzanella


In [6]:
def clean_description(desc):
	for img_name, title in desc.items():
		caption = [ch for ch in title if ch not in string.punctuation]
		caption = ''.join(caption)
		caption = caption.split(' ')
		caption = [word.lower() for word in caption if len(word)>1 and word.isalpha()]
		caption = ' '.join(caption)
		desc[img_name] = caption

clean_description(descriptions)
descriptions['miso-butter-roast-chicken-acorn-squash-panzanella']

'misobutter roast chicken with acorn squash panzanella'

In [7]:
def to_vocab(desc):
    words = set()
    for key in desc.keys():
        for line in desc[key]:
            words.update(line.split())
    return words
vocab = to_vocab(descriptions)

In [8]:
import glob
images = 'Kaggle data/images/train/'
# Create a list of all image names in the directory
img = glob.glob(images + '*.jpg')
len(img)

6062

In [9]:
img[0:5]

['Kaggle data/images/train/vodka-spiked-cherry-tomatoes-with-pepper-salt-354490.jpg',
 'Kaggle data/images/train/spicy-adobo-shrimp-cocktail-354493.jpg',
 'Kaggle data/images/train/plum-kuchen-354489.jpg',
 'Kaggle data/images/train/tangy-frozen-greek-yogurt-354476.jpg',
 'Kaggle data/images/train/grilled-herb-potatoes-354496.jpg']

In [10]:
descriptions.items()

dict_items([('miso-butter-roast-chicken-acorn-squash-panzanella', 'misobutter roast chicken with acorn squash panzanella'), ('crispy-salt-and-pepper-potatoes-dan-kluger', 'crispy salt and pepper potatoes'), ('thanksgiving-mac-and-cheese-erick-williams', 'thanksgiving mac and cheese'), ('italian-sausage-and-bread-stuffing-240559', 'italian sausage and bread stuffing'), ('newtons-law-apple-bourbon-cocktail', 'newtons law'), ('warm-comfort-tequila-chamomile-toddy', 'warm comfort'), ('apples-and-oranges-spiked-cider', 'apples and oranges'), ('turmeric-hot-toddy-claire-sprouse', 'turmeric hot toddy'), ('instant-pot-lamb-haleem', 'instant pot lamb haleem'), ('spiced-lentil-and-caramelized-onion-baked-eggs', 'spiced lentil and caramelized onion baked eggs'), ('hot-pimento-cheese-dip-polina-chesnakova', 'hot pimento cheese dip'), ('spiral-ham-in-the-slow-cooker-guarnaschelli', 'spiral ham in the slow cooker'), ('butternut-squash-apple-soup-365210', 'butternut squash and apple soup'), ('caesar-

In [11]:
#train_img = img[0:5]
train_img = img

# load descriptions of training set in a dictionary. Name of the image will act as ey
def load_clean_descriptions(des, dataset):
	dataset_des = dict()
	for img_name, title in des.items():
		if 'Kaggle data/images/train/' + img_name + '.jpg' in dataset:
			dataset_des[img_name] = title
	return dataset_des

train_descriptions = load_clean_descriptions(descriptions, train_img)
print(len(train_descriptions))

6062


In [12]:
from keras.preprocessing.image import load_img, img_to_array
def preprocess_img(img_path,model_type):
	# inception v3 excepts img in 299 * 299 * 3
	if model_type == "inception":
		size = 299
	elif model_type == "resnet":
		size = 224
	img = load_img(img_path, target_size = (size, size))
	x = img_to_array(img)
	# Add one more dimension
	x = np.expand_dims(x, axis = 0)
	x = preprocess_input(x)
	return x

def encode(image):
	image = preprocess_img(image,"resnet")
	vec = img_model.predict(image)
	vec = np.reshape(vec, (vec.shape[1]))
	return vec

#base_model = InceptionV3(weights = 'imagenet')
base_model = ResNet50(weights='imagenet')
img_model = Model(base_model.input, base_model.layers[-2].output)
# run the encode function on all train images and store the feature vectors in a list
encoding_train = {}
counter = 0
for img in train_img:
	print(counter)
	counter += 1
	encoding_train[img[len(images):]] = encode(img)


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747


In [13]:
# list of all training captions
all_train_captions = []
for key, val in train_descriptions.items():
	all_train_captions.append(val)

# consider only words which occur atleast 10 times
vocabulary = vocab
threshold = 1 # you can change this value according to your need
word_counts = {}
for cap in all_train_captions:
	for word in cap.split(' '):
		word_counts[word] = word_counts.get(word, 0) + 1

vocab = [word for word in word_counts if word_counts[word] >= threshold]

# word mapping to integers
ixtoword = {}
wordtoix = {}

ix = 1
for word in vocab:
	wordtoix[word] = ix
	ixtoword[ix] = word
	ix += 1

# find the maximum length of a description in a dataset
max_length = max(len(des.split()) for des in all_train_captions)
max_length


18

In [14]:
train_descriptions

{'miso-butter-roast-chicken-acorn-squash-panzanella': 'misobutter roast chicken with acorn squash panzanella',
 'crispy-salt-and-pepper-potatoes-dan-kluger': 'crispy salt and pepper potatoes',
 'thanksgiving-mac-and-cheese-erick-williams': 'thanksgiving mac and cheese',
 'italian-sausage-and-bread-stuffing-240559': 'italian sausage and bread stuffing',
 'newtons-law-apple-bourbon-cocktail': 'newtons law',
 'warm-comfort-tequila-chamomile-toddy': 'warm comfort',
 'spiced-lentil-and-caramelized-onion-baked-eggs': 'spiced lentil and caramelized onion baked eggs',
 'hot-pimento-cheese-dip-polina-chesnakova': 'hot pimento cheese dip',
 'caesar-salad-roast-chicken': 'caesar salad roast chicken',
 'enfrijoladas': 'enfrijoladas',
 'caramelized-plantain-parfait': 'caramelized plantain parfait',
 'roasted-beets-with-crispy-sunchokes-and-pickled-orange-ginger-puree': 'roasted beets with crispy sunchokes and pickled orangeginger',
 'maple-chile-roasted-pumpkin-with-quinoa-tabouli': 'maple and chil

In [15]:
encoding_train

{'vodka-spiked-cherry-tomatoes-with-pepper-salt-354490.jpg': array([0.        , 0.        , 0.05740191, ..., 1.9707215 , 0.        ,
        0.        ], dtype=float32),
 'spicy-adobo-shrimp-cocktail-354493.jpg': array([0.      , 0.      , 0.      , ..., 2.777634, 0.      , 0.      ],
       dtype=float32),
 'plum-kuchen-354489.jpg': array([0.        , 0.        , 0.21714541, ..., 2.6937757 , 0.01037704,
        0.        ], dtype=float32),
 'tangy-frozen-greek-yogurt-354476.jpg': array([0.0000000e+00, 0.0000000e+00, 8.0982520e-04, ..., 1.8832017e+00,
        0.0000000e+00, 0.0000000e+00], dtype=float32),
 'grilled-herb-potatoes-354496.jpg': array([0.        , 0.        , 0.01966239, ..., 2.4299996 , 0.        ,
        0.        ], dtype=float32),
 'lemon-ice-cream-sandwiches-with-blueberry-swirl-354515.jpg': array([0.     , 0.     , 0.     , ..., 3.74556, 0.     , 0.     ],
       dtype=float32),
 'roasted-tomato-soup-with-parmesan-wafers-354491.jpg': array([0.0000000e+00, 0.0000000e

In [32]:
X1, X2, y = list(), list(), list()
vocab_size = len(vocab) + 1
for img_name, title in train_descriptions.items():
    pic = encoding_train[img_name + '.jpg']
    seq = [wordtoix[word] for word in title.split(' ') if word in wordtoix]
    #print(title)
    #print(seq)
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]

        in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
        out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
        #print(in_seq)
        #print(out_seq)
        # store
        X1.append(pic)
        X2.append(in_seq)
        y.append(out_seq)

In [1]:
X2 = np.array(X2)
X1 = np.array(X1)
y = np.array(y)

# load glove vectors for embedding layer
embeddings_index = {}
golve_path ='Kaggle data/glove.6B.200d.txt'
glove = open(golve_path, 'r', encoding = 'utf-8').read()
for line in glove.split("\n"):
	values = line.split(" ")
	word = values[0]
	indices = np.asarray(values[1: ], dtype = 'float32')
	embeddings_index[word] = indices

emb_dim = 200
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, i in wordtoix.items():
	emb_vec = embeddings_index.get(word)
	if emb_vec is not None:
		emb_matrix[i] = emb_vec
emb_matrix.shape

NameError: ignored

In [27]:
embed_dim = 256  # Dimensionalidad del embedding
num_heads = 4  # Número de cabezales en la atención multi-cabeza
ff_dim = 256  # Dimensión oculta en la red feed-forward

# Crear y agregar el bloque del Transformer
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)

In [30]:
se3

<KerasTensor: shape=(None, 18, 200) dtype=float32 (created by layer 'transformer_block_2')>

In [29]:
# define the model
ip1 = Input(shape = (2048, ))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation = 'relu')(fe1)
ip2 = Input(shape = (max_length, ))
se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2)
se2 = Dropout(0.2)(se1)
#se3 = LSTM(256)(se2)
se3 = transformer_block(se2)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation = 'relu')(decoder1)
outputs = Dense(vocab_size, activation = 'softmax')(decoder2)
model = Model(inputs = [ip1, ip2], outputs = outputs)

ValueError: ignored

In [None]:
model.layers[2].set_weights([emb_matrix])
model.layers[2].trainable = False
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
model.fit([X1, X2], y, epochs = 500, batch_size = 256)
# you can increase the number of epochs for better results

In [None]:
def greedy_search(pic):
	start = 'startseq'
	print(pic.shape)
	for i in range(max_length):
		seq = [wordtoix[word] for word in start.split() if word in wordtoix]
		seq = pad_sequences([seq], maxlen = max_length)
		#print(seq)
		#print(seq.shape)
		yhat = model.predict([pic, seq])
		yhat = np.argmax(yhat)
		word = ixtoword[yhat]
		start += ' ' + word
		if word == 'endseq':
			break
	final = start.split()
	final = final[1:-1]
	final = ' '.join(final)
	return final


In [None]:
# resnet and lstm
for j in range(6):
  img_path = f'data/demo_imgs/{j}.jpg'
  encoded = encode(img_path)
  greedy_search(encoded.reshape(1,-1))