MIS 285N Cognitive Computing<br>
Final Project<br>
Jerry Che - Jose Guerrero - Riley Moynihan - Noah Placke - Sarah Teng - Palmer Wenzel

# Instructions Generation Model

Following techniques from:
- https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8
- https://github.com/hlamba28/Automatic-Image-Captioning/blob/master/Automatic%20Image%20Captioning.ipynb

#### Read data from CSV.

In [1]:
import pandas as pd
# pd.options.display.max_columns = 500


df = pd.read_csv('../data/kaggle/processed/recipes_processed.csv').sample(frac=0.1, random_state=42)

df.head(3)

Unnamed: 0,name,steps,crabmeat,creamcheese,greenonions,garlicsalt,refrigeratedcrescentdinnerrolls,eggyolk,water,sesameseeds,...,tex-mexseasoning,lightnon-dairywhippedtopping,stelladoroanginetticookies,viennabread,beefroundrumproast,romaineleaf,nuocnam,thaiholybasil,driedblacktrumpetmushrooms,driedwoodearmushrooms
10023,grilled prawns in butter,"clean prawns and butterfly, put in a rectangul...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15838,moroccan chicken with couscous,slice chicken into thin strips about an inch l...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11107,sun dried tomato pasta sauce,sautee onions and garlic in olive oil until so...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop any all-zero cols.

In [2]:
nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df = df.drop(cols_to_drop, axis=1)

df.head(3)

Unnamed: 0,name,steps,crabmeat,creamcheese,greenonions,garlicsalt,refrigeratedcrescentdinnerrolls,eggyolk,water,sesameseeds,...,smallwhitepotatoes,vegetarianpepperoni,powderedclove,cookedsushirice,almondbreezenon-dairybeverage,chocolateraspberrymoussemix,candiedcitronpeel,mincedclamswithjuice,dooleyliqueur,self-risingcornmealmix
10023,grilled prawns in butter,"clean prawns and butterfly, put in a rectangul...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15838,moroccan chicken with couscous,slice chicken into thin strips about an inch l...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11107,sun dried tomato pasta sauce,sautee onions and garlic in olive oil until so...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop unnecessary columns.

In [3]:
df = df.drop(['name'], axis=1)

df.head(3)

Unnamed: 0,steps,crabmeat,creamcheese,greenonions,garlicsalt,refrigeratedcrescentdinnerrolls,eggyolk,water,sesameseeds,garbanzobeans,...,smallwhitepotatoes,vegetarianpepperoni,powderedclove,cookedsushirice,almondbreezenon-dairybeverage,chocolateraspberrymoussemix,candiedcitronpeel,mincedclamswithjuice,dooleyliqueur,self-risingcornmealmix
10023,"clean prawns and butterfly, put in a rectangul...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15838,slice chicken into thin strips about an inch l...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11107,sautee onions and garlic in olive oil until so...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Separate steps from ingredients.

In [4]:
steps = df['steps']
ingredients = df.drop(['steps'], axis=1)

#### Create vocabulary of unique words.

In [5]:
unique = set(steps.str.replace('[^a-zA-Z0-9 ]', '').str.split(' ').sum())
print(f'Number of unique tokens: {len(unique)}')
vocab_size = len(unique) + 1  # +1 for appended 0s
# vocab_size = 16903

Number of unique tokens: 5673


#### Create index to word mappings.

In [6]:
ixtoword = {}
wordtoix = {}

ix = 1
for w in unique:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1
    
ixtoword[3]

'doublestick'

#### Get maximum sequence length.

In [7]:
max_seq_len = steps.str.len().max()

max_seq_len

5983

#### Define data generator.

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from numpy import array


# data generator, intended to be used in a call to model.fit_generator()
def data_generator(steps, recipes, wordtoix, max_seq_len, batch_size):
    X1, X2, y = list(), list(), list()
    n = 0
    
    # loop for ever over images
    while 1:
        for i, step_text in enumerate(steps):
            n += 1
            
            # encode the sequence
            seq = [wordtoix[word] for word in step_text.split(' ') if word in wordtoix]
            
            # split one sequence into multiple X, y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_seq_len)[0]
                
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                
                # store
                X1.append(recipes.iloc[i])
                X2.append(in_seq)
                y.append(out_seq)
                
            # yield the batch data
            if n == batch_size:
                yield ([array(X1), array(X2)], array(y))
                X1, X2, y = list(), list(), list()
                n = 0

#### Glove Vectors.

In [12]:
import os
import numpy as np


# Load Glove vectors
glove_dir = '../data/glove_vectors'
embeddings_index = {} # empty dictionary
f = open(os.path.join(glove_dir, 'glove.6B.50d.txt'), encoding="utf-8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


#### Create embedding matrix.

In [13]:
embedding_dim = 50

# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    # If i < max_words:
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
        
embedding_matrix.shape

(5674, 50)

#### Model architecture.

In [18]:
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Embedding
from tensorflow.keras.layers import Add


inputs1 = Input(shape=(ingredients.shape[1],))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

inputs2 = Input(shape=(max_seq_len,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 5983)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 3049)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 5983, 50)     283700      input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 3049)         0           input_1[0][0]                    
______________________________________________________________________________________________

In [19]:
model.layers[2]

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f0ab552af10>

In [20]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

model.compile(loss='categorical_crossentropy', optimizer='adam')

#### Train.

In [None]:
epochs = 10
batch_size = 5

train_steps = len(steps) // batch_size

for i in range(epochs):
    generator = data_generator(steps, ingredients, wordtoix, max_seq_len, batch_size)
    model.fit(generator, epochs=1, steps_per_epoch=train_steps, verbose=1)