In [1]:
import os
import pandas as pd
import numpy as np
import collections
import tensorflow as tf
import pickle

In [2]:
# Output folders and the input pickle are all resolved against the current
# working directory.
models_folder_name = os.path.join(os.getcwd(), 'models')
summaries_folder_name = os.path.join(os.getcwd(), 'summaries')
path_to_preprocessed_texts = os.path.join(
    os.getcwd(), 'texts', 'preprocessed_texts_for_doc2vec.pkl')

# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_preprocessed_texts = pd.read_pickle(path_to_preprocessed_texts)

# Pull the two columns out of the frame as plain Python lists.
preprocessed_texts = df_preprocessed_texts['preprocessed_texts'].values.tolist()
labels = df_preprocessed_texts['labels'].values.tolist()

# Give every distinct label a consecutive integer id, in sorted label order.
unique_labels = sorted(set(labels))
number_categories = len(unique_labels)
categories_indices = np.arange(number_categories, dtype=int)
labels2integers = dict(zip(unique_labels, categories_indices))

print(labels2integers)

{'steak': 1, 'sashimi': 0, 'tiramisu': 3, 'sushi': 2}


In [3]:
# Number of (input, label) pairs produced per call to create_batch_data.
batch_size = 2
# Number of iterations of the training loop.
generations = 100000
# Learning rate handed to the Adam optimizer.
model_learning_rate = 0.001

# Widths of the embedding tables; the decoder consumes the word vector and
# the document vector concatenated, hence concatenated_size.
embedding_size = 24   #word embedding size
doc_embedding_size = 12  #document embedding size
concatenated_size = embedding_size + doc_embedding_size

# Reporting / checkpoint cadence, all measured in training steps.
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 50

In [4]:
def build_dictionary(preprocessed_texts):
    """Map every distinct word of the corpus to an integer index.

    Indices start at 0 and follow the sorted order of the words.

    Args:
        preprocessed_texts: iterable of documents, each an iterable of words.

    Returns:
        dict mapping each distinct word to its index.
    """
    vocabulary = set()
    for document in preprocessed_texts:
        vocabulary.update(document)

    return {word: index for index, word in enumerate(sorted(vocabulary))}

#replace each word in texts with integer value
def text_to_numbers(preprocessed_texts, word_dict):
    """Replace each word of each document with its integer index.

    Words that are not keys of ``word_dict`` are encoded as 0.

    Args:
        preprocessed_texts: iterable of documents, each an iterable of words.
        word_dict: dict mapping word -> integer index.

    Returns:
        list of lists of indices, one inner list per document.
    """
    return [[word_dict.get(word, 0) for word in document]
            for document in preprocessed_texts]


def create_batch_data(text_with_words_conv_to_numbers, batch_size=batch_size):
    """Sample one training batch from a single randomly chosen document.

    One document and one of its distinct words are picked at random; every
    row of the batch is that same ``[word, document index]`` pair. The labels
    are ``batch_size`` words drawn independently and uniformly from the same
    document (repeated words in the document are proportionally more likely).

    Args:
        text_with_words_conv_to_numbers: sequence of documents, each a
            non-empty sequence of word indices.
        batch_size: number of (input, label) pairs to produce. The default is
            bound to the module-level ``batch_size`` at definition time.

    Returns:
        Tuple ``(batch_data, label_data)``: ``batch_data`` is an array of
        shape ``(batch_size, 2)`` and ``label_data`` an array of shape
        ``(batch_size,)``.

    Raises:
        ValueError: if the corpus or the sampled document is empty.
    """
    number_of_texts = len(text_with_words_conv_to_numbers)
    if number_of_texts == 0:
        raise ValueError('cannot create a batch from an empty corpus')

    # randint returns a scalar, which avoids int() on a one-element array
    # (deprecated in recent NumPy releases).
    rand_text_ix = int(np.random.randint(number_of_texts))
    rand_text = text_with_words_conv_to_numbers[rand_text_ix]
    if len(rand_text) == 0:
        raise ValueError('document {} is empty'.format(rand_text_ix))

    # The input word is uniform over the document's distinct words.
    word_to_predict_label = np.random.choice(list(set(rand_text)))

    # One vectorised draw replaces batch_size separate size-1 draws; the
    # labels are still independent uniform samples from the document.
    label_data = np.random.choice(rand_text, size=batch_size)

    batch_data = np.array([[word_to_predict_label, rand_text_ix]] * batch_size)

    return (batch_data, label_data)

In [5]:
# Build the word -> index vocabulary and show it together with its size.
word_dictionary = build_dictionary(preprocessed_texts)
vocabulary_size = len(word_dictionary)
print(word_dictionary)
print(vocabulary_size)

# Inverse lookup (index -> word) for turning indices back into words.
word_dictionary_rev = {index: word for word, index in word_dictionary.items()}

{'press': 86, 'cutting': 35, 'pressure': 87, 'mixture': 70, 'shrimp': 103, 'angle': 1, 'carrot': 18, 'tomato': 129, 'chive': 21, 'position': 82, 'avocado': 3, 'air': 0, 'salt': 95, 'soy': 108, 'stick': 117, 'truffle': 134, 'sesame': 101, 'torch': 132, 'vegetable': 137, 'cheese': 19, 'towel': 133, 'brush': 15, 'sheet': 102, 'oil': 74, 'spread': 112, 'seed': 100, 'bamboo': 6, 'strawberry': 119, 'blade': 11, 'raspberry': 89, 'mushroom': 71, 'leg': 58, 'cucumber': 32, 'butter': 16, 'matchstick': 64, 'yolk': 147, 'stone': 118, 'mascarpone': 62, 'mixer': 69, 'beat': 8, 'peak': 78, 'skewer': 106, 'sushi': 123, 'sea': 99, 'sauce': 97, 'strip': 120, 'cling': 24, 'starch': 115, 'guacamole': 53, 'cream': 31, 'lime': 59, 'rice': 90, 'tempura': 124, 'whisk': 143, 'espresso': 40, 'powder': 84, 'sirloin': 105, 'meat': 66, 'water': 141, 'tuna': 135, 'mat': 63, 'filling': 42, 'slice': 107, 'cut': 34, 'nori': 73, 'wine': 145, 'wasabi': 140, 'sprinkle': 114, 'pinch': 81, 'quantity': 88, 'fryer': 46, 'spa

In [6]:
# Encode every document as a list of vocabulary indices; the training loop
# samples its batches from text_data.
# NOTE(review): printing the whole nested list produces very long output.
text_data = text_to_numbers(preprocessed_texts, word_dictionary)
print(text_data)

[[94, 3, 75, 140, 100, 90, 72, 80, 94, 95, 94, 141, 44, 53, 75, 129, 95, 95, 95, 95, 95, 95, 95, 95, 34, 3, 3, 3, 3, 80, 75, 70, 3, 70, 53, 92, 59, 54, 59, 59, 54, 59, 54, 3, 54, 59, 53, 70, 140, 140, 10, 140, 53, 10, 10, 101, 100, 108, 97, 129, 129, 129, 3, 29, 57, 57, 70, 29, 53, 24, 94, 95, 94, 94, 44, 96, 34, 107, 24, 10, 115, 24, 107, 94, 115, 24, 115, 94, 94, 115, 94, 94, 94, 46, 107, 46, 94, 74, 94, 94, 117, 46, 94, 94, 94, 94, 94, 94, 46, 74, 94, 74, 94, 47, 94, 90, 72, 72, 94, 53, 29, 90, 72, 53], [135, 140, 108, 97, 48, 49, 82, 43, 49, 107], [96, 94, 41, 101, 100, 74, 101, 100, 97, 48, 41, 101, 100, 74, 122, 101, 100, 74, 43, 94, 41, 101, 100, 122, 43, 41, 41, 44, 101, 100, 74, 117, 41, 41, 44, 82, 41, 43, 41, 41, 34, 41, 97, 122, 11, 94, 107, 43, 82, 94, 97, 48, 94, 96, 107, 97], [123, 41, 12, 123, 41, 12, 29, 57, 101, 100, 74, 135, 135, 41, 107, 107, 41, 34, 135, 107, 12, 12, 135, 100, 74, 43, 74, 135, 29, 41, 122, 41, 12, 11, 44, 135, 12, 44, 44, 82, 41, 12, 1, 34, 1, 11, 

In [7]:
# Probe words whose nearest neighbours in embedding space are printed
# periodically during training.
valid_words = ['tuna', 'rice', 'sushi', 'roll', 'sashimi','steak','grill', 'sauce', 'cream']

# Direct indexing raises KeyError if a probe word is not in the vocabulary.
valid_examples = [word_dictionary[x] for x in valid_words]
print(valid_examples)

[135, 90, 123, 92, 96, 116, 51, 97, 31]


In [8]:
# Draw one sample batch and inspect its contents and the shape of the labels.
# NOTE(review): this call passes the raw word lists (preprocessed_texts),
# whereas the training loop feeds text_data (integer indices). With word
# strings the whole batch array is converted to strings, document index
# included, as the printed output shows. Confirm whether text_data was
# intended here.
batch_data, label_data = create_batch_data(preprocessed_texts)
print(batch_data)
print(label_data)
print(np.shape(label_data))

[['oil' '13']
 ['oil' '13']]
['ginger' 'steak']
(2,)


In [9]:
# Open the TensorFlow session used by the model-building and training code
# that follows.
sess = tf.InteractiveSession()

In [10]:
print('Creating Model')

# Trainable lookup tables: one row per vocabulary word and one row per document.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name="word_embeddings")
doc_embeddings = tf.Variable(tf.random_uniform([len(preprocessed_texts), doc_embedding_size], -1.0, 1.0), name="doc_embeddings")

# Linear decoder turning the concatenated (word, document) vector into one
# logit per vocabulary word.
decoder_weights = tf.Variable(tf.truncated_normal([vocabulary_size, concatenated_size],
                                                  stddev=1.0 / np.sqrt(concatenated_size)),
                              name="decoder_weights")
decoder_biases = tf.Variable(tf.zeros([vocabulary_size]), name="decoder_biases")

# Each input row is [word index, document index]; each target is a word index.
# The batch dimension is left open on both placeholders so the graph does not
# depend on the batch_size constant.
x_inputs = tf.placeholder(tf.int32, shape=[None, 2])
y_target = tf.placeholder(tf.int32, shape=[None])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

embed = tf.nn.embedding_lookup(embeddings, x_inputs[:, 0])

# Select the document column as a 1-D tensor so the lookup result is already
# [batch, doc_embedding_size]. A [batch, 1] slice would need tf.squeeze, which
# also removes the batch axis when the batch has a single row.
doc_indices = x_inputs[:, 1]
doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)
final_embed = tf.concat([embed, doc_embed], 1)

logits = tf.matmul(final_embed, tf.transpose(decoder_weights)) + decoder_biases

# Softmax cross-entropy over the whole vocabulary, averaged over the batch.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_target))
optimizer = tf.train.AdamOptimizer(learning_rate=model_learning_rate)
train_step = optimizer.minimize(loss)

# Cosine similarity between the probe words and every word in the vocabulary.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True, name="cosine_similarity")

# The loss is logged through a placeholder fed with the evaluated loss value.
with tf.name_scope("performance"):
    loss_ph = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    loss_summary = tf.summary.scalar('loss', loss_ph)
performance_summaries = tf.summary.merge([loss_summary])

# Only the two embedding tables are checkpointed.
saver = tf.train.Saver({"embeddings": embeddings, "doc_embeddings": doc_embeddings})
summ_writer = tf.summary.FileWriter(summaries_folder_name, sess.graph)

init = tf.global_variables_initializer()
sess.run(init)

# Create the output folder up front: writing the pickle and the checkpoint
# both fail if it is missing, and the first save only happens after
# save_embeddings_every training steps.
if not os.path.isdir(models_folder_name):
    os.makedirs(models_folder_name)
dictionary_pickle_path = os.path.join(models_folder_name, 'doc2vec_recipes_dict_words_integers.pkl')
model_checkpoint_path = os.path.join(models_folder_name, 'doc2vec_recipes_checkpoint.ckpt')

top_k = 5  # number of nearest neighbors

print('Starting Training')

try:
    for i in range(generations):
        batch_inputs, batch_labels = create_batch_data(text_data)
        feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

        # run the train step
        sess.run(train_step, feed_dict=feed_dict)

        # report the loss on the batch that was just trained on
        if (i+1) % print_loss_every == 0:
            loss_val = sess.run(loss, feed_dict=feed_dict)
            summ = sess.run(performance_summaries, feed_dict={loss_ph: loss_val})
            summ_writer.add_summary(summ, i+1)
            print('Loss at step {} : {}'.format(i+1, loss_val))

        # validation: list the closest words to each probe word
        if (i+1) % print_valid_every == 0:
            # similarity depends only on variables and constants, so no feed is needed
            sim = sess.run(similarity)
            for j in range(len(valid_words)):
                valid_word = word_dictionary_rev[valid_examples[j]]
                # position 0 of the ranking is skipped (the probe word itself
                # has the maximal cosine similarity of 1)
                nearest = (-sim[j, :]).argsort()[1:top_k+1]
                log_str = "Nearest to {}:".format(valid_word)
                for k in range(top_k):
                    close_word = word_dictionary_rev[nearest[k]]
                    log_str = '{} {},'.format(log_str, close_word)
                print(log_str)

        # save dictionary + embeddings
        if (i+1) % save_embeddings_every == 0:
            # save vocabulary dictionary
            with open(dictionary_pickle_path, 'wb') as f:
                pickle.dump(word_dictionary, f)

            # save embeddings
            save_path = saver.save(sess, model_checkpoint_path)
            print('Model saved in file: {}'.format(save_path))
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so pending summaries are flushed and the session is released.
    summ_writer.close()
    sess.close()

Creating Model
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Starting Training
Loss at step 50 : 5.785186767578125
Loss at step 100 : 5.244414329528809
Loss at step 150 : 5.202678203582764
Loss at step 200 : 4.671710014343262
Loss at step 250 : 4.7154998779296875
Loss at step 300 : 4.962398529052734
Loss at step 350 : 4.250115871429443
Loss at step 400 : 5.452340602874756
Loss at step 450 : 4.718405246734619
Loss at step 500 : 4.806320667266846
Loss at step 550 : 4.470209121704102
Loss at step 600 : 4.7410712242126465
Loss at step 650 : 5.1305036544799805
Loss at step 700 : 4.800546169281006
Loss at step 750 : 4.510857582092285
Loss at step 800 : 4.665602207183838
Loss at step 850 : 4.712397575378418
Loss at step 900 : 4.3110198974609375
Loss at step 950 : 5.345371246337891
Loss at step 1000 : 4.690176010131836
Loss at step 1050 : 5.059058666229248
Loss at step 1100 : 4.370900630950928


KeyboardInterrupt: 