In [30]:
import pandas as pd
import numpy as np
import json
import time
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import tensorflow as tf
import keras
from keras.preprocessing.text import one_hot , Tokenizer

In [31]:
# Directory prefix of the per-recipe metadata JSON files.
# NOTE: machine-specific absolute path. The trailing slash matters because
# read_meta_into_df builds each file name by plain string concatenation.
path = '/Users/rpindale/Desktop/Yummly28K/metadata27638/'

def read_meta_into_df(path, n_files):
    """Load the first ``n_files`` recipe metadata JSON files into one DataFrame.

    Files are looked up as ``path + 'meta' + <5-digit zero-padded number> + '.json'``
    for recipe numbers ``1..n_files``, so ``path`` must already end with a
    path separator.

    Parameters
    ----------
    path : str
        Directory prefix of the metadata files (including the trailing slash).
    n_files : int
        Number of files to read, starting at recipe 1.

    Returns
    -------
    pandas.DataFrame
        One row per recipe, indexed 0..n_files-1, with columns ``id``
        (zero-padded string), ``title``, ``time_to_cook_(seconds)``,
        ``ingredients`` (object array of ingredient lines), ``cuisine`` and
        ``course``. With ``n_files == 0`` the frame is empty but keeps the
        same columns.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    KeyError, IndexError
        If a file lacks a field read below or has an empty course/cuisine list.
    """
    columns = ['id', 'title', 'time_to_cook_(seconds)', 'ingredients', 'cuisine', 'course']
    # Collect plain records and build the frame once: DataFrame.append inside
    # the loop copied the whole frame on every iteration and no longer exists
    # in pandas >= 2.0.
    records = []
    for meta_num in range(1, n_files + 1):
        id_ = str(meta_num).zfill(5)
        final_path = path + 'meta' + id_ + '.json'  # path of the json file for that recipe
        # Context manager closes the handle even if parsing fails; read as
        # UTF-8 explicitly rather than relying on the platform default.
        with open(final_path, encoding='utf-8') as f:
            meta = json.load(f)
        attributes = meta['attributes']
        records.append({
            'id': id_,
            'title': meta['name'],
            # kept out of a local called `time` so the imported module is not shadowed
            'time_to_cook_(seconds)': meta['totalTimeInSeconds'],
            'ingredients': np.array(meta['ingredientLines'], dtype='object'),
            # course/cuisine come back as lists; only the first label is kept
            'cuisine': attributes['cuisine'][0],
            'course': attributes['course'][0],
        })
    return pd.DataFrame(records, columns=columns)

        

    
#start = time.time()
#temp = read_meta_into_df(path, 27638)
#end = time.time()
#print(end-start)

#temp.to_csv('recipe_info.csv') #save this to a .csv so I do not have to do this again!

In [32]:
# Reload the cached recipe table instead of re-parsing every metadata file.
# NOTE(review): the preview below shows stray 'Unnamed: 0' / 'Unnamed: 0.1'
# columns and 'ingredients' as a string; looks like a CSV round-trip artifact -- confirm.
temp = pd.read_csv('recipe_info.csv')

In [33]:
# Preview the first rows of the loaded table.
temp.head()

Unnamed: 0.1,Unnamed: 0,id,title,time_to_cook_(seconds),ingredients,cuisine,course
0,0,1,Mushroom Risotto,1800.0,"['2 cups Baby Bella mushrooms, sliced' '2 cups...",Italian,Side Dishes
1,0,2,Filipino BBQ Pork Skewers,2400.0,"['2.5 lb pork country style ribs, all fat trim...",Barbecue,Main Dishes
2,0,3,Mushroom and Roasted Garlic Risotto,5100.0,['2 whole garlic heads'\n '2 tablespoons plus ...,Italian,Main Dishes
3,0,4,Gratin Dauphinois (Scalloped Potatoes with Che...,3300.0,"['1 garlic clove, halved' 'Cooking spray'\n '6...",French,Side Dishes
4,0,5,Delicious Grilled Hamburgers,900.0,['1 pound lean ground beef' '1 tablespoon Worc...,Barbecue,Main Dishes


In [34]:
# Number of recipes per cuisine label.
temp.cuisine.value_counts()

American                11729
Italian                  4571
Mexican                  3394
Asian                    1951
French                   1562
Indian                   1457
Kid-Friendly              962
Southwestern              600
Thai                      347
Barbecue                  338
Chinese                   266
Southern & Soul Food      151
Greek                      80
Mediterranean              72
Spanish                    58
Cuban                      31
Cajun & Creole             18
Moroccan                   11
Japanese                   10
Irish                       9
English                     6
Hawaiian                    5
German                      4
Hungarian                   3
Portuguese                  2
Vietnamese                  1
Name: cuisine, dtype: int64

In [35]:
# Number of recipes per course label.
temp.course.value_counts()

Main Dishes              13665
Desserts                  3204
Salads                    2406
Side Dishes               2147
Soups                     1804
Appetizers                1513
Condiments and Sauces      830
Lunch and Snacks           649
Breakfast and Brunch       587
Breads                     467
Beverages                  250
Cocktails                   99
Afternoon Tea               17
Name: course, dtype: int64

In [36]:
# Map each cuisine label to an integer id, numbered in order of first
# appearance in the table.
keys = temp['cuisine'].unique()  # computed once instead of on every loop pass
vals = range(len(keys))  # one id per label, derived from the data rather than hard-coded
dict_cuisine = dict(zip(keys, vals))
dict_cuisine

{'Italian': 0,
 'Barbecue': 1,
 'French': 2,
 'American': 3,
 'Asian': 4,
 'Kid-Friendly': 5,
 'Southwestern': 6,
 'Mexican': 7,
 'Indian': 8,
 'Southern & Soul Food': 9,
 'Thai': 10,
 'Japanese': 11,
 'Chinese': 12,
 'Spanish': 13,
 'Mediterranean': 14,
 'Cajun & Creole': 15,
 'Cuban': 16,
 'Greek': 17,
 'German': 18,
 'Moroccan': 19,
 'Irish': 20,
 'Hungarian': 21,
 'Vietnamese': 22,
 'English': 23,
 'Portuguese': 24,
 'Hawaiian': 25}

In [37]:
# Map each course label to an integer id, numbered in order of first
# appearance in the table.
keys = temp['course'].unique()  # computed once instead of on every loop pass
vals = range(len(keys))  # one id per label (the old range(26) was copied from the cuisine cell)
dict_course = dict(zip(keys, vals))
dict_course

{'Side Dishes': 0,
 'Main Dishes': 1,
 'Desserts': 2,
 'Salads': 3,
 'Soups': 4,
 'Condiments and Sauces': 5,
 'Appetizers': 6,
 'Breads': 7,
 'Lunch and Snacks': 8,
 'Breakfast and Brunch': 9,
 'Beverages': 10,
 'Cocktails': 11,
 'Afternoon Tea': 12}

In [38]:

def load_glove(path='glove.42B.300d.txt'):
    """Read a GloVe-style text file into a ``word -> vector`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as its float32 coefficients.

    Parameters
    ----------
    path : str, optional
        Location of the downloaded vectors file. Defaults to
        ``'glove.42B.300d.txt'`` in the working directory.

    Returns
    -------
    dict
        Maps each word to a ``numpy.ndarray`` of dtype float32.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If a coefficient field cannot be parsed as a float.
    """
    embeddings_index = dict()
    # `with` closes the file even if a line fails to parse; the encoding is
    # pinned so the result does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line (e.g. trailing newline at end of file)
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

In [39]:
# Build the word -> vector lookup that get_avg_embedding reads as a global.
embeddings_index = load_glove()


Loaded 1917495 word vectors.


In [40]:

# Scratch run of the token clean-up on one recipe's ingredient text:
# tokenize, drop English stopwords, strip digits and punctuation characters
# from each token, then discard tokens that end up empty.
stop_words = set(stopwords.words('english'))
# Raw string: same characters as before, without the invalid '\,' escape sequence.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
word_tokens = word_tokenize(temp.ingredients[1])
filter1 = [w for w in word_tokens if w.lower() not in stop_words]  # removes stopwords
filter2 = [''.join(ch for ch in w.lower() if not ch.isdigit()) for w in filter1]  # removes digits
# A whole-token punctuation filter used to sit here, but its result was
# overwritten by the next line without being read, so it has been dropped.
filter3 = [''.join(ch for ch in w.lower() if ch not in punc) for w in filter2]  # removes punctuation characters
filter4 = [w for w in filter3 if w != '']
filter4

['lb',
 'pork',
 'country',
 'style',
 'ribs',
 'fat',
 'trimmed',
 'cut',
 'x',
 'cubes',
 'oz',
 'up',
 'cup',
 'soy',
 'sauce',
 'cup',
 'white',
 'vinegar',
 'lemon',
 'juice',
 'cup',
 'brown',
 'sugar',
 'cloves',
 'garlic',
 'crushed',
 'tsp',
 'black',
 'pepper',
 'crushed',
 'red',
 'pepper',
 'flakes',
 'optional']

In [41]:
# Debug probe: look up the empty string in the GloVe index. No output was
# recorded for this cell, which suggests the lookup returned None.
embeddings_index.get('')

In [42]:
def get_avg_embedding(text, embedding_dim=300):
    """Return the mean embedding of the cleaned tokens in ``text``.

    The text is tokenized with ``word_tokenize``; English stopwords are
    dropped; digits and punctuation characters are stripped from each
    lower-cased token; tokens left empty are discarded. Every remaining token
    is looked up in the global ``embeddings_index``.

    Parameters
    ----------
    text : str
        Raw text to embed.
    embedding_dim : int, optional
        Length of the vectors stored in ``embeddings_index`` (default 300).

    Returns
    -------
    numpy.ndarray
        Vector of length ``embedding_dim``. Tokens missing from
        ``embeddings_index`` contribute an all-zero row and still count in the
        mean's denominator. If no token survives the clean-up, a zero vector
        is returned instead of the NaN vector that averaging an empty array
        would produce.
    """
    stop_words = set(stopwords.words('english'))
    # Raw string: same characters as before, without the invalid '\,' escape sequence.
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    tokens = []
    for raw in word_tokenize(text):
        lowered = raw.lower()
        if lowered in stop_words:
            continue
        # strip digits and punctuation characters in one pass
        cleaned = ''.join(ch for ch in lowered if not ch.isdigit() and ch not in punc)
        if cleaned:
            tokens.append(cleaned)

    if not tokens:
        return np.zeros(embedding_dim)

    embeds = np.zeros((len(tokens), embedding_dim))
    for idx, w in enumerate(tokens):
        vector = embeddings_index.get(w)
        # Explicit None test; the previous check compared types against a
        # lookup of '-w' and misbehaved whenever '-w' was in the vocabulary.
        if vector is not None:
            embeds[idx] = vector

    return np.mean(embeds, axis=0)
    

In [43]:
def get_my_embedding(index = 0, data = temp, cols = ['title', 'ingredients', 'cuisine', 'course']):
    """Concatenate the average embeddings of several text columns for one row.

    Parameters
    ----------
    index : int, optional
        Row label passed to ``data[col][index]``.
    data : pandas.DataFrame, optional
        Table to read from. The default is the module-level ``temp`` as it
        exists when this cell is executed; re-run the cell after reloading
        ``temp`` to pick up the new frame.
    cols : list of str, optional
        Columns to embed, in output order. Every listed column is used (the
        previous version was hard-wired to exactly the first four entries).

    Returns
    -------
    numpy.ndarray
        The per-column vectors from ``get_avg_embedding`` joined end to end.
    """
    # `cols` is only iterated, never mutated, so the shared default list is safe.
    return np.concatenate([get_avg_embedding(data[col][index]) for col in cols])




In [44]:
# Inspect the recipe ids rendered as strings.
temp['id'].unique().astype(str)

array(['1', '2', '3', ..., '27636', '27637', '27638'], dtype='<U21')

In [None]:
# Make a dictionary mapping recipe id (1-based, as a string) to its embedding.
# Assumes row i of `temp` holds recipe id i + 1, as in the preview of `temp` above.
keys = [str(i) for i in range(1, len(temp) + 1)]  # row count taken from the data, not hard-coded
embedding_dict = {}
for i in keys:
    # .tolist() yields plain Python floats, ready for json.dump
    embedding_dict[i] = get_my_embedding(int(i) - 1).tolist()
len(embedding_dict)  # show the size only; echoing every vector would flood the cell output

In [47]:
# NOTE(review): json is already imported in the first cell; this repeat is redundant.
import json
# Write the id -> embedding mapping to text_embs.json in the working directory.
with open('text_embs.json', 'w') as fp:
    json.dump(embedding_dict, fp)