# Compute Similarity

## 0. Pre-processing from Feature Engineering (same as in feature_engineering.ipynb)

### A. Setup

In [24]:
# pre-processing
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
#from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
#from nltk.tokenize import sent_tokenize, word_tokenize

# feature engineering
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os

# cusine data
cuisine = pd.read_json('./data/train.json')

# recipe data
all_recipes = pd.read_json('./data/recipes_raw_nosource_allrecipes.json', orient='index')
epicurious = pd.read_json('./data/recipes_raw_nosource_epicurious.json', orient='index')
food_network = pd.read_json('./data/recipes_raw_nosource_foodnetwork.json', orient='index')
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

import sqlite3 as sq

In [2]:
#format recipe data
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['index', 'picture_link'])

### B. Pre-processing

In [3]:
stopword_list = stopwords.words("english")
addl_stop_words = ['advertisement', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

String Cleaning Function

In [4]:
def clean_string(list, lemmatize = True, stemming = False):
    str = ' '.join(list) #converting the list to string
    clean_text = ''
    
    lower = str.lower().split() #lowercase and tokenize
    
    clean_words = []
    for word in lower:
        if len(word) > 2:
            digit = re.sub(r'\d+','', word) #removing digits
            text = re.sub(r'[^\w\s]', '', digit) #removing punc and characters
            
            
            if lemmatize:
                lm = WordNetLemmatizer()  #lemmatize
                lemm = lm.lemmatize(text)
                clean_words.append(lemm)
                
                if stemming:
                    stemmer = PorterStemmer #stemming
                    stemm = stemmer.stem(text)
                    clean_words.append(stemm)
         
    rem_stop = [i for i in clean_words if i not in stopword_list]  #remove stopwords
    
    clean_text = ' '.join(rem_stop) #join as a string
    space = re.sub(' +', ' ', clean_text) #remove multi-spaces
    
    return space    

Clean Ingredients in Recipe Dataset

In [5]:
recipes['clean_ingredients_r'] = recipes['ingredients'].apply(lambda x: clean_string(x))
recipes.dropna(inplace=True)

Clean Ingredients in Cuisine Dataseet

In [6]:
cuisine['clean_ingredients'] = cuisine['ingredients'].apply(lambda x: clean_string(x))

## 1. Similarity

### A. Embeddings

I. Doc 2 Vec Embeddings:
    1. Ingredients to list
    2. Use TaggedDocument to create tagged data as list
    3. Set hyperparameters (tune this later)
        a. max_epochs
        b. vec_size
        c. alpha
    4. Create model embeddings using Doc2Vec with preset vec_size and alpha from #3
    5. Build vocab from tagged data using model embeddings
    6. Train embeddings with a for loop for epoches in max_epochs range


In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os

In [8]:
def d2v_embeddings(data): 
    # data is the cuisine pandas dataframe 
    data = data['clean_ingredients'].tolist()

    # create tags for d2v 
    tagged_data = [TaggedDocument(words = row.split(), tags=[str(index)]) for index, row in enumerate(data)]

    # hyperparameters ?--- TUNE THIS WITH A LOOP ---?
    max_epochs = 20
    vec_size = 50
    alpha = 0.025

    # model
    model_embedding = Doc2Vec(vector_size = vec_size, alpha=alpha, min_count=1, dm=1)

    model_embedding.build_vocab(tagged_data)

    # training
    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model_embedding.train(tagged_data,
                    total_examples=model_embedding.corpus_count,
                    epochs=model_embedding.epochs)
        # decrease the learning rate
        model_embedding.alpha -= 0.0002
        # fix the learning rate, no decay
        model_embedding.min_alpha = model_embedding.alpha

    return model_embedding


In [9]:
cuisine_d2v_model = d2v_embeddings(cuisine)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [10]:
cuisine_classes = ['brazilian','british','cajun_creole','chinese','filipino','french','greek','indian','irish','italian','jamaican','japanese','korean','mexican','moroccan','russian','southern_us','spanish','thai','vietnamese']

In [25]:
def train_model_embeddings(data):
    db = sq.connect('recipes.db')
    cursor = db.cursor()

    for cuisine in cuisine_classes:
        sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
        data = pd.read_sql(sql_query, db, params=(cuisine,))
        
        model_embedding = d2v_embeddings(data)
        save_pkl(model_embedding, f'd2v_{cuisine}.pkl')

In [26]:
train_model_embeddings(cuisine)

DatabaseError: Execution failed on sql 'SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?': no such table: main_recipes

### B. Identifying Similar Recipes

In [18]:
def get_similar_recipes(input_text, cuisine, top_k=3):
    # Tokenize text
    tokenize_text = clean_string(input_text).split()
    
    # Load model from the selected cuisine
    d2v = load_pkl(f'd2v_{cuisine}.pkl')

    # Get embeddings
    embeddings = d2v.infer_vector(tokenize_text)
    best_recipes = d2v.docvecs.most_similar([embeddings]) #gives you top 10 document tags and their cosine similarity

    # Get recipes
    best_recipes_index = [int(output[0]) for output in best_recipes]
    
    # Get dDtaFrame
    df = get_df_from_db(cuisine)
    
    return df[df.index.isin(best_recipes_index)].head(top_k)

In [15]:
test = ['onions', 'tomato', 'garlic', 'spinach']

In [20]:
get_similar_recipes(test, 'french')

NameError: name 'load_pkl' is not defined