In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import gensim

# To display full text
pd.set_option('display.max_colwidth', -1)

# Load the Lemmatisation function --------------------------------------------------------
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# Load the Stop words ---------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
 
# Defining custom stop words
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords  + ['cup','teaspoon','tablespoon','sweet', 'low', 'high','medium', 'chopped', 'crushed', 'pound', 'small', 'fresh', 'clove', 'oz', 
                'ounce','cut','taste','thinly','lengthwis','extra','garnish','finely','long','short','inch','thin','pieces',
                'wide','lightly','country','discarded','across','package','packed','pieces','extra','squeezed','sometimes',
                'half','free','box','container','jar','equipment','pale','lengthwise','perferably','note','divided','piece',
                'part','separated','bunch','large','lb','kosher','salt','freshly_ground','ground','plus_more','minced',
                'cut_into','peeled','thinly_sliced','tbsp','tsp','gram','dice','room_temperature','coarsely','coarse_kosher','coarse','dash_of',
                'ml','plus','inch_cubes','water','diced','seeded','dried','frozen','such_as','red','white','green','brown','oil','chilled',
                'grated','slice','sliced','thick','to_taste','leaf','ounce_can','peel','hot','cold','wedge','pinch_of',
                'whole','size','remove','removed','strip','special_equipment','optional','trimmed','crosswise','xa','fine','sliced_thin',
                'halved','halved_lengthwise','quartered','inch_thick_slices','crumbled','inch_dice','drained','loosely_packed',
                'preferably','cooked','uncooked','thawed','pitted','cored','canned_low','powder','stick','round','tied',
                'dry_white','softened','soft','head','some_supermarkets','one','two','eight','add','grade','torn','left_intact','dash',
                'deveined','freshly','melted','for_garnish','strong','ounce bag','ounce_package','several','by_inch','quart','quarter',
                'lightly_beaten','if_needed','packed_golden_brown','picked','pale_green_parts_only','prepared','double','mild',
                'slivered','special_equipment_an_instant','at_room_temperature','medium','plus_more_if_needed','medium_size',
                'very_thinly_sliced','wear_rubber_gloves']

In [2]:
# Pre Processing function -------------------------------------------------------
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [3]:
# Reading the data
import pandas as pd
recipes = pd.read_json('full_format_recipes.json')

In [4]:
# remove records with missing title
recipes_final=recipes.dropna(subset=["title"], inplace=False)
recipes_final.reset_index(drop=True, inplace=True)
print(recipes_final.shape)
recipes_final.columns

(20111, 11)


Index(['calories', 'categories', 'date', 'desc', 'directions', 'fat',
       'ingredients', 'protein', 'rating', 'sodium', 'title'],
      dtype='object')

In [5]:
# remove duplicate recipes
recipes_final=pd.DataFrame(recipes_final)

recipes_final = recipes_final.iloc[recipes_final.astype(str).drop_duplicates().index]
recipes_final.reset_index(drop=True, inplace=True)

print(recipes_final.shape)
recipes_final.columns

(18295, 11)


Index(['calories', 'categories', 'date', 'desc', 'directions', 'fat',
       'ingredients', 'protein', 'rating', 'sodium', 'title'],
      dtype='object')

In [6]:
# combine "title" and "ingredients" as one string variable
recipes_final.iloc[:,6] = [str(ingrd) for ingrd in (recipes_final.iloc[:,6])]
recipes_final.iloc[:,10] = [str(title) for title in (recipes_final.iloc[:,10])]

recipes_final["titleingrd"]=recipes_final["title"]+recipes_final["ingredients"]

In [7]:
# Make sure the time format is correct
# We will consider only hours and minutes in the time component
recipes_final.date = recipes_final.date.dt.strftime('%m/%d/%Y %H:%M')

In [8]:
# Apply the above function to preprocess ingredients
recipes_final['ingredients_processed'] = recipes_final['titleingrd'].apply(lambda x:pre_process(x))

In [9]:
recipes_final.columns
recipes_final.iloc[1,12]

'boudin blanc terrine with red onion confit cups whipping cream medium onions chopped teaspoons salt bay leaves whole cloves large garlic clove crushed teaspoon pepper teaspoon ground nutmeg pinch of dried thyme crumbled large shallots minced tablespoon butter pound trimmed boneless center pork loin sinew removed cut into inch chunks well chilled eggs tablespoon all purpose flour cup tawny port tablespoons dried currants minced lettuce leaves cracked peppercorns minced fresh parsley bay leaves french bread baguette slices tablespoons olive oil large red onions halved sliced tablespoons dried currants tablespoons red wine vinegar tablespoons canned chicken broth teaspoons chopped fresh thyme or teaspoon dried crumbled teaspoon sugar '

In [10]:
# Define the sentence to be lemmatized
sentence = recipes_final.iloc[:,12]

# Perform lemmatization for each recipe (row)
lemmatized_output = []
for rcp in sentence:
    word_list = nltk.word_tokenize(rcp)
    lo = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    lemmatized_output.append(lo)

In [11]:
# Checking lemmatization
print("before:",sentence[10])
print("after:",lemmatized_output[10])

before: yams braised with cream rosemary and nutmeg teaspoons olive oil cup finely chopped shallots teaspoons minced fresh rosemary pounds yams red skinned sweet potatoes peeled cut into inch thick rounds rounds cut in half cups canned low salt chicken broth cup whipping cream ground nutmeg 
after: yam braised with cream rosemary and nutmeg teaspoon olive oil cup finely chopped shallot teaspoon minced fresh rosemary pound yam red skinned sweet potato peeled cut into inch thick round round cut in half cup canned low salt chicken broth cup whipping cream ground nutmeg


In [12]:
# Tokenize the words in each recipe
lem_out_wordlist = [nltk.word_tokenize(x) for x in lemmatized_output]

In [13]:
# Subset columns to display
recipes_list = recipes_final[[ 'title','date','rating','calories', 'ingredients', 'directions', 'fat',
        'protein','sodium']]
# Reset index name 
recipes_list = pd.concat([pd.DataFrame(list(recipes_list.index)),recipes_list], axis=1)

# Rename the columns
recipes_list.columns = [ 'row_num' ,'title','date','rating','calories', 'ingredients', 'directions', 'fat',
        'protein','sodium']

In [14]:
# Jaccard similarity function
# Lower the  better
from math import*
  
def jaccard_similarity(x,y):
  
 intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
 union_cardinality = len(set.union(*[set(x), set(y)]))
 return intersection_cardinality/float(union_cardinality)

In [15]:
from nltk.util import ngrams 

def bigram_recipe(txt):
    if not txt: return None
    ng = ngrams(txt, 2)
    #ng3 = ngrams(txt, 3)
    ub = txt + list(ng)
    return(ub)


In [16]:
#bigram_recipe(lem_out_wordlist[0])


In [17]:
def getRecipe_Jaccard(query = "", sort = True):
    
    # Define empty arrays
    similarity = [] # Distance between document
    row_numj = [] # Row number of the document 
    
    # Pre-process query
    query_processed = pre_process(str(query))
    
    # Lemmatize the query
    query_wlist = nltk.word_tokenize(query_processed)
    #lemmatized_query = ' '.join([lemmatizer.lemmatize(w) for w in query_wlist])
    
    for x in range(len(lem_out_wordlist)):
        vector = lem_out_wordlist[x]
        jacc = jaccard_similarity(bigram_recipe(vector), bigram_recipe(query_wlist)) ## bigram
        #jacc = jaccard_similarity(vector, query_wlist) ## Unigram
        if jacc > 0:
            similarity.append(jacc)
            row_numj.append(x)
    
    # Concatenate the columns into a dataframe
    matches = pd.concat([pd.DataFrame(similarity),pd.DataFrame(row_numj)],axis = 1)
    if matches.empty:
        return(print(" No Recipes with "+ query))
# Provide descriptive the column names
    matches.columns = ['similarity_val','row_num']
# Rank the results
    matches = matches.sort_values(by=['similarity_val'],ascending=False)
    print(matches.shape[0], "recipes matched")
    
    # Save the top 10 in a dataframe
    results = pd.DataFrame(recipes_list.iloc[matches.row_num[0:10],:])
    
    results_score = pd.merge(results,matches[:10], how = 'left', on = 'row_num')
    
    if(sort):
        # Sort by time
        results = results.sort_values(by = ['date'], ascending=False)
        # sort my ratings
        results = results.sort_values(by = ['rating'], ascending=False)
    else:
        results = pd.DataFrame(recipes_list.iloc[matches.row_num[:10],:])
        results_score = pd.merge(results,matches[:10], how = 'left', on = 'row_num')
        results_score = results_score.sort_values(by = ['similarity_val'],ascending=False)
        #results.to_excel('query_results_jc.xlsx')
        
    #Return the dataset
    if query != '':
        return(results_score)
    

# Recipe Search

In [18]:
# Try the retrieval for a few test queries
query = input("Enter your ingredients search here  ")

Enter your ingredients search here  carrot cake


In [19]:
%%time
getRecipe_Jaccard(str(query))

1971 recipes matched
Wall time: 1.53 s


Unnamed: 0,row_num,title,date,rating,calories,ingredients,directions,fat,protein,sodium,similarity_val
0,16187,Boiled Carrots with Prepared Horseradish,03/11/2013 04:00,0.0,,[],"[Cut 3 medium carrots into 1/4-inch thick rounds. Place in a small saucepan. Add water to cover, 1/2 teaspoon salt, and a grind or two of black pepper. Bring to boiling over high heat; boil until the carrots are tender—about 10 minutes. Drain in a colander and serve warm, with prepared horseradish on the side.]",,,,0.090909
1,17385,Carrot Cake,01/13/2012 04:00,4.375,315.0,"['1 pound carrots (6 or 7 large carrots), peeled', '2 1/2 cups flour', '1 1/4 teaspoons baking powder', '1 teaspoon baking soda', '1 1/4 teaspoons ground cinnamon', '1/2 teaspoon ground nutmeg', '1/8 teaspoon ground cloves', '1/2 teaspoon salt', ""1 1/2 cups confectioners' sugar"", '1/2 cup packed dark brown sugar', '1 1/4 cups vegetable oil', '4 eggs, lightly beaten']","[1 Preheat the oven to 350°F. Grease two 9-inch round pans or one 9 by 13-inch pan or with cooking spray or oil. In addition, you can also place a cut-out parchment circle on the bottom of the pan., 2 Grate the carrots with a grater (the old-school way!) or a food processor. I prefer medium-grated carrots—not too big, not too fine., 3 In a large bowl, mix together the flour, baking powder, baking soda, cinnamon, nutmeg, cloves, salt, confectioner's sugar, and brown sugar. Stir in the oil and eggs, then the carrots., 4 Pour the batter into the prepared pan and bake for 30 to 40 minutes, or until a toothpick comes out clean., 5 Cool completely, then frost cream cheese frosting.]",19.0,4.0,219.0,0.047619
2,10119,Carrot Cake Smoothie,11/04/2014 04:00,5.0,403.0,"['2 tablespoons unsweetened shredded coconut (or flakes)', '2 tablespoons roughly chopped walnuts', '3/4 cup grated carrot (1 large carrot)', '1/2 cup frozen peeled orange segments', '1/2 banana, sliced and frozen', '1/2 cup low-fat Greek yogurt', '1 teaspoon honey', '1/4 teaspoon ground cinnamon', '1 teaspoon vanilla extract', '1/2 cup coconut water']",[Place ingredients in blender in the order listed and blend until smooth.],20.0,20.0,233.0,0.04
3,10006,Carrot and Squash Ribbons,08/20/2004 04:00,3.125,97.0,"['2 medium carrots', '2 medium yellow squash', '2 medium zucchini', '1 tablespoon olive oil']","[Trim vegetables and cut lengthwise into 116-inch-thick ribbons with a U-shaped vegetable peeler., Have ready a bowl of ice and cold water. Cook carrots in a large pot of boiling salted water 2 minutes. Add both squashes and cook until vegetables are crisp-tender, 1 to 2 minutes., Drain vegetables and transfer to ice water, then drain in a colander., Heat oil in a large skillet over moderate heat until hot but not smoking, then cook vegetables, tossing, until heated through. Season with salt and pepper.]",3.0,2.0,24.0,0.04
4,4145,Sesame Carrots,12/11/2012 04:00,3.75,28.0,"['4 medium carrots', '1 tablespoon toasted sesame oil', 'kosher salt', 'freshly ground black pepper']","[Peel and cut 4 medium carrots into long matchstick-size pieces. Heat 1 tablespoon toasted sesame oil in a large skillet, preferably nonstick, over medium heat. Add carrots and cook, stirring occasionally, until just tender, 3-4 minutes. Season with kosher salt and freshly ground black pepper.]",2.0,0.0,26.0,0.037037
5,7171,Layer Cake,08/20/2004 20:58,0.0,,"['1/2 ounce crème de cacao', '1/2 ounce apricot brandy', '1/2 ounce heavy cream', 'Maraschino cherry']",[Pour in the order listed so that each ingredient floats on top of the one before it. Carefully place the cherry on top. Chill before serving. .],,,,0.037037
6,7470,Potato Cakes with Leek and Carrot,08/20/2004 04:00,3.75,425.0,"['1 pound russet potatoes, peeled, coarsely grated', '1 cup sliced leek (white and pale green parts only)', '2/3 cup coarsely grated peeled carrot', '4 tablespoons (1/2 stick) butter', 'Sour cream (optional)']","[Wrap grated potatoes in several layers of paper towels and squeeze dry. Place potatoes in large bowl. Add leek and carrot and toss to combine. Season generously with salt and pepper., Melt 2 tablespoons butter in each of 2 heavy medium skillets over medium-low heat. Add half of vegetable mixture (about 2 cups) to each skillet. Using metal spatula, flatten vegetables in each skillet to 7- to 8-inch-diameter cake. Cover skillets and cook cakes until crisp and brown at edges, about 12 minutes. Turn cakes over. Cook uncovered until vegetables are cooked through and cakes are crisp and brown on bottom, about 5 minutes longer. (Can be prepared 3 hours ahead. Transfer to baking sheet and let stand at room temperature. Rewarm in 375°F oven until crisp, about 10 minutes.) Transfer cakes to plates. Serve with sour cream, if desired.]",23.0,6.0,48.0,0.036364
7,11276,Garlic Soup,08/20/2004 04:00,1.25,91.0,"['2 cups water', '1 cup garlic cloves, peeled', '1 cup chopped peeled potatoes', '1 cup chopped carrots', 'Whipping cream (optional)']","[Combine first 4 ingredients in medium saucepan. Bring to boil. Cover, reduce heat to low and simmer until vegetables are tender, about 25 minutes. Puree in batches in blender. Season with salt and pepper. Rewarm over medium heat. Drizzle with cream, if desired.]",0.0,3.0,31.0,0.033333
8,9300,3-Ingredient Gingersnap Icebox Cake,09/01/2016 15:05,0.0,325.0,"['2 cups heavy cream', '42 gingersnap cookies (about 10 ounces), divided', '3/4 cup orange marmalade, divided']","[Using an electric mixer on medium-high speed, whip cream to stiff peaks in a large bowl., Arrange 10 cookies in a circle (8 around the edge and 2 in the middle) on a cake stand or large plate, breaking 1 center cookie to fit. Top with one-quarter of whipped cream, spreading gently almost to edge of cookies (you’ll want to see them peeking out). Dollop one-quarter of marmalade over. Repeat with 30 cookies and remaining cream and marmalade to make 4 layers, ending with marmalade., Place remaining 2 cookies in a resealable plastic bag. Using the back of a spoon, rolling pin, or meat mallet, crush to fine crumbs. Sprinkle crumbs over top layer, then chill at least 8 hours and up to 2 days.]",15.0,3.0,206.0,0.033333
9,937,Inside-Out Carrot Cake Cookies,08/20/2004 04:00,4.375,297.0,"['1 1/8 cups all-purpose flour', '1 teaspoon cinnamon', '1/2 teaspoon baking soda', '1/2 teaspoon salt', '1 stick (1/2 cup) unsalted butter, softened', '1/3 cup plus 2 tablespoons packed light brown sugar', '1/3 cup plus 2 tablespoons granulated sugar', '1 large egg', '1/2 teaspoon vanilla', '1 cup coarsely grated carrots (2 medium)', '1 scant cup walnuts (3 ounces), chopped', '1/2 cup raisins (2 1/2 ounces)', '8 ounces cream cheese', '1/4 cup honey']","[Put oven racks in upper and lower thirds of oven and preheat oven to 375°F. Butter 2 baking sheets., Whisk together flour, cinnamon, baking soda, and salt in a bowl., Beat together butter, sugars, egg, and vanilla in a bowl with an electric mixer at medium speed until pale and fluffy, about 2 minutes. Mix in carrots, nuts, and raisins at low speed, then add flour mixture and beat until just combined., Drop 1 1/2 tablespoons batter per cookie 2 inches apart on baking sheets and bake, switching position of sheets halfway through baking, until cookies are lightly browned and springy to the touch, 12 to 16 minutes total. Cool cookies on sheets on racks 1 minute, then transfer cookies to racks to cool completely., While cookies are baking, blend cream cheese and honey in a food processor until smooth., Sandwich flat sides of cookies together with a generous tablespoon of cream cheese filling in between.]",18.0,4.0,216.0,0.032967


# Evaluation of Document Retrieval 

### Precision & Recall

Precision measures "of all the documents we retrieved as relevant how many are actually relevant?"
Recall measures "Of all the actual relevant documents how many did we retrieve as relevant?"
F-Score / F-measure is the weighted harmonic mean of precision and recall. The traditional F-measure or balanced F-score is:

In [117]:
# Precision = No. of relevant documents retrieved / No. of total documents retrieved
precision = 6/10
print ("Precision =", precision)

# Recall = No. of relevant documents retrieved / No. of total relevant documents
recall = 6/20 # orbitary number here
print ("Recall = ", recall)

Fscore = 2 * precision * recall /(precision + recall)
print ("F-Score =", Fscore)

Precision = 0.6
Recall =  0.3
F-Score = 0.4
