In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import gensim
#from spellchecker import SpellChecker

# To display full text.
# None means "never truncate"; the old -1 sentinel has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Load the Lemmatisation function --------------------------------------------------------
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance: it is used both for the recipe texts and,
# inside getRecipe_cosine, for the search query.
lemmatizer = WordNetLemmatizer()


# Load the Stop words ---------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
 
# Defining custom stop words
# Recipe-specific filler (units, sizes, colours, preparation words) added on top of
# NLTK's English stop words.
# NOTE(review): entries containing "_" or a space look like joined phrase tokens; they
# can only match if the text handed to the vectorizer contains such joined tokens - confirm.
custom_stopwords = ['cup','teaspoon','tablespoon','sweet', 'low', 'high','medium', 'chopped', 'crushed', 'pound', 'small', 'fresh', 'clove', 'oz', 
                'ounce','cut','taste','thinly','lengthwis','extra','garnish','finely','long','short','inch','thin','pieces',
                'wide','lightly','country','discarded','across','package','packed','pieces','extra','squeezed','sometimes',
                'half','free','box','container','jar','equipment','pale','lengthwise','perferably','note','divided','piece',
                'part','separated','bunch','large','lb','kosher','salt','freshly_ground','ground','plus_more','minced',
                'cut_into','peeled','thinly_sliced','tbsp','tsp','gram','dice','room_temperature','coarsely','coarse_kosher','coarse','dash_of',
                'ml','plus','inch_cubes','water','diced','seeded','dried','frozen','such_as','red','white','green','brown','oil','chilled',
                'grated','slice','sliced','thick','to_taste','leaf','ounce_can','peel','hot','cold','wedge','pinch_of',
                'whole','size','remove','removed','strip','special_equipment','optional','trimmed','crosswise','xa','fine','sliced_thin',
                'halved','halved_lengthwise','quartered','inch_thick_slices','crumbled','inch_dice','drained','loosely_packed',
                'preferably','cooked','uncooked','thawed','pitted','cored','canned_low','powder','stick','round','tied',
                'dry_white','softened','soft','head','some_supermarkets','one','two','eight','add','grade','torn','left_intact','dash',
                'deveined','freshly','melted','for_garnish','strong','ounce bag','ounce_package','several','by_inch','quart','quarter',
                'lightly_beaten','if_needed','packed_golden_brown','picked','pale_green_parts_only','prepared','double','mild',
                'slivered','special_equipment_an_instant','at_room_temperature','medium','plus_more_if_needed','medium_size',
                'very_thinly_sliced','wear_rubber_gloves']

# English defaults first, then the custom terms. dict.fromkeys drops the repeated
# entries (e.g. 'pieces', 'extra', 'medium') while keeping first-seen order.
stopwords = list(dict.fromkeys(nltk.corpus.stopwords.words('english') + custom_stopwords))



In [2]:
# Pre Processing function -------------------------------------------------------
import re
def pre_process(text):
    """Normalise raw recipe text into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. drop markup tags, whether written raw (``<b>``) or HTML-escaped
         (``&lt;b&gt;``);
      3. collapse every run of digits and non-word characters into one space.

    Parameters
    ----------
    text : str
        Raw text (e.g. a recipe title followed by its ingredient list).

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces may remain where the
        input started or ended with removed characters.
    """
    # lowercase
    text = text.lower()

    # remove tags (raw or HTML-escaped). The replacement is a plain space: a
    # placeholder made of letters, such as "&lt;&gt;", would survive the next
    # step as the bogus words "lt" and "gt".
    text = re.sub(r"(?:<|&lt;)/?.*?(?:>|&gt;)", " ", text)

    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    return text

In [3]:
# Define a function to calculate cosine similarity
import numpy.linalg as LA
def cx(a, b):
    """Cosine similarity between two 1-D vectors, rounded to 3 decimals.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        inner(a, b) / (|a| * |b|), rounded to 3 decimal places. When either
        vector has zero norm the similarity is undefined; 0.0 is returned
        instead of dividing by zero (which would give nan and a RuntimeWarning).
    """
    denominator = LA.norm(a) * LA.norm(b)
    if denominator == 0:
        return 0.0
    return round(np.inner(a, b) / denominator, 3)

In [4]:
# Reading the data
import pandas as pd
# The JSON file is given as a bare file name, so it is looked up relative to the
# notebook's current working directory.
recipes = pd.read_json('full_format_recipes.json')

In [5]:
# Drop the records that have no title (19 recipes) and renumber the rows from 0.
recipes_final = recipes.dropna(subset=["title"]).reset_index(drop=True)
print(recipes_final.shape)
recipes_final.columns

(20111, 11)


Index(['calories', 'categories', 'date', 'desc', 'directions', 'fat',
       'ingredients', 'protein', 'rating', 'sodium', 'title'],
      dtype='object')

In [6]:
# remove duplicate recipes
# Rows are compared on their string form (presumably because some columns hold
# lists, which cannot be hashed for a direct comparison - confirm).
# A boolean mask replaces the previous `.iloc[<index labels>]` lookup, which was only
# correct while the index labels happened to equal the row positions 0..n-1.
is_duplicate = recipes_final.astype(str).duplicated(keep='first')
recipes_final = recipes_final[~is_duplicate].reset_index(drop=True)

print(recipes_final.shape)
recipes_final.columns

(18295, 11)


Index(['calories', 'categories', 'date', 'desc', 'directions', 'fat',
       'ingredients', 'protein', 'rating', 'sodium', 'title'],
      dtype='object')

In [7]:
# combine "title" and "ingredients" as one string variable
# The columns are addressed by name: positional indexes (6 and 10) silently pick
# other columns whenever the loaded frame's column order differs from the one
# shown above.
recipes_final["ingredients"] = recipes_final["ingredients"].map(str)
recipes_final["title"] = recipes_final["title"].map(str)

recipes_final["titleingrd"] = recipes_final["title"] + recipes_final["ingredients"]

In [8]:
# Make sure the time format is correct
# We will consider only hours and minutes in the time component
# pd.to_datetime leaves datetime data untouched and re-parses already formatted
# strings, so running this cell a second time no longer fails on `.dt`.
recipes_final["date"] = pd.to_datetime(recipes_final["date"]).dt.strftime('%m/%d/%Y %H:%M')

In [9]:
# Clean the combined title + ingredients text of every recipe with pre_process
recipes_final['ingredients_processed'] = recipes_final['titleingrd'].map(pre_process)

In [10]:
# Spot-check the cleaned text of the second recipe (row position 1). The column is
# addressed by name instead of by the position 12.
recipes_final["ingredients_processed"].iloc[1]

'boudin blanc terrine with red onion confit cups whipping cream medium onions chopped teaspoons salt bay leaves whole cloves large garlic clove crushed teaspoon pepper teaspoon ground nutmeg pinch of dried thyme crumbled large shallots minced tablespoon butter pound trimmed boneless center pork loin sinew removed cut into inch chunks well chilled eggs tablespoon all purpose flour cup tawny port tablespoons dried currants minced lettuce leaves cracked peppercorns minced fresh parsley bay leaves french bread baguette slices tablespoons olive oil large red onions halved sliced tablespoons dried currants tablespoons red wine vinegar tablespoons canned chicken broth teaspoons chopped fresh thyme or teaspoon dried crumbled teaspoon sugar '

In [11]:
# Text to be lemmatized: the cleaned title + ingredients of every recipe
# (selected by column name rather than by position 12).
sentence = recipes_final["ingredients_processed"]

# Perform lemmatization for each recipe (row): tokenize, lemmatize every token and
# join the tokens back into one string per recipe. A comprehension keeps the
# per-recipe temporaries out of the notebook's global namespace.
lemmatized_output = [
    ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(recipe_text))
    for recipe_text in sentence
]

In [12]:
# Compare one recipe before and after lemmatization
sample_row = 10
print(f"before: {sentence[sample_row]}")
print(f"after: {lemmatized_output[sample_row]}")

before: yams braised with cream rosemary and nutmeg teaspoons olive oil cup finely chopped shallots teaspoons minced fresh rosemary pounds yams red skinned sweet potatoes peeled cut into inch thick rounds rounds cut in half cups canned low salt chicken broth cup whipping cream ground nutmeg 
after: yam braised with cream rosemary and nutmeg teaspoon olive oil cup finely chopped shallot teaspoon minced fresh rosemary pound yam red skinned sweet potato peeled cut into inch thick round round cut in half cup canned low salt chicken broth cup whipping cream ground nutmeg


In [13]:
# One list of word tokens per lemmatized recipe
lem_out_wordlist = list(map(nltk.word_tokenize, lemmatized_output))

In [14]:
# Build the TF-IDF document-term matrix ---------------------------------------------
# Stop words are removed; terms are ignored when they occur in more than 85% of the
# recipes (max_df) or in fewer than 0.01% of them (min_df). Single words and word
# pairs are both used as features (ngram_range).
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVec = TfidfVectorizer(
    stop_words=stopwords,
    max_df=0.85,
    min_df=0.0001,
    ngram_range=(1, 2),
)

# Despite its name this holds TF-IDF weights, one row per recipe
word_count_vector = TfidfVec.fit_transform(lemmatized_output)

# Dense copy of the matrix, used by the retrieval function
trainVectorizerArray = word_count_vector.toarray()

In [15]:
# Columns shown in the search results
display_columns = ['title', 'date', 'rating', 'calories', 'ingredients', 'directions', 'fat',
                   'protein', 'sodium']

# Work on a copy so that adding a column never touches recipes_final.
recipes_list = recipes_final[display_columns].copy()

# Expose each row's index as an explicit leading 'row_num' column. insert() takes the
# values straight from the frame's own index, whereas the previous concat(axis=1)
# aligned two frames on their labels and was only correct for an index of 0..n-1.
recipes_list.insert(0, 'row_num', recipes_list.index)

In [16]:
# Function to retrieve recipes using the cosine distance between them

def getRecipe_cosine(query = "", sort = True, top_n = 10):
    """Return the recipes most similar to a free-text query.

    The query is cleaned with ``pre_process``, lemmatized with ``lemmatizer`` and
    projected with the fitted ``TfidfVec``. Its cosine similarity to every row of
    ``trainVectorizerArray`` is rounded to 3 decimals, and recipes whose rounded
    score is positive count as matches.

    Parameters
    ----------
    query : str, default ""
        Ingredients / dish words to search for.
    sort : bool, default True
        If True, the best matches are re-ordered by rating and then by date
        (both descending). If False, they stay ordered by decreasing similarity.
    top_n : int, default 10
        Number of best matches to return.

    Returns
    -------
    pandas.DataFrame or None
        The columns of ``recipes_list`` plus ``similarity_val`` for the best
        matches. None, after printing a message, when the query is empty or no
        recipe matches.
    """
    if query is None or str(query).strip() == "":
        print('no match found ')
        return None

    # Give the query the same treatment as the corpus: clean, tokenize, lemmatize
    query_processed = pre_process(str(query))
    query_wlist = nltk.word_tokenize(query_processed)
    lemmatized_query = ' '.join(lemmatizer.lemmatize(w) for w in query_wlist)

    # Vectorize the query; a zero vector means none of its words is in the vocabulary
    query_vector = TfidfVec.transform([lemmatized_query]).toarray()[0]
    query_norm = LA.norm(query_vector)
    if query_norm == 0:
        print(" No Recipes with "+ str(query))
        return None

    # Cosine similarity against all recipes at once instead of one Python-level
    # call per row. einsum computes the squared row norms without building a
    # temporary as large as the matrix itself.
    dot_products = trainVectorizerArray @ query_vector
    doc_norms = np.sqrt(np.einsum('ij,ij->i', trainVectorizerArray, trainVectorizerArray))
    similarity = np.zeros(len(doc_norms))
    has_terms = doc_norms > 0  # all-zero recipe vectors keep similarity 0
    similarity[has_terms] = dot_products[has_terms] / (doc_norms[has_terms] * query_norm)
    similarity = np.round(similarity, 3)

    row_num = np.flatnonzero(similarity > 0)
    if row_num.size == 0:
        print(" No Recipes with "+ str(query))
        return None

    # Sort by the highest similarity value (stable sort: ties keep row order)
    matches = pd.DataFrame({'similarity_val': similarity[row_num], 'row_num': row_num})
    matches = matches.sort_values(by='similarity_val', ascending=False, kind='mergesort')
    print(matches.shape[0], "recipes matched")

    # Keep the best matches and attach their similarity score; a left merge keeps
    # the similarity order of `results`.
    top_matches = matches.head(top_n)
    results = recipes_list.iloc[top_matches['row_num'].to_numpy()]
    results_score = pd.merge(results, top_matches, how='left', on='row_num')

    if sort:
        # Order the frame that is actually returned: rating first, most recent
        # date as tie-breaker. Dates are stored as 'mm/dd/YYYY HH:MM' text, so they
        # are parsed first - sorting the raw strings would order them by month.
        date_key = pd.to_datetime(results_score['date'], format='%m/%d/%Y %H:%M',
                                  errors='coerce')
        results_score = (
            results_score.assign(_date_key=date_key)
            .sort_values(by=['rating', '_date_key'], ascending=False)
            .drop(columns='_date_key')
            .reset_index(drop=True)
        )

    return results_score

# Recipe Search

In [17]:
# To display full text.
# None means "never truncate"; the old -1 sentinel has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [21]:
# Try the retrieval for a few test queries
# NOTE(review): the query is typed in interactively, so the results shown below depend
# on what was entered and are not reproduced by re-running the notebook unattended.
query = input("Enter your ingredients search here  ")

Enter your ingredients search here  fish, chilli and pasta


In [22]:
%%time
# sort = False: the returned matches are ordered by decreasing similarity score
getRecipe_cosine(query,sort = False)

960 recipes matched
Wall time: 3.24 s


Unnamed: 0,row_num,title,date,rating,calories,ingredients,directions,fat,protein,sodium,similarity_val
0,4595,Chilli Oil,03/14/2014 04:00,0.0,411.0,"['600ml (1 pint) rapeseed oil', '1 large red chilli, split in half', '1 lemon grass stalk', '1 garlic clove', '20g (3/4oz) root ginger, sliced but not peeled']","[Gently warm the oil through in a heavy-based pan but do not allow it to boil. Bring to a gentle simmer, then add the chilli, lemon grass, garlic and ginger. Continue to simmer very gently for 20–30 minutes, until the flavours are well infused. It's important not to allow it to boil at any stage. Pour into a squeezy bottle, leaving the bits in, as the flavours will continue to infuse. Use as required.]",46.0,0.0,1.0,0.427
1,15244,Sweet Potato and Coconut Soup,03/14/2014 04:00,4.375,,"['450g (1lb) sweet potatoes, cut into cubes', '2 tbsp sunflower oil', '1 onion, finely chopped', '1 leek, trimmed and finely chopped', '1 lemon grass stalk, trimmed and halved', '1 red chilli, halved, seeded and thinly sliced', '1 tsp freshly grated root ginger', '1.25 litres (2 1/4 pints) vegetable stock', '4 tsp tomato purée', '400g can coconut milk', 'sea salt and freshly ground black pepper', 'chilli oil , to garnish', 'fresh micro coriander, to garnish']","[Preheat the oven to 200°C (400°F/gas mark 6)., Place the sweet potatoes in a baking tin, drizzle over 1 tablespoon of the sunflower oil and roast for 20–30 minutes, until tender. Set aside., Heat the remaining 1 tablespoon of oil in a pan over a medium heat. Add the onion, leek, lemon grass, half the chilli and the ginger and sweat for 4 minutes, stirring occasionally. Add the roasted sweet potato with the stock and tomato purée, then bring to the boil. Reduce the heat and simmer for 10 minutes, until the liquid has slightly reduced and all the vegetables are completely tender., Pour the coconut milk into the pan, reserving about 3 tablespoons from the top of the can as a garnish, and cook for another 5 minutes, stirring constantly. Season to taste. Remove the lemon grass and then blend with a hand blender until smooth., To serve, ladle the soup into warmed bowls and swirl in the reserved coconut milk. Add a drizzle of the chilli oil, then sprinkle over the reserved chilli slices and micro coriander.]",,,,0.286
2,3401,Quince Sambal,05/16/2006 20:12,2.5,,"['1 ripe quince', 'salt', '1 small onion, grated', '2 ml (1/2 teaspoon) crushed garlic', '1 small red or green chilli, sliced, seeded and finely chopped', '30 ml (2 tablespoons) sugar', '30 ml (2 tablespoons) lemon juice']","[Peel and core the quince, and grate coarsely or slice into the finest slivers. Pile in a bowl, sprinkle with salt and set aside for 1-2 hours. Rinse under cold running water, drain well, and dry thoroughly with a clean tea towel. Mix with the remaining ingredients, cover and chill until required.]",,,,0.268
3,13461,Dukkah-Crusted Salmon With Cucumber and Chilli Salad,09/01/2016 19:14,1.875,,"['1 1/2 cups (75g) puffed amaranth', '2 tablespoons store-bought dukkah', '1 teaspoon sea salt flakes', '4 (200g) salmon fillets, skin removed', '2 eggs, lightly beaten', '2 tablespoons extra-virgin olive oil', '1 long green chilli, thinly sliced', '1/4 cup (60ml) extra virgin olive oil, plus more', '1/4 cup (60ml) lime juice', '1 clove garlic, crushed', '2 tablespoons chopped cilantro', 'Sea salt and cracked black pepper', '4 cups (50g) snow pea tendrils', '2 Lebanese cucumbers (260g), thinly sliced', 'Chervil sprigs, to serve']","[Place the amaranth, dukkah and salt on a small tray and toss to combine. Dip each salmon fillet in the egg and press into the dukkah mixture to coat. Heat the oil in a large non-stick frying pan over medium heat. Cook the salmon, turning every 3–4 minutes, for 10–12 minutes or until just cooked through and the crumb is golden. Place the chilli, extra oil, lime juice, garlic, cilantro, salt and pepper in a medium bowl and whisk to combine. Add the snow pea tendrils and cucumber and toss to coat. Divide the salmon and salad between serving plates and top with chervil to serve.]",,,,0.242
4,9153,Fish Stock,10/12/2011 04:00,0.0,22.0,"['5 tsp olive oil', '14 oz crabs', '3 3/4 lb fish', '1 gallon water']","[Put a large pot over medium heat, then add the olive oil. Add the crabs. Cook for 3-5 minutes. Add the fish. Pour in the water and bring to a simmer. Skim the foam from the surface. Simmer for 20 minutes, then strain through a fine-mesh sieve. Cool before putting into containers.]",1.0,4.0,53.0,0.199
5,14523,Crab Fritters with Spicy Lime Sauce,08/20/2004 04:00,3.75,311.0,"['1/2 cup fresh lime juice', '6 tablespoons fish sauce (nam pla)', '1/4 cup pure maple syrup', '3 jalapeño chillies, seeded, chopped', '2 garlic cloves, chopped', '1/4 cup mayonnaise', '1 tablespoon Dijon mustard', '1/2 teaspoon grated lemon peel', '8 ounces fresh crabmeat, drained', '2 grapefruits', '2 oranges', '1 1/2 cupspanko (japanese breadcrumbs)', '2 firm but ripe avocados, halved, pitted, peeled, sliced', '2 tablespoons chopped fresh cilantro', '2 tablespoons chopped fresh chives']","[Mix all ingredients in large bowl to blend. Cover and chill at least 1 hour or overnight to develop flavors., Mix mayonnaise, mustard, lemon peel and hot pepper sauce in medium bowl to blend. Mix in crabmeat. Season to taste with salt and pepper. Shape crab mixture into six 2 1/4-inch-diameter patties, using 1/4 cup mixture for each. Place fritters on baking sheet. Cover and refrigerate at least 3 hours., Cut all peel and white pith from grapefruits. Using small sharp knife, cut between membranes to release segments. Repeat with oranges. Chill segments until ready to serve., Place panko in shallow dish. Cost each fritter with panko, pressing to adhere. Pour oil into heavy large skillet to depth of 1/2 inch and heat to 350°F. Cook fritters in batches until brown and heated through, about 2 minutes per side. Drain fritters on paper towels., Fan avocado slices and grapefruit and orange segments on plates. Drizzle 1 tablespoon sauce over each plate. Top each with 1 fritter. Sprinkle cilantro and chives over and serve.]",18.0,10.0,1717.0,0.197
6,706,Blatjang,05/16/2006 20:12,5.0,342.0,"['250 g (8 ounces) dried apricots, chopped', '250 g (8 ounces) seedless raisins', '3 litres (12 cups) grape (wine or cider) vinegar', '4 large onions, finely chopped', '4 cloves garlic, crushed', '500 g (1 pound) brown sugar', '200 g (6 1/2 ounces) flaked almonds', '30 ml (2 tablespoons) salt', '45 ml (3 tablespoons) ground ginger', '30 ml (2 tablespoons) ground coriander', '30 ml (2 tablespoons) mustard seeds', '10 ml (2 teaspoons) chilli powder']","[Combine the apricots, raisins and vinegar in a 5-litre (5-quart) saucepan. Soak overnight to plump the fruit. Alternatively, if time is tight, simply cover, bring to the boil and set aside for about 2 hours., Add the remaining ingredients, and cook uncovered over medium heat, stirring occasionally at first, then constantly towards the end of the cooking time, until the chutney has reduced to about one-third, and is beautifully thick. It should take 1 1/2-2 hours. To know when it is ready for bottling, test the consistency by putting a little in the freezer to cool. Pour into hot, sterilized jars, seal and store in a cool, dark cupboard.]",6.0,5.0,437.0,0.186
7,3400,Lobster Curry,05/16/2006 20:12,5.0,,"['4 lobsters', '250 ml (1 cup) Fish Stock', 'vegetable oil', '12 pickling onions, peeled', '10 ml (2 teaspoons) crushed garlic', '1-2 fresh chillies, finely sliced and seeded', '4-5 curry leaves', '5 ml (1 teaspoon) ground cumin', '5 ml (1 teaspoon) ground coriander', '2 ml (1/2 teaspoon) turmeric', '1 ml (1/4 teaspoon) ground cardamom', '1 stick cinnamon', '4 large, ripe tomatoes blanched, skinned, and chopped', 'lemon juice', 'salt, milled black pepper', '60 ml (1/4 cup) plain yoghurt (optional)']","[Kill the lobsters by plunging the tip of a large sharp knife straight down behind the lobsters' eyes. Separate tails from bodies. Wash well. Cut off fanned tail shells; set aside. Pull out the alimentary canal. Slice tails through lengthwise (first snip through the under-shell with scissors)., Bring the fish stock to the boil in a large saucepan, add lobster bodies and tail fans and boil for 6 minutes (no longer or you will overcook the legs). Remove from the pot. Pull off legs and claws, cover and set aside with the tail fans (they will later decorate the completed dish). Strain and retain the stock for the sauce., Heat a little oil in a large saucepan and lightly cook the lobster tail pieces for about 1 minute. Set aside., Lightly brown the onions and garlic in the same pot (add extra oil if necessary), then add the chilli, curry leaves, cumin, coriander, turmeric, cardamom and cinnamon. Sizzle the spices for about 30 seconds, then add the chopped tomato, lemon juice, stock and a little salt and pepper. Cover and simmer for about 10 minutes., Just before serving return the lobster to the sauce and simmer for 2-3 minutes until cooked. Add the yoghurt and heat through. Tip into a warm bowl, or serve directly from the pan. Garnish with the reserved tail fans and legs, and serve with Yellow Rice and Quince Sambal.]",,,,0.182
8,11571,Vegetable Patch Pasta Salad,08/20/2004 04:00,3.125,359.0,"['1 cup (about 4 ounces) small pasta shells or orzo (rice-shaped pasta)', '1 large lemon', '2 tablespoons olive oil', '1 carrot, grated', '4 radishes, trimmed, thinly sliced', '1/3 cup chopped fresh chives or green onions']","[Cook pasta in medium pot of boiling salted water until just tender but still firm to bite. Drain pasta; rinse under cold water and drain well., Grate enough lemon peel to measure 1/2 teaspoon. Squeeze enough juice from lemon to measure 1 1/2 tablespoons. Place lemon peel and juice in medium bowl. Whisk in oil. Mix in carrot, radishes and chives, then pasta. Season to taste with salt and pepper. Let stand 10 minutes to blend flavors.]",15.0,9.0,35.0,0.179
9,11578,Three Dipping Sauces,08/20/2004 04:00,0.0,162.0,"['50 ml (3 tablespoons plus 1 teaspoon) peanut oil', '2 dried chillies', '3 spring onions (scallions), white part, with 2 cm (3/4-inch) of green left on, finely sliced', '1 large knob ginger, finely diced', '2 cloves garlic', '15 ml (1 tablespoon) shaohsing wine', '15 ml (1 tablespoon) rice-wine vinegar', '2 tablespoons sea salt', '2 tablespoons superfine sugar', '100 ml (1/2 cup) Chinese black rice vinegar', '2 large knobs ginger, finely diced', '30 ml (2 tablespoons) peanut oil', '4 spring onions (scallions), sliced into rounds', '2 large knobs ginger, finely diced', '3 cloves garlic, finely diced', '2 red chillies, sliced', '60 ml (1/4 cup) bean paste', '60 ml (1/4 cup) shaohsing wine', '60 ml (1/4 cup) rice-wine vinegar', '4 tablespoons (1/4 cup) crushed yellow rock (or light brown) sugar']","[Heat the peanut oil in a wok and fry the chillies until they blacken. Discard the chillies and allow the oil to cool. In a mortar and pestle crush all the other ingredients lightly. As the oil cools add to the mortar and mix well. Leave for a little while to allow the flavours to marry., Very good on boiled and fried dishes, and a great dressing for grilled scallop salad., Mix the diced ginger with the vinegar and allow to stand for 1 hour before serving., You'll find this easy to make but very effective on boiled meats., In a wok, heat up the peanut oil and fry the spring onions, ginger, garlic and chillies for 3 minutes. Add all the other ingredients and reduce the sauce by half. Remove from the heat and cool., Can be used as a cold dipping sauce or heated up and tossed with steamed vegetables.]",10.0,2.0,1763.0,0.179


# Evaluation of Document Retrieval 

### Precision & Recall

Precision measures: "Of all the documents we retrieved as relevant, how many are actually relevant?"
Recall measures: "Of all the actually relevant documents, how many did we retrieve as relevant?"
F-Score / F-measure is the weighted harmonic mean of precision and recall. The traditional F-measure or balanced F-score is: $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$

In [117]:
# Counts the metrics are computed from
relevant_retrieved = 6
total_retrieved = 10
total_relevant = 20  # arbitrary number here

# Precision = No. of relevant documents retrieved / No. of total documents retrieved
precision = relevant_retrieved / total_retrieved
print ("Precision =", precision)

# Recall = No. of relevant documents retrieved / No. of total relevant documents
recall = relevant_retrieved / total_relevant
print ("Recall = ", recall)

# F-Score = harmonic mean of precision and recall
Fscore = 2 * precision * recall /(precision + recall)
print ("F-Score =", Fscore)

Precision = 0.6
Recall =  0.3
F-Score = 0.4
