In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import gensim
#from spellchecker import SpellChecker

# To display full text
pd.set_option('display.max_colwidth', -1)

# Load the Lemmatisation function --------------------------------------------------------
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# Load the Stop words ---------------------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
 
# Defining custom stop words
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords  + ['cup','teaspoon','tablespoon','sweet', 'low', 'high','medium', 'chopped', 'crushed', 'pound', 'small', 'fresh', 'clove', 'oz', 
                'ounce','cut','taste','thinly','lengthwis','extra','garnish','finely','long','short','inch','thin','pieces',
                'wide','lightly','country','discarded','across','package','packed','pieces','extra','squeezed','sometimes',
                'half','free','box','container','jar','equipment','pale','lengthwise','perferably','note','divided','piece',
                'part','separated','bunch','large','lb','kosher','salt','freshly_ground','ground','plus_more','minced',
                'cut_into','peeled','thinly_sliced','tbsp','tsp','gram','dice','room_temperature','coarsely','coarse_kosher','coarse','dash_of',
                'ml','plus','inch_cubes','water','diced','seeded','dried','frozen','such_as','red','white','green','brown','oil','chilled',
                'grated','slice','sliced','thick','to_taste','leaf','ounce_can','peel','hot','cold','wedge','pinch_of',
                'whole','size','remove','removed','strip','special_equipment','optional','trimmed','crosswise','xa','fine','sliced_thin',
                'halved','halved_lengthwise','quartered','inch_thick_slices','crumbled','inch_dice','drained','loosely_packed',
                'preferably','cooked','uncooked','thawed','pitted','cored','canned_low','powder','stick','round','tied',
                'dry_white','softened','soft','head','some_supermarkets','one','two','eight','add','grade','torn','left_intact','dash',
                'deveined','freshly','melted','for_garnish','strong','ounce bag','ounce_package','several','by_inch','quart','quarter',
                'lightly_beaten','if_needed','packed_golden_brown','picked','pale_green_parts_only','prepared','double','mild',
                'slivered','special_equipment_an_instant','at_room_temperature','medium','plus_more_if_needed','medium_size',
                'very_thinly_sliced','wear_rubber_gloves']

In [2]:
# Pre Processing function -------------------------------------------------------
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [3]:
# Define a function to calculate cosine similarity
import numpy.linalg as LA
cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)

In [4]:
# Reading the data
# Rename the "combined data" sheet to s1
import pandas as pd
recipes = pd.read_excel('recipes_topic_mixture_final.xlsx', sheet_name="s1")

In [5]:
recipes1 = recipes[['title','date','rating','calories', 'ingredients', 'directions', 'fat',
        'protein','sodium', 'Max Value', 'by Topic Name ']]

In [6]:
# remove records with missing title
# 19 recipes removed
# recipes_final = recipes[recipes["ingredients"].notnull()]
recipes_final=recipes1.dropna(subset=["title"], inplace=False)
recipes_final.reset_index(drop=True, inplace=True)
print(recipes_final.shape)
recipes_final.columns

(18295, 11)


Index(['title', 'date', 'rating', 'calories', 'ingredients', 'directions',
       'fat', 'protein', 'sodium', 'Max Value', 'by Topic Name '],
      dtype='object')

In [7]:
# remove duplicate recipes
recipes_final=pd.DataFrame(recipes_final)
#recipes_final.drop_duplicates(subset="title", keep='first', inplace=True)
recipes_final = recipes_final.iloc[recipes_final.astype(str).drop_duplicates().index]
recipes_final.reset_index(drop=True, inplace=True)

print(recipes_final.shape)
recipes_final.columns

(18295, 11)


Index(['title', 'date', 'rating', 'calories', 'ingredients', 'directions',
       'fat', 'protein', 'sodium', 'Max Value', 'by Topic Name '],
      dtype='object')

In [8]:
recipes_final.iloc[1,10]

'Festive Season'

In [9]:
# combine "title" and "ingredients" and labels as one string variable
recipes_final.iloc[:,4] = [str(ingrd) for ingrd in (recipes_final.iloc[:,4])]
recipes_final.iloc[:,0] = [str(title) for title in (recipes_final.iloc[:,0])]
recipes_final.iloc[:,10] = [str(title) for title in (recipes_final.iloc[:,10])]


recipes_final["titleingrd"]=recipes_final["title"]+recipes_final["ingredients"] +recipes_final['by Topic Name ']

In [10]:
# Make sure the time format is correct
# We will consider only hours and minutes in the time component
recipes_final.date = recipes_final.date.dt.strftime('%m/%d/%Y %H:%M')

In [11]:
# Apply the above function to preprocess ingredients
recipes_final['ingredients_processed'] = recipes_final['titleingrd'].apply(lambda x:pre_process(x))

In [12]:
recipes_final.columns
recipes_final.iloc[1,12]

'boudin blanc terrine with red onion confit cups whipping cream medium onions chopped teaspoons salt bay leaves whole cloves large garlic clove crushed teaspoon pepper teaspoon ground nutmeg pinch of dried thyme crumbled large shallots minced tablespoon butter pound trimmed boneless center pork loin sinew removed cut into inch chunks well chilled eggs tablespoon all purpose flour cup tawny port tablespoons dried currants minced lettuce leaves cracked peppercorns minced fresh parsley bay leaves french bread baguette slices tablespoons olive oil large red onions halved sliced tablespoons dried currants tablespoons red wine vinegar tablespoons canned chicken broth teaspoons chopped fresh thyme or teaspoon dried crumbled teaspoon sugar festive season'

In [13]:
# Define the sentence to be lemmatized
sentence = recipes_final.iloc[:,12]

# Perform lemmatization for each recipe (row)
lemmatized_output = []
for rcp in sentence:
    word_list = nltk.word_tokenize(rcp)
    lo = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    lemmatized_output.append(lo)

In [14]:
# Checking lemmatization
print("before:",sentence[10])
print("after:",lemmatized_output[10])

before: yams braised with cream rosemary and nutmeg teaspoons olive oil cup finely chopped shallots teaspoons minced fresh rosemary pounds yams red skinned sweet potatoes peeled cut into inch thick rounds rounds cut in half cups canned low salt chicken broth cup whipping cream ground nutmeg festive season
after: yam braised with cream rosemary and nutmeg teaspoon olive oil cup finely chopped shallot teaspoon minced fresh rosemary pound yam red skinned sweet potato peeled cut into inch thick round round cut in half cup canned low salt chicken broth cup whipping cream ground nutmeg festive season


In [15]:
lem_out_wordlist = [nltk.word_tokenize(x) for x in lemmatized_output]

In [16]:
# Create the word count vector ------------------------------------------------------
# Here the stopwords are removed, and 
#ignore words that appear in 85% of documents and below 0.01%
from sklearn.feature_extraction.text import TfidfVectorizer

# cv=CountVectorizer(stop_words=stopwords,max_df=0.85,min_df= 0.0001)
# word_count_vector=cv.fit_transform(lemmatized_output)

TfidfVec = TfidfVectorizer(stop_words=stopwords,max_df=0.85,min_df= 0.0001,ngram_range=(1,2))
word_count_vector = TfidfVec.fit_transform(lemmatized_output)

# Convert to an array
trainVectorizerArray = word_count_vector.toarray()

In [17]:
recipes_final.columns

Index(['title', 'date', 'rating', 'calories', 'ingredients', 'directions',
       'fat', 'protein', 'sodium', 'Max Value', 'by Topic Name ', 'titleingrd',
       'ingredients_processed'],
      dtype='object')

In [18]:
# Subset columns to display
recipes_list = recipes_final[['title', 'date', 'rating', 'calories', 'ingredients', 'directions',
       'fat', 'protein', 'sodium', 'Max Value', 'by Topic Name ']]

# Reset index name 
recipes_list = pd.concat([pd.DataFrame(list(recipes_list.index)),recipes_list], axis=1)

# Rename the columns
recipes_list.columns = ['row_num' ,'title','date','rating','calories', 'ingredients', 'directions', 'fat',
        'protein','sodium','Max Value', 'by Topic Name ']

In [19]:
# Function to retrieve recipes using the cosine distance between them

def getRecipe_cosine(query = "", sort = True):
    
    if query == "":
        return(print('no match found '))
    
    # Define empty arrays
    distance = [] # Distance between document
    row_num = [] # Row number of the document 
    
    # Pre-process query
    query_processed = pre_process(str(query))    
#     # Spell check the query
#     spell = SpellChecker()
#     misspelled = spell.unknown(query_processed)
    
#     corrected =""
#     spellcheck_query = ""
    
#     for word in misspelled:
#         print(spell.correction(word))
#         corrected=[spell.correction(word)]
#         if len(corrected)==0:
#             corrected.remove('non') 
#         #print(corrected)

#         spellcheck_query = str(query_processed) + str(corrected)

        
    # Lemmatize the query
    query_wlist = nltk.word_tokenize(query_processed)
    lemmatized_query = ' '.join([lemmatizer.lemmatize(w) for w in query_wlist])
    
    # Vectorize the words and convert to an array
    testVectorizerArray = TfidfVec.transform([lemmatized_query]).toarray()
    
    for x in range(len(trainVectorizerArray)):
        vector = trainVectorizerArray[x]
        for testV in testVectorizerArray:
            cosine = cx(vector, testV)
            if cosine > 0:
                distance.append(cosine)
                row_num.append(x)
    
    
    # Concatenate the columns into a dataframe
    matches = pd.concat([pd.DataFrame(distance),pd.DataFrame(row_num)],axis = 1)
    if matches.empty:
        return(print(" No Recipes with "+ query))
# Provide descriptive the column names
    matches.columns = ['similarity_val','row_num']
# Sort by the highest similarity value
    matches = matches.sort_values(by=['similarity_val'],ascending=False)
    print(matches.shape[0], "recipes matched")
    
    # Save the top 10 in a dataframe
    results = pd.DataFrame(recipes_list.iloc[matches.row_num[:10],:])
    results_score = pd.merge(results,matches[:10], how = 'left', on = 'row_num')
    results_score = results_score.sort_values(by = ['Max Value','similarity_val'], ascending=False)
    if(sort):
        # Sort by time
        results = results.sort_values(by = ['date'], ascending=False)
        # sort my ratings
        results = results.sort_values(by = ['rating'], ascending=False)
    else:
        results = pd.DataFrame(recipes_list.iloc[matches.row_num[0:10],:])
        results_score = pd.merge(results,matches[0:10], how = 'left', on = 'row_num')
        #results_score = results_score.sort_values(by = [],ascending=False)
        #results.to_excel('query_results_co.xlsx')
        results_score = results_score.sort_values(by = ['similarity_val','Max Value'], ascending=False)
        
    #Return the dataset
    if query != '':
        return(results_score)

In [20]:
# To display full text
pd.set_option('display.max_colwidth', -1)

# Recipe Search

In [21]:
# Try the retrieval for a few test queries
print("Categories to choose from:\n 1.Asian Delights 2.Cakes and Deserts 3.Festive Season 4.Italian 5.Light bites 6.Rich & Flavourful  7. Summertime  8.Western  \n")
query = input(" Enter your ingredients search here \n")

Categories to choose from:
 1.Asian Delights 2.Cakes and Deserts 3.Festive Season 4.Italian 5.Light bites 6.Rich & Flavourful  7. Summertime  8.Western  

 Enter your ingredients search here 
chicken asian


In [22]:
%%time
getRecipe_cosine(query,sort = False)

5416 recipes matched
Wall time: 30.8 s


Unnamed: 0,row_num,title,date,rating,calories,ingredients,directions,fat,protein,sodium,Max Value,by Topic Name,similarity_val
0,14794,Spitted Roast Chicken,08/20/2004 04:00,5.0,,[],"['Put a good square of butter and a little salt and pepper in the cavity of each chicken. Truss well and brush them with melted butter or oil seasoned to taste with salt, pepper and paprika. Spit them carefully. Run the spit through the backbone just above the tail and guide it to the top part of the breast at the base of the neck. This way you achieve a good balance.', 'When your fire has burned down to a good bed of coals, make a ring of the briquets or charcoal leaving the center area directly under the chickens clear to catch the drippings. Arrange the spitted chickens over this space and roast, basting them frequently with equal parts of melted butter and white wine or dry vermouth. The cooking time will take from 45 minutes to 1 1/4 hours, depending on the size of the birds.', 'Plain roast chicken goes best with crisp sautÃ©ed potatoes and a fresh green salad with a minimum of dressing. As for wine, most people prefer a white, such as a Pinot Blanc from California, or a Meursault or Pouilly FuissÃ© from France.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.537
1,4475,Braised Chicken Teriyaki,08/20/2004 04:00,3.75,1253.0,"['1 cup soy sauce', '3/4 cup sake or dry white wine', '3/4 cup canned low-salt chicken broth', '1/4 cup sugar', '1 1/2 tablespoons minced peeled fresh ginger', '4 large garlic cloves, minced', '1 4 1/2-pound fryer chicken']","['Stir first 6 ingredients in heavy large pot over low heat until sugar dissolves. Add chicken. Cover pot and simmer until chicken is just cooked through, turning occasionally, about 50 minutes. Transfer chicken to platter; cover with foil to keep warm. Increase heat and boil sauce until reduced to 1 1/2 cups, about 15 minutes. Spoon sauce over.']",78.0,102.0,3874.0,0.492326,"Asian Delights, Festive Season",0.364
2,15195,Beer-Basted Chicken with Asian Flavors,08/20/2004 04:00,4.375,1101.0,"['1 3 1/2- to 4-pound chicken', '1 12-ounce can beer', '6 green onions, chopped', '1/2 cup soy sauce', '1/4 cup fresh lemon juice', '2 tablespoons (packed) golden brown sugar', '2 tablespoons chopped peeled fresh ginger', '1 tablespoon chopped garlic', '1 tablespoon oriental sesame oil']","['Combine all ingredients in heavy large resealable plastic bag. Refrigerate 1 hour and up to 1 day, turning bag occasionally.', 'Preheat oven to 350Â°F. Place chicken and marinade in 13 x 9 x 2-inch baking pan. Roast chicken until juices run clear when thigh is pierced, basting occasionally, about 1 hour 20 minutes.', 'Transfer chicken to platter. Pour pan juices into medium saucepan; spoon off fat and discard. Boil until sauce is reduced to 1 cup, about 6 minutes. Serve chicken with sauce.']",72.0,88.0,2078.0,0.460461,"Rich & Flavourful, Asian Delights",0.31
3,6470,Chicken Vedova,08/20/2004 04:00,3.125,,[],"['In Udine the chicken were free-range, small and scrawny. To make this dish you need poussins, or very small chickens. Cut four 1 1/2-pound chickens in two. Rub the chicken with garlic cloves and then with 2 tablespoons lime juice. Sprinkle the chickens with salt and pepper. Mix 2 tablespoons oregano with 2 tablespoons thyme. Rub the chicken with the herbs and refrigerate for at least 1 hour. Just before broiling, rub the chickens with 2 tablespoons olive oil. Broil the chickens on one side for 8 minutes, turn them, and broil them for another 8 minutes. The broiling time depends on how large the chickens are. Serve the chickens with a salad of watercress.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.17
4,2039,Chicken Tarragon,08/20/2004 04:00,5.0,,[],"['Put 2 sprigs of fresh tarragon and a sprig of parsley into the cavity of the bird along with butter, salt and pepper. Truss the bird and then slip a few tarragon leaves under the skin of the breast, working down from the neck and being careful not to puncture the skin as you separate it from the flesh. Brush the bird with seasoned melted butter in which you have steeped a few tarragon leaves. Roast according to directions for spitted roast chicken , basting with more tarragon butter during the cooking.', 'Serve with tarragon butter and potatoes that have been wrapped in foil and roasted in the coals. A bowl of fresh raw vegetables is an excellent accompaniment.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.15
5,6772,Risotto,01/14/2010 04:00,4.375,,[],"['For enough risotto to serve four, heat about 2 tablespoons of butter in a sauce pan. Add 1 cup of a round-grain rice such as Arborio or Carnaroli and stir the rice over medium heat. Add the broth, one cup at a time, until the rice has absorbed 3 cups of liquid. You should stir the risotto frequently as it cooks to get a good creamy texture. (If you like, replace 1/2 cup of the broth with 1/2 cup dry white wine.) For a saffron risotto, add a few threads of crushed saffron with the first cup of broth.', 'Once the rice is tender and creamy, pull it off the heat and add 2 tablespoon butter and 1/3 to 1/2 cup grated Parmesan cheese. Stir the risotto vigorously until the butter and cheese are blended in. Serve at once on heated plates.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.12
6,15950,Kebabs,08/20/2004 04:00,2.5,,[],"['1. String cubes of meat on skewers, brush well with olive oil and broil, turning often. If you crowd the cubes together, you will have rare juicy meat. If the cubes are placed further apart, you will have medium well done meat. (Remember, pork must be well done.) Salt and pepper the kebabs to taste as they cook.', 'Serve these plain kebabs with rice mixed with pistachio nuts and a plate of crisp French fried onion rings.', '2. Marinate meat cubes in a mixture of olive oil, lemon juice and a pinch of dried thyme. Let the meat soak for 2 hours or more. Alternate the cubes on skewers with tiny tomatoes, tiny whole onions that have been parboiled for a few minutes and strips of green pepper. Broil as above, brushing with the marinade during the cooking.', '3. Soak the meat in a marinade of olive oil, lemon juice, oregano and plenty of coarse black pepper. Proceed as above.', '4. Alternate lamb cubes with pieces of sweetbread and marinate in olive oil, lemon juice and several crushed bay leaves. Proceed as above.', '5. Alternate cubes of lamb or beef with squares of eggplant. Marinate in olive oil, lemon juice, grated garlic and black pepper. Broil as above.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.119
7,2683,Asian Lamb Stir-Fry in Radicchio Wraps,08/20/2004 04:00,0.0,,[],[],,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.115
8,12523,To Zest Citrus Fruits,08/20/2004 04:00,5.0,,[],"['To zest citrus fruits, remove the colored part of the rind only (avoid the bitter white pith). For strips, use a vegetable peeler. For grated zest, we prefer using a rasplike Microplane zester, which results in fluffier zest, so pack to measure.']",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.114
9,15540,Cracking and Grating Coconut,08/20/2004 04:00,3.75,,[],"[""Extracting the meat from a coconut is not as difficult as you may think. Pierce several of the coconutâ€™s eyes with a screwdriver or sharp metal skewer. Drain the thin, clear liquid into a bowl and taste it: If it's oily rather than sweet, the nut is rancid and should be tossed out. Bake the whole coconut in a shallow baking pan at 400Â°F until it cracks, about 20 minutes. When it's cool, wrap it in a towel and break it apart with a hammer. Pry the flesh from the shell with a screwdriver or a dull table knife and peel off the brown membrane with a vegetable peeler. Rinse the coconut under cold water and dry. Grate the coconut, using the medium shredding disk of a food processor or the medium teardrop-shaped holes of a four-sided grater, and chill, covered, until ready to use. â€”""]",,,,0.125,"Asian Delights, Asian Delights, Asian Delights",0.112


# Evaluation of Document Retrieval 

### Precision & Recall

Precision measures "of all the documents we retrieved as relevant how many are actually relevant?"
Recall measures "Of all the actual relevant documents how many did we retrieve as relevant?"
F-Score / F-measure is the weighted harmonic mean of precision and recall. The traditional F-measure or balanced F-score is:

In [23]:
# Precision = No. of relevant documents retrieved / No. of total documents retrieved
precision = 6/10
print ("Precision =", precision)

# Recall = No. of relevant documents retrieved / No. of total relevant documents
recall = 6/20 # orbitary number here
print ("Recall = ", recall)

Fscore = 2 * precision * recall /(precision + recall)
print ("F-Score =", Fscore)

Precision = 0.6
Recall =  0.3
F-Score = 0.4
