In [3]:
import requests, time, datetime
import os, re
from bs4 import BeautifulSoup
import pandas as pd

from numpy import sqrt
from math import log

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
### DATA GATHERING FUNCTIONS

def requestURL(url, retry_delay = 1, max_retries = None):
    '''GET ``url`` and keep retrying until a successful response arrives.

    Parameters
    ----------
    url : str
        address to request
    retry_delay : int or float
        seconds to wait after a failed attempt before trying again, so that
        a failing server is not hit in a tight loop
    max_retries : int or None
        number of retries allowed after the first failure; None (default)
        retries forever

    Returns
    -------
    r : requests.Response
        the first response whose status passes ``raise_for_status``

    Raises
    ------
    requests.Timeout, requests.ConnectionError, requests.HTTPError
        the last failure, only when ``max_retries`` is set and exhausted
    '''
    attempts = 0
    while True:
        try:
            r = requests.get(url, timeout = 5)
            r.raise_for_status()
            return r
        # Timeout is tested first: a connect timeout is also a ConnectionError
        except requests.Timeout as error:
            label, failure = "Timeout", error
        except requests.ConnectionError as error:
            label, failure = "ConnectionError", error
        except requests.HTTPError as error:
            label, failure = "HTTPError", error

        print("\n" + label + "!\n")
        attempts += 1
        if max_retries is not None and attempts > max_retries:
            raise failure
        time.sleep(retry_delay)


def getIngredients(delay = 1, write = False):
    '''Collect the ingredient identifiers listed on the bbc food site.

    The ingredient landing page is scanned for the "by letter" links; each
    of those pages is then fetched and the ``id`` of every
    ``<li class="resource food">`` element is collected.

    Parameters
    ----------
    delay : int
        seconds to sleep after each letter page is processed
    write : bool
        when True the identifiers are written one per line to
        ingredient_list.txt and nothing is returned

    Returns
    -------
    list of str, or None when ``write`` is True
    '''
    site = "http://www.bbc.co.uk"
    landing = BeautifulSoup(requestURL(site + "/food/ingredients").text, "lxml")

    letter_href = re.compile("/food/ingredients/by/letter/*")
    letter_pages = [site + anchor.get("href")
                    for anchor in landing.find_all("a", {"href": letter_href})]

    found = []
    for page_url in letter_pages:
        page = BeautifulSoup(requestURL(page_url).text, "lxml")
        for item in page.find_all("li", {"class": "resource food"}):
            found.append(item.get("id"))
        time.sleep(delay)

    if not write:
        return found

    # write mode: dump the list to disk, the function then returns None
    with open("ingredient_list.txt", "w") as out:
        for name in found:
            out.write(name + "\n")


def extractRecipe(ingredient, l, delay = 0, verbose = False):
    '''Append to ``l`` the recipe addresses found by searching one ingredient.

    Parameters
    ----------
    ingredient : str
        ingredient identifier; underscores are turned into "+" to build the
        search keywords
    l : list
        output list, extended in place with the absolute recipe urls
    delay : int
        seconds to sleep after each result page
    verbose : bool
        print the progress page by page
    '''
    site = "http://www.bbc.co.uk"
    keywords = "+".join(re.split("_", ingredient))

    first_page = BeautifulSoup(
        requestURL(site + "/food/recipes/search?&keywords=" + keywords).text,
        "lxml")

    # The pagination anchors tell how many result pages exist
    pager_links = first_page.find_all(
        "a", {"href": re.compile("/food/recipes/search*")})
    if pager_links == []:
        pages_number = 1
    else:
        numbered = [int(anchor.string) for anchor in pager_links
                    if anchor.string != "Next"]
        pages_number = max(numbered)

    recipe_href = re.compile("/food/recipes/*")

    if verbose:
        print("< "+keywords+" > Start fetching. Page: ")

    for page in range(1, pages_number + 1):
        page_url = site + "/food/recipes/search?page=" + str(page) + "&keywords=" + keywords
        listing = BeautifulSoup(requestURL(page_url).text, "lxml")

        page_links = []
        for anchor in listing.find_all("a", {"href": recipe_href}):
            href = anchor.get("href")
            if "search" in href:
                continue    # pagination / search links, not recipes
            if len(site) + 1 < len(href):
                page_links.append(site + href)
        l.extend(page_links)

        if verbose:
            print(page, end = " ")
        time.sleep(delay)


def getRecipeLinks(ingredients, delay = 1, verbose = False):
    """
    Starting from a list of ingredients crawls http://www.bbc.co.uk/food
    to collect all linked recipes links. The links found for each
    ingredient are appended, one per line, to recipe_links.txt.

    Parameters
    ----------
    ingredients : list of str
    delay : int
        seconds of delay between requests to site
    verbose : bool
        print the progress of the crawl
    """
    for ingredient_name in ingredients:
        recipes_address = []
        extractRecipe(ingredient_name, recipes_address, delay, verbose)
        # The context manager closes the file even if a write fails
        with open("recipe_links.txt", "a") as f:
            f.writelines(add + "\n" for add in recipes_address)
        if verbose: print("< "+ingredient_name+" > complete!")


# Extract informations from recipe url
def parseRecipe(url):
    """
    Download one recipe page and turn it into a dictionary.

    Parameters
    ----------
    url : str
        address of the recipe page

    Returns
    -------
    recipe : dict
        keys, in order: Title, Author, CookTime, Prep_Time, Serves,
        Description, Dietary, Ingredient_list, Instructions.
        Missing scalar fields hold the string "Empty". Ingredient_list maps
        a section header to the list of its ingredient strings;
        Instructions is a list of strings, one per step.

    Raises
    ------
    IndexError
        if the page has no "recipe-ingredients" block, or if there are more
        ingredient lists than headers to name them
    """
    Soup = BeautifulSoup(requestURL(url).text, "lxml")
    recipe = {}

    node = Soup.find("meta", {"property":"og:title"})
    recipe["Title"] = node.get("content") if node else "Empty"

    # Fields read from the raw text of a single element
    text_fields = (
        ("Author", "a", "chef__link"),
        ("CookTime", "p", "recipe-metadata__cook-time"),
        ("Prep_Time", "p", "recipe-metadata__prep-time"),
        ("Serves", "p", "recipe-metadata__serving"),
    )
    for key, tag, css_class in text_fields:
        node = Soup.find(tag, {"class": css_class})
        recipe[key] = node.text if node else "Empty"

    # Fields read with surrounding whitespace stripped
    stripped_fields = (
        ("Description", "recipe-description"),
        ("Dietary", "recipe-metadata__dietary"),
    )
    for key, css_class in stripped_fields:
        node = Soup.find("div", {"class": css_class})
        recipe[key] = node.get_text(strip = True) if node else "Empty"

    ingredients_body = Soup.find_all("div", {"class":"recipe-ingredients"})[0]
    headers = [h.string for h in ingredients_body.find_all("h2")]
    headers += [h.string for h in ingredients_body.find_all("h3")]
    ingredients_sub = ingredients_body.find_all("ul")

    # When the header count differs from the list count the first header is
    # skipped and the lists are paired with the headers that follow it.
    shift = 0 if len(headers) == len(ingredients_sub) else 1

    Ingredient_list = {}
    for position, group in enumerate(ingredients_sub):
        items = ["".join(entry.strings) for entry in group.find_all("li")]
        Ingredient_list[headers[position + shift]] = items
    recipe["Ingredient_list"] = Ingredient_list

    steps = Soup.find_all("li", {"itemprop":"recipeInstructions"})
    recipe["Instructions"] = ["".join(step.stripped_strings) for step in steps]

    return recipe


def recipeWrite(filename, recipe):
    '''Write a recipe on disk as a single tab separated line.

    Column layout: Title, Author, CookTime, Prep_Time, Serves, Description,
    Dietary, Instructions (all steps concatenated), then one column per
    ingredient.

    Parameters
    ----------
    filename : str
        path of the file to create (overwritten if it exists)
    recipe : dict
        dictionary with the keys listed above plus "Ingredient_list", a
        dict mapping section name to a list of ingredient strings
    '''
    header = ["Title", "Author", "CookTime", "Prep_Time",
              "Serves", "Description", "Dietary"]
    s_first = "\t".join(["".join(recipe[i]) for i in header])

    s_instr = "".join(recipe["Instructions"])

    # Flatten every section into ONE list before joining: joining each
    # section on its own and concatenating glued the last ingredient of a
    # section to the first ingredient of the next one.
    all_ingredients = []
    for section_items in recipe["Ingredient_list"].values():
        all_ingredients.extend(section_items)
    s_ing = "\t".join(all_ingredients)

    s = s_first + "\t" + s_instr + "\t" + s_ing

    # utf-8 explicitly: the file is read back with encoding="utf-8"
    with open(filename, "w", encoding = "utf-8") as f:
        f.write(s)


def buildCollection(recipesAddresses, delay = 1, last_num = 0):
    """
    Download and store every recipe of a list of addresses.

    Duplicate addresses are dropped, then each recipe is parsed and written
    as a tab separated text file named recipes/<number>.txt, where the
    number is zero padded to five digits and grows from ``last_num``.

    Parameters
    ----------
    recipesAddresses : iterable of str
        recipe urls
    delay : int
        seconds to sleep after each recipe
    last_num : int
        number given to the first recipe written

    Returns
    -------
    int
        the first number not used, i.e. where a later call can resume
    """
    if not os.path.exists("recipes"):
        os.makedirs("recipes")

    next_num = last_num
    for url in list(set(recipesAddresses)):
        label = str(next_num).zfill(5)
        recipeWrite("recipes/" + label + ".txt", parseRecipe(url))
        print(label)
        next_num += 1
        time.sleep(delay)
    return next_num

# Linguistic preprocessing of recipe, see NLTK
def processRecipe(path):
    """
    Stopword removal, normalization, stemmings.

    The file is tokenized, tokens are further split on "." (steps are
    stored concatenated, so "water.Add" has to become two words),
    stopwords are dropped, non alphabetic characters are removed and the
    remaining words are stemmed. Stems of one or two characters are
    discarded.

    Parameters
    ----------
    path : str
        path of a utf-8 text file

    Returns
    -------
    l : list of str
        the stems, in document order (duplicates kept)
    """
    ps = PorterStemmer()
    stop = set(stopwords.words('english'))
    with open(path, encoding = "utf-8") as f:
        s = f.read()

    # The split tokens are the ones actually processed below; previously
    # they were computed and then thrown away by a second word_tokenize.
    pieces = [i for word in word_tokenize(s) for i in word.split(".") if i]

    l = []
    for piece in pieces:
        lowered = piece.lower()
        # compare the lowercased form so that "The" is a stopword too
        if lowered in stop:
            continue
        stem = ps.stem(re.sub(r'[^a-zA-Z]', '', lowered))
        if len(stem) > 2:
            l.append(stem)

    return l

# Term frequency in document
def tf(path):
    '''Term frequencies of one document.

    The document is preprocessed with processRecipe, so stopwords and
    discarded tokens do not count. For each remaining word the value is
    occurrences / total number of words, rounded to 4 decimals.

    Returns
    -------
    dict mapping word -> frequency
    '''
    words = processRecipe(path)
    total = len(words)

    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    return {word: round(count/total, 4) for word, count in counts.items()}


# Compile dictionary used to write vocabulary and index
def recipeDict():
    '''Build the in-memory inverted index of the recipes/ folder.

    Returns
    -------
    my_dict : dict
        maps each keyword to its posting list, a list of
        [document id, term frequency, positions] entries where the
        document id is the file name without extension, the term frequency
        comes from tf() and positions is the list of word offsets of the
        keyword in the processed document.
        Postings are appended in sorted file-name order, because the merge
        in intersect() requires posting lists ordered by document id.
    '''
    my_dict = {}
    # os.listdir gives no ordering guarantee: sort explicitly
    for file_name in sorted(os.listdir("recipes/")):
        path = "recipes/" + file_name
        freq = tf(path)
        # splitext removes the extension; str.strip(".txt") would instead
        # eat any leading/trailing ".", "t" or "x" characters
        doc_id = os.path.splitext(file_name)[0]

        # one pass over the document collects the positions of every word
        positions = {}
        for n, word in enumerate(processRecipe(path)):
            positions.setdefault(word, []).append(n)

        for word, pos in positions.items():
            my_dict.setdefault(word, []).append([doc_id, freq[word], pos])

    return my_dict

# Add skip pointers to index
def add_skip(index):
    '''Insert a skip pointer in every posting of every posting list.

    The pointer is inserted in place as element 2 of the posting. With
    step = int(sqrt(length of the list)), the postings at offsets
    0, step, 2*step, ... below length-step receive offset+step; all the
    other postings receive 0, meaning "no skip".
    Calling the function again inserts a second pointer.
    '''
    for postings in index.values():
        total = len(postings)
        stride = int(sqrt(total))
        for offset, entry in enumerate(postings):
            carries_skip = offset in range(0, total - stride, stride)
            entry.insert(2, offset + stride if carries_skip else 0)


def vocabulary(my_dict):
    '''Write the vocabulary on disk as vocabulary.txt.

    Terms are written in alphabetical order, one per line, as
    term <tab> termID <tab> idf. The termID is the rank of the term, zero
    padded to the number of digits of the document count; the idf is
    log(number of entries in ./recipes/ / number of postings of the term)
    rounded to 3 decimals.
    '''
    doc_total = len(os.listdir("./recipes/"))
    id_width = len(str(doc_total))
    with open("vocabulary.txt","w") as out:
        for term_id, word in enumerate(sorted(my_dict.keys())):
            idf = log(doc_total/len(my_dict[word]))
            out.write(word + "\t" + str(term_id).zfill(id_width) + "\t" + str(round(idf,3)) + "\n")
            
# Write the index on disk
def index(my_dict):
    '''Write the index on disk as index.txt.

    vocabulary.txt is read to get the termID and the idf of each term.
    Each output line is
    termID <tab> posting <tab> posting <tab> ... <newline>
    and each posting is "docID tfidf skip positions" where tfidf is
    tf * idf rounded to 3 decimals and positions are joined by "-".

    Parameters
    ----------
    my_dict : dict
        term -> posting list whose entries are
        [docID, tf, skip, positions]; the skip pointer must already be
        present (see add_skip), otherwise an IndexError is raised.
    '''
    vocab = {}
    with open("vocabulary.txt") as f:
        for line in f:
            l = line.split()
            if len(l) < 3:
                continue    # blank or truncated line
            vocab[l[0]] = (l[1], l[2])

    # "with" closes index.txt even if a posting is malformed
    with open("index.txt", "w") as g:
        for word, (term_id, idf) in vocab.items():
            g.write(term_id + "\t")
            for file_ref in my_dict[word]:
                pos = "-".join([str(i) for i in file_ref[3]])
                tfidf = str( round( file_ref[1]*float(idf),3 ) )
                g.write(file_ref[0] + " " + tfidf + " " + str(file_ref[2]) + " " + pos + "\t")
            g.write("\n")
  

In [5]:
### FUNCTIONS TO RECOVER DATA FROM THE HARD-DISK

def retrieveRecipe(rec_num):
    '''Load one stored recipe as a pandas DataFrame.

    The file recipes/<rec_num zero padded to 5 digits>.txt is read as a
    header-less tab separated table. The first eight columns are named
    Title, Author, CookTime, Prep_Time, Serves, Description, Dietary and
    Instructions; every further column is named "Ingredients".
    '''
    fixed_names = ["Title", "Author", "CookTime", "Prep_Time",
                   "Serves", "Description", "Dietary", "Instructions"]
    path = "recipes/" + str(rec_num).zfill(5) + ".txt"
    rec = pd.read_csv(path, sep = "\t", header = None)

    # positional column label -> final name
    names = dict(enumerate(fixed_names))
    for extra in range(8, rec.shape[1]):
        names[extra] = "Ingredients"
    return rec.rename(columns = names)

def loadVocabulary():
    '''Read vocabulary.txt into a dictionary.

    Every line is split on whitespace; the result maps the first field
    (the term) to the tuple (second field, third field), both kept as
    strings.
    '''
    with open("vocabulary.txt") as f:
        rows = (line.split() for line in f)
        return {row[0]: (row[1], row[2]) for row in rows}

def loadIndex():
    '''Read index.txt into a dictionary.

    Each line holds tab separated cells: the first is the key, the others
    are postings made of whitespace separated fields. The fourth field of
    a posting ("-" separated integers) is parsed into a list of int and
    moved to the end of the posting; the other fields stay strings.

    Returns
    -------
    dict mapping key -> list of postings
    '''
    index = {}
    with open("index.txt") as f:
        for line in f:
            cells = [cell.strip() for cell in line.split("\t")]
            cells = [cell for cell in cells if cell]

            postings = []
            for cell in cells[1:]:
                fields = cell.split()
                positions = [int(p) for p in fields[3].split("-")]
                postings.append(fields[:3] + fields[4:] + [positions])

            index[cells[0]] = postings
    return index

def posting(term, index, vocabulary):
    '''Return the posting list of ``term``.

    The term is first translated through ``vocabulary`` (first element of
    its entry), and the result is used as key in ``index``.
    Raises KeyError if the term or its key is unknown.
    '''
    vocabulary_entry = vocabulary[term]
    return index[vocabulary_entry[0]]

In [6]:
### FUNCTIONS TO PERFORM SEARCH AND OUTPUT RESULTS

# Intersection of two posting lists
def intersect(a,b, k):
    '''Intersect two posting lists ordered by document number.

    Parameters
    ----------
    a, b : list
        posting lists; each posting is [docID, weight, skip, positions]
        where skip is the offset of another posting of the same list, or 0
    k : int
        distance of the proximity search; 0 means plain intersection,
        otherwise a document matches only if proximitySearch finds the two
        terms within k positions of each other

    Returns
    -------
    answer : list
        the postings of ``a`` whose document also satisfies ``b``
    '''
    i = 0
    j = 0
    answer = []

    while j < len(b) and i < len(a):
        if a[i][0] == b[j][0]:    # same filenumber
            if k == 0 or proximitySearch(a[i][3], b[j][3], k) != []:
                answer.append(a[i])
            i += 1
            j += 1
        else:
            doc_a = int(a[i][0])
            doc_b = int(b[j][0])
            if doc_a > doc_b:
                target = _skipTarget(b, j, doc_a)
                j = j + 1 if target is None else target
            else:
                target = _skipTarget(a, i, doc_b)
                i = i + 1 if target is None else target
    return answer


def _skipTarget(postings, pos, limit_doc):
    '''Offset reachable with the skip pointer at ``pos``, or None.

    A skip is taken only when it moves forward, stays inside the list and
    lands on a document not greater than ``limit_doc``; following it
    unconditionally could jump over a matching document. The bounds check
    also makes stale pointers (e.g. on a list produced by a previous
    intersection) harmless.
    '''
    target = int(postings[pos][2])
    if pos < target < len(postings) and int(postings[target][0]) <= limit_doc:
        return target
    return None

def hasSkip(posting, pos):
    '''Return the skip pointer stored at offset ``pos`` of a posting list.

    The pointer is the third element of the posting, converted to int.
    A value of 0 means "no skip" and False is returned in that case.
    '''
    pointer = int(posting[pos][2])
    return pointer or False

def proximitySearch(pp1,pp2, k):
    '''Find the pairs of positions that lie within ``k`` of each other.

    Parameters
    ----------
    pp1, pp2 : list of int
        positions of the two terms in the same document
    k : int
        maximum distance allowed

    Returns
    -------
    list of [position from pp1, position from pp2]
    '''
    answer = []
    window = []    # positions of pp2 currently considered close enough
    cursor = 0     # next position of pp2 to examine, never moves back
    for left in pp1:
        while cursor < len(pp2):
            candidate = pp2[cursor]
            if abs(left - candidate) <= k:
                window.append(candidate)
            elif candidate > left:
                break
            cursor += 1
        # drop from the front what is too far from the current position
        while window != [] and abs(window[0] - left) > k:
            del window[0]
        answer.extend([left, near] for near in window)
    return answer
    
def processQuery(query):
    """
    Stopword removal, normalization, stemmings of a query string.

    The query is tokenized, tokens are split on ".", stopwords are
    dropped, non alphabetic characters are removed and what is left is
    stemmed. Stems of one or two characters are discarded.

    Returns
    -------
    str or list of str
        the single stem as a plain string when exactly one term survives,
        otherwise the list of stems (possibly empty)
    """
    ps = PorterStemmer()
    stop = set(stopwords.words('english'))

    # Split the TOKENS on "."; the previous code iterated over the
    # characters of the query and then discarded the result anyway.
    pieces = [i for word in word_tokenize(query) for i in word.split(".") if i]

    l = []
    for piece in pieces:
        lowered = piece.lower()
        # compare the lowercased form so that "The" is a stopword too
        if lowered in stop:
            continue
        stem = ps.stem(re.sub(r'[^a-zA-Z]', '', lowered))
        if len(stem) > 2:
            l.append(stem)

    if len(l) == 1:
        return l[0]
    return l


def checkProx(s):
    '''Extract the proximity requests of a query.

    The syntax is ``word1 /distance word2``. For every "/" of the query
    the word right before it, the number right after it and the word that
    follows are read.

    Returns
    -------
    couples : dict
        maps (processed word1, processed word2) to the distance, kept as a
        string of digits. A "/" that is not a well formed proximity
        request is ignored: missing word on either side, non numeric
        distance (e.g. "1/2 cup"), or a word that does not reduce to
        exactly one term once processed (stopword, number, ...).
    '''
    couples = {}
    chunks = s.split("/")
    for left, right in zip(chunks, chunks[1:]):
        before = left.split()
        after = right.split()
        if not before or len(after) < 2:
            continue    # nothing before the "/" or no word after the distance
        distance = after[0]
        if not distance.isdecimal():
            continue
        first = processQuery(before[-1])
        second = processQuery(after[1])
        # processQuery returns a list unless exactly one term survives;
        # a list cannot be part of a dictionary key
        if not isinstance(first, str) or not isinstance(second, str):
            continue
        couples[first, second] = distance
    return couples


def search(query, index, vocabulary):
    '''Conjunctive search algorithm, with optional proximity constraints.

    Parameters
    ----------
    query : str
        free text; ``word1 /k word2`` asks for the two words within k
        positions of each other
    index : dict
        termID -> posting list, as returned by loadIndex
    vocabulary : dict
        term -> (termID, idf), as returned by loadVocabulary

    Returns
    -------
    list
        postings ([docID, weight, skip, positions]) of the documents that
        contain every known term of the query and satisfy every proximity
        constraint. Terms absent from the vocabulary are ignored. Postings
        produced by an intersection are copies with the skip set to 0.
    '''
    couples = checkProx(query)     # proximity requests of the query
    terms = processQuery(query)    # process query like recipes
    if isinstance(terms, str):     # a single term comes back as a string
        terms = [terms]

    # keep known terms only, each once, without removing while iterating
    known = list(dict.fromkeys(t for t in terms if t in vocabulary))
    # rarest term first (highest idf, compared as a number)
    ranked = sorted(known, key = lambda t: float(vocabulary[t][1]), reverse = True)

    pair_hits = []
    for (first, second), distance in couples.items():
        if first not in vocabulary or second not in vocabulary:
            continue    # the words stay ordinary terms if they are known
        try:
            k = int(distance)
        except ValueError:
            continue
        # the pair is handled here, not as two independent terms
        ranked = [t for t in ranked if t != first and t != second]
        hits = intersect(posting(first, index, vocabulary),
                         posting(second, index, vocabulary), k)
        pair_hits.append(_dropSkips(hits))    # one LIST per pair

    candidates = [posting(t, index, vocabulary) for t in ranked] + pair_hits
    if not candidates:
        return []

    results = candidates[0]
    for other in candidates[1:]:
        if results == []:
            break
        results = _dropSkips(intersect(results, other, 0))
    return results


def _dropSkips(postings):
    '''Copy postings with the skip pointer cleared: after an intersection
    the pointers would refer to offsets of the original list.'''
    return [entry[:2] + [0] + entry[3:] for entry in postings]


def cosSim(query, docIDResults, ind, voc):
    '''Rank the documents found by a search against the query.

    For every known query term the idf of the term times the stored
    weight of each posting is added to the score of the posting's
    document; each score is then divided by the number of processed
    words of the document.

    Parameters
    ----------
    query : str
    docIDResults : list of str
        document ids to rank
    ind : dict
        termID -> posting list
    voc : dict
        term -> (termID, idf)

    Returns
    -------
    list of str
        the document ids, best score first
    '''
    query = processQuery(query)
    if isinstance(query, str):
        query = [query]
    # filter into a new list: removing while iterating skips elements
    query = [term for term in query if term in voc]

    Scores = {doc: 0 for doc in docIDResults}
    Length = {doc: len(processRecipe("recipes/"+doc+".txt")) for doc in Scores}

    for term in query:
        wtq = float(voc[term][1])  # idf
        for entry in ind[ voc[term][0] ]:
            if entry[0] in Scores:    # dict lookup, not a list scan
                Scores[entry[0]] += wtq*float(entry[1])  # idf-tf

    for doc in Scores:
        # an empty document cannot be normalised: it simply scores 0
        Scores[doc] = Scores[doc]/Length[doc] if Length[doc] else 0.0

    return sorted(Scores, key = Scores.get, reverse = True)

def printResults(results):
    '''Printout search results.

    Parameters
    ----------
    results : iterable
        recipe numbers, each loaded with retrieveRecipe and printed as
        title, author, description, times, servings, dietary notes,
        ingredient list and instructions. A missing field is printed as
        "Empty".
    '''
    for doc in results:
        df = retrieveRecipe(doc)
        names = list(df.columns)
        values = df.iloc[0].tolist()

        # first value of every named column (names can repeat)
        fields = {}
        for name, value in zip(names, values):
            fields.setdefault(name, value)

        def text(name):
            # str(): a purely numeric field is parsed as a number;
            # "Empty": a blank field is read back as NaN
            value = fields.get(name)
            return "Empty" if pd.isna(value) else str(value)

        # Read the ingredient columns directly. Re-parsing to_csv() output
        # on double quotes only kept the ingredients containing a comma.
        ingredients = [str(value).strip()
                       for name, value in zip(names, values)
                       if name == "Ingredients" and not pd.isna(value)]

        title = text("Title")
        rule = "-"*(len(title)+7)

        print("Title: "+title)
        print("Author: "+text("Author"))
        print(rule)
        print("Description :"+text("Description"))
        print(rule)
        print("Cooking Time :"+text("CookTime"))
        print("Preparation Time: "+text("Prep_Time"))
        print("Serves: "+text("Serves"))
        print("Dietary: "+text("Dietary"))
        print(rule)
        print("Ingredient List:\n")
        print(*ingredients, sep = "\n")
        print(rule)
        print("Instructions:\n"+text("Instructions"))
        print("\n")

In [None]:
## Load Index and vocabulary

print("Loading Index...\n")
# Both structures are kept in memory for the whole session
ind = loadIndex()
voc = loadVocabulary()
# (index, vocabulary) pair, unpacked as the last two arguments of the
# search functions
X = ind, voc
print("Index loaded.\n")

Loading Index...

Index loaded.



In [None]:
## SEARCH INTERFACE 

print("Search : ", end = '')
query = str(input())    # query typed by the user

# Each hit is a posting; its first element is the document id
searchResults = search(query, X[0], X[1])
docIDResults = [d[0] for d in searchResults]

print("Found {} results.\n".format(len(docIDResults)))

# Only the best ranked documents are printed
max_results = 10
results = cosSim(query, docIDResults, X[0], X[1])[:max_results]
printResults(results)

Search : 

In [None]:
# Display the ranked document ids that the search cell stored in `results`
results