In [3]:
import requests, time, datetime
import os, re
from bs4 import BeautifulSoup
import pandas as pd

from numpy import sqrt
from math import log

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
### DATA GATHERING FUNCTIONS


# Extract information from a recipe URL

# Linguistic preprocessing of recipe, see NLTK
def processRecipe(path):
    """
    Tokenize and normalize the text of one recipe file.

    The file is read as UTF-8 and tokenized with NLTK's ``word_tokenize``.
    Every token is additionally split on "." so that words glued together
    by a full stop (e.g. "salt.Add") become separate terms. Each piece is
    lower-cased, discarded if it is an English stopword, stripped of all
    non-letter characters and stemmed with the Porter stemmer.

    Parameters
    ----------
    path : str
        Path of the recipe text file.

    Returns
    -------
    list of str
        Stemmed terms longer than two characters, in document order.
        Duplicates are kept, so list indices can serve as term positions.
    """
    ps = PorterStemmer()
    # Set membership is O(1); the list returned by NLTK is scanned linearly.
    stop = set(stopwords.words('english'))
    with open(path, encoding = "utf-8") as f:
        s = f.read()

    l = []
    for token in word_tokenize(s):
        # Use the dot-split pieces directly (previously they were computed
        # and then thrown away by tokenizing the text a second time).
        for piece in token.split("."):
            # Lower-case BEFORE the stopword test: NLTK's English stopword
            # list is lower-case, so "The" would otherwise slip through.
            piece = piece.lower()
            if not piece or piece in stop:
                continue
            stem = ps.stem(re.sub(r'[^a-zA-Z]', '', piece))
            # Drops empty strings and one/two-letter leftovers.
            if len(stem) > 2:
                l.append(stem)

    return l

# Term frequency in document
def tf(path):
    '''Relative frequency of every term in one document.

    The file at `path` is first preprocessed with processRecipe, so only
    the terms that function keeps are counted. Returns a dict mapping each
    term to (occurrences of the term / total number of kept terms),
    rounded to 4 decimals. An empty document yields an empty dict.'''
    terms = processRecipe(path)
    n_terms = len(terms)

    # Raw occurrence counts, keyed in order of first appearance.
    counts = {}
    for term in terms:
        counts[term] = counts.get(term, 0) + 1

    return {term: round(occurrences / n_terms, 4)
            for term, occurrences in counts.items()}


# Compile dictionary used to write vocabulary and index
def recipeDict():
    '''Build the in-memory inverted index for every file in "recipes/".

    Returns
    -------
    dict
        Maps each term to its posting list: a list of
        ``[doc_id, tf, positions]`` entries, one per document that contains
        the term. ``doc_id`` is the file name without its ".txt" suffix,
        ``tf`` is the term frequency (occurrences / number of preprocessed
        terms in the document, rounded to 4 decimals) and ``positions`` is
        the list of indices of the term in the preprocessed token list.
    '''
    my_dict = {}
    # Sorted so posting lists come out in ascending file-name order instead
    # of whatever order os.listdir happens to return.
    for file_name in sorted(os.listdir("recipes/")):
        # Preprocess each file once; term frequencies are derived from the
        # same token list rather than re-reading the file.
        word_list = processRecipe("recipes/" + file_name)

        # str.strip(".txt") removes any leading/trailing '.', 't' or 'x'
        # characters, not the suffix; cut the extension explicitly.
        doc_id = file_name[:-4] if file_name.endswith(".txt") else file_name

        # One pass over the tokens collects every position of every term.
        positions = {}
        for n, word in enumerate(word_list):
            positions.setdefault(word, []).append(n)

        total = len(word_list)
        for word, pos in positions.items():
            entry = [doc_id, round(len(pos) / total, 4), pos]
            my_dict.setdefault(word, []).append(entry)

    return my_dict

# Add skip pointers to index
def add_skip(index):
    '''Insert a skip pointer into every posting of every posting list.

    The index is modified in place and nothing is returned. The pointer is
    inserted at position 2 of each posting. With a stride equal to the
    integer square root of the list length, postings at offsets
    0, stride, 2*stride, ... (strictly below length - stride) receive the
    offset of the posting one stride ahead; all other postings receive 0.'''
    for postings in index.values():
        size = len(postings)
        stride = int(sqrt(size))
        for offset, entry in enumerate(postings):
            # The range is built inside the loop on purpose: for an empty
            # list the stride is 0 and range() would reject a zero step.
            has_skip = offset in range(0, size - stride, stride)
            entry.insert(2, offset + stride if has_skip else 0)


def vocabulary(my_dict):
    '''Write "vocabulary.txt", one tab-separated line per term.

    Terms are written in sorted order. Each line holds the term, its termID
    (the rank of the term in the sorted list, zero-padded to the number of
    digits of the document count) and its inverse document frequency
    log(N / df) rounded to 3 decimals, where N is the number of entries in
    "./recipes/" and df is the length of the term's posting list.'''
    n_docs = len(os.listdir("./recipes/"))
    id_width = len(str(n_docs))
    with open("vocabulary.txt", "w") as out:
        for term_id, term in enumerate(sorted(my_dict)):
            idf = log(n_docs / len(my_dict[term]))
            fields = [term, str(term_id).zfill(id_width), str(round(idf, 3))]
            out.write("\t".join(fields) + "\n")
            
# Write the index on disk
def index(my_dict):
    '''Write the inverted index to "index.txt".

    "vocabulary.txt" is read back to obtain each term's termID and idf.
    One line is written per vocabulary term: the termID, a tab, then one
    tab-terminated entry per posting of the form
    "<doc_id> <tf*idf rounded to 3 decimals> <skip> <p1-p2-...>".

    Parameters
    ----------
    my_dict : dict
        Term -> posting list. Each posting is indexed as
        [doc_id, tf, skip, positions], i.e. the skip pointer must already
        be present at position 2 (see add_skip).

    Raises
    ------
    KeyError
        If a vocabulary term is missing from ``my_dict``.
    IndexError
        If a posting has fewer than four elements.
    '''
    # Local name chosen so it does not shadow the module-level vocabulary().
    vocab = {}
    with open("vocabulary.txt") as f:
        for line in f:
            l = line.split()
            vocab[l[0]] = (l[1], l[2])

    # Context manager guarantees the file is closed even when a lookup
    # below raises part-way through writing.
    with open("index.txt", "w") as g:
        for word, (term_id, idf_text) in vocab.items():
            idf = float(idf_text)
            g.write(term_id + "\t")
            for file_ref in my_dict[word]:
                pos = "-".join([str(i) for i in file_ref[3]])
                tfidf = str(round(file_ref[1] * idf, 3))
                g.write(file_ref[0] + " " + tfidf + " " + str(file_ref[2]) + " " + pos + "\t")
            g.write("\n")
  

In [5]:
### FUNCTIONS TO RECOVER DATA FROM THE HARD-DISK

def retrieveRecipe(rec_num):
    '''Load one stored recipe as a pandas DataFrame.

    The recipe is read from "recipes/<rec_num zero-padded to 5 digits>.txt",
    a tab-separated file without a header row. The first eight columns are
    labelled with the first eight names in `headers`; every further column
    is labelled "Ingredients" (so that label can occur more than once).'''
    headers = ["Title", "Author", "CookTime", "Prep_Time",
               "Serves", "Description", "Dietary", "Instructions",
               "Ingredient_list"]    # the last name is not used by the renaming below
    rec = pd.read_csv("recipes/" + str(rec_num).zfill(5) + ".txt",
                      sep = "\t", header = None)

    # One mapping from positional column label to its final name.
    column_names = {}
    for col in range(rec.shape[1]):
        column_names[col] = headers[col] if col < 8 else "Ingredients"

    return rec.rename(columns = column_names)

def loadVocabulary():
    '''Read "vocabulary.txt" into a dict.

    Every line is split on whitespace; the first field (the term) becomes
    the key and the second and third fields are stored, as strings, in a
    2-tuple. If a term occurs on several lines, the last line wins.'''
    with open("vocabulary.txt") as handle:
        rows = [line.split() for line in handle]
    return {fields[0]: (fields[1], fields[2]) for fields in rows}

def loadIndex():
    '''Read "index.txt" into a dict keyed by the first field of each line.

    A line is split on tabs; blank cells are discarded. The first cell is
    the key, every following cell is one posting made of whitespace-
    separated fields. The fourth field of a posting ("p1-p2-...") is
    converted to a list of ints and moved to the end of the posting; the
    other fields are kept as strings.'''
    loaded = {}
    with open("index.txt") as handle:
        for raw_line in handle:
            cells = [cell.strip() for cell in raw_line.split("\t")]
            cells = [cell for cell in cells if cell]
            key = cells[0]

            postings = []
            for cell in cells[1:]:
                fields = cell.split()
                positions = [int(p) for p in fields[3].split("-")]
                # Replace the textual positions with the parsed list.
                del fields[3]
                fields.append(positions)
                postings.append(fields)

            loaded[key] = postings
    return loaded

def posting(term, index, vocabulary):
    '''Return the posting list of `term`.

    The term is translated through `vocabulary` (first element of its
    entry) into the key under which the posting list is stored in `index`.
    A term absent from `vocabulary` raises KeyError.'''
    return index[vocabulary[term][0]]

In [6]:
### FUNCTIONS TO PERFORM SEARCH AND OUTPUT RESULTS

# Intersection of two posting lists


In [None]:
## Load the index and the vocabulary from disk
# Both loaders read files in the current working directory
# ("index.txt" and "vocabulary.txt"), so those files must already exist.

print("Loading Index...\n")
ind = loadIndex()    # index.txt as a dict: key -> list of postings
voc = loadVocabulary()    # vocabulary.txt as a dict: term -> 2-tuple of strings
X = (ind, voc)    # bundled so both can be passed along with *X
print("Index loaded.\n")

## SEARCH INTERFACE
# NOTE(review): search, cosSim and printResults are not defined in the cells
# visible here -- confirm they are defined before this cell is run, otherwise
# a fresh kernel raises NameError below.

print("Search : ",end = '')    # prompt, no trailing newline
query = str(input( ))    # read the query typed by the user

searchResults = search(query, *X)
# First element of every hit; presumably the document ID -- TODO confirm
# against the return value of search().
docIDResults  = [d[0] for d in searchResults]

print("Found "+str(len(docIDResults))+ " results.\n")

max_results = 10    # maximum number of results to display

# Keep only the first max_results items of what cosSim returns
# (presumably ranked by cosine similarity -- verify in cosSim).
results = cosSim(query, docIDResults, *X)[:max_results]
printResults(results)

Loading Index...

Index loaded.



In [None]:
# Show the value of `results` (assigned in the search cell) as this cell's output.
results