In [1]:
from pathlib import Path
import shutil

import pandas as pd
import numpy as np
import pickle

from bs4 import BeautifulSoup
import locale
#from html.parser import HTMLParser
# import website_func.py to use its functions
from website_func import *
from scrap import *
import os #To read the file
#reload every module each time
%load_ext autoreload
%autoreload 2

  'The soupsieve package is not installed. CSS selectors cannot be used.'


# Some quick analysis of the data
We received a data set of .html file containing the website content of recipes.
We wanted to sort them by website, in order to, more easily, find a pattern among them. This will enable us to do the "scraping" of the pages. First we thought about moving the files in a folder corresponding to their website, but it would be a waste of time and a big computational effort. Thus, we came up with a (probably) faster solution : we could simply write the name of the file within its corresponding website folder. By inspecting the files, we saw that the first line was always containing a comment with the name of the file and the complete website. Using readlines and split, we could easily retrieve the name of the website.

We launched this process, but an alarm appeared describing a Trojan virus in the file "1c2cb6f0df04cf5a9d0baa116c6aa7bb.html". 
We had then to quarantine or maybe remove the file, as we have quite enough data.
By doing so, we remarked the file "msg.log" that could help us into fastering the processus as its content is formed of the name of the file together with its website.

In [2]:
with open("recipePages/msg.log", 'r') as f:
    first_lines = f.readlines()[45:49]
    print(*first_lines, sep="--")

59b9d3c43ee8df52fcd04128475059f5.html	http://allrecipes.com/Recipe/bacon-and-potato-soup/detail.aspx	2012-09-09 10:55:41 URL:http://allrecipes.com/Recipe/bacon-and-potato-soup/detail.aspx [225697/225697] -> "59b9d3c43ee8df52fcd04128475059f5.tmp" [1]
--129b4dad66e6d0d59bbc6f3200f9e927.html	http://familyoven.com/recipes/search?terms=Dairy%20Free%20Alfredo%20Sauce	2012-09-09 10:55:42 URL:http://familyoven.com/recipes/search?terms=Dairy%20Free%20Alfredo%20Sauce [77640] -> "129b4dad66e6d0d59bbc6f3200f9e927.tmp" [1]
--bafcd30bc137238af0cd9954f7d0701e.html	http://www.yummly.com/recipes/chicken-brunswick-stew-lima-beans	Syntax error in Set-Cookie: fbsr_54208124338=""; Expires=Thu, 01-Jan-1970 00:00:10 GMT; Path=/, JSESSIONID=27D73083292D7FE540727DC4AD6C50D2; Path=/; HttpOnly, yv="lJSpLE5cHmHHgrNrJgtM/w=="; Version=1; Max-Age=31536000; Expires=Mon, 09-Sep-2013 17:55:42 GMT; Path=/ at position 120.
--2012-09-09 10:55:43 URL:http://www.yummly.com/recipes/chicken-brunswick-stew-lima-beans [286650]

We observed that occasionally a line containing other info that are not necessary can appear (in the example above, this is the case in the last line). Those lines are ignored during the sorting process.
Also we noticed there are no file extensions other than .html and .log (see below). Thus, we know exacty what the contents of the folder recipePages are.

In [3]:
# We can find in the folder that, excepted the html files, there is only the log
# Get all filenames (i.e. path) that are in recipePages folder 
pathlist = Path("recipePages/").glob('**/*')
i = 0
for path in pathlist:
     # because path is object not string
    path_in_str = str(path)
    if not path_in_str.endswith(".html"):
        print(path_in_str)

recipePages\msg.log


##### Sorting part of the content of the folder "recipePages" :

In [4]:
# If override True -> delete SortedFiles and re-execute the function (long)
sort_website_from_log(override=False)

Finished sorting the files


After the sorting comes the scraping part. In order to have the maximum number of data and to save some time, we'll first scrap the websites containing the higher number of files.

In [5]:
website_count=pd.Series()

for (root,dirs,files) in os.walk('SortedFiles', topdown="True"):   
        for website in dirs:
            if website == ".ipynb_checkpoints":
                continue
            filename = "SortedFiles/" + website +"/filesName.txt"
            
            f = open(filename, "r")
            number_of_link = len(f.readlines())
            line_to_add=pd.Series(number_of_link,index=[website])
            website_count = website_count.append(line_to_add)

In [6]:
#print(website_count)
print("Number of website with more than 1000 files:", len(website_count[website_count.values >1000]))
print("Number of website with less than 1000 files:", len(website_count[website_count.values <1000]))

Number of website with more than 1000 files: 19
Number of website with less than 1000 files: 85


As we can observe, 19 websites have a really high number of files, higher than a thousand. The number of files contained in the 3 first website represents 53% of the total number of files.Those are the ones that we'll focus our attention on.

In [7]:
website_count.sort_values(ascending=False,inplace=True)
website_count_used=website_count[website_count.values>1000]
print(website_count_used)
total_number_files = website_count_used.values.sum()
chosen_number_files = website_count_used[0:3].values.sum()

print("Number of files in the 3 first domains : ", chosen_number_files)
print("Percentage of chosen files among the total: ", chosen_number_files/total_number_files)

allrecipes.com             28355
food.com                   14661
foodnetwork.com            11997
yummly.com                  6590
cooks.com                   5547
epicurious.com              5060
tasteofhome.com             4820
myrecipes.com               3805
recipes.sparkpeople.com     3671
cdkitchen.com               2584
bettycrocker.com            2325
cookeatshare.com            2044
southernfood.about.com      2012
grouprecipes.com            1916
recipe.com                  1460
kraftrecipes.com            1420
eatingwell.com              1400
ifood.tv                    1295
delish.com                  1050
dtype: int64
Number of files in the 3 first domains :  55013
Percentage of chosen files among the total:  0.539279692585186


In [8]:
website_list_used=website_count_used.index.tolist()

There is a lot of words in the ingredients which aren't specific to the ingredient itself, but which are just linked to the quantity of the ingredients, or something uninteresting for us. It is difficult to extract the quantity of the ingredients, as there is a lot of different units for expressing them. Hence, we decided first to juste remove all those terms for the moment, and focus our attention only on the ingredients (not on the quantity). 

In [9]:
#Words that have to be removed from the string in the website to keep only the ingredient:
list_ingredient_to_remove = ["metrics","metric","of","teaspoon", "cup", "cups","teaspoons","ounce","ounces","pounds","pound","tablespoons","chopped","quarts","fresh",\
                            "light","plain","popped","medium","shredded","bunch","tablespoon","to","taste","pinch","freshly","ground",\
                            "canned","carcass","cubes","dried","frozen","all","purpose","grated","minced","degree","degrees","optional",\
                            "jar","quartered","marinated","strips","strip","asian","stalks","package","can","cans","box","container",\
                            "evaporated","bottle","cans","extract","squares","german","slices","crushed","uncooked","seasoning","small",\
                            "sweet","packed","sliced","heavy","condensed","finely","long","grain","sweetened","firmly","cooked","crusts","lean",\
                            "caps","mix","steaks","large","instant","crumbs","semisweet","distilled","packages","pint","miniature","preserves",\
                            "processed","chunky","prepared","seasoned","american","stock","top","sirloin","tip","peeled","spears","leaves",\
                            "belgian","bob","italian","bottles","boneless","roast","breast","meat","tenderloins","granules","chips","converted",\
                            "containers","coleslaw","florets","cube","cubed","diced","crumbled","head","thighs","halves","drumsticks","blend",\
                            "leaf","bow","tie","stuffing","dry","boiling","cover","coarse","kosher","extra","virgin","fat","free","fluid","hot",\
                            "chops","loin","shoulder","roasted","for","frying","diced","thawed","bone","hocks","shucked","leftover","corned",\
                            "weed","summer","wild","whole","very","ripe","unbaked","crust","unsalted","unsifted","unsweetened","drops","drop",\
                            "paste","thickly","swiss","skinless","flavored","quart","puff","kernels","kernel","cracked","as","tortillas","chunk",\
                            "dash","dark","hash","brown","hashbrown","hungarian","rinsed","thin","thinly","thick","stewed","cleaned","gelatin",\
                            "sifted","skim","slivered","sprigs","sage","roma","romano","pasta","reduced","sodium","refried","refrigerated",\
                            "portobello","nuggets","tater","pinto","pudding","pitted","pinches","relish","penne","deep","mild","melted",\
                            "maraschino","loaf","link","jumbo","inch","jars","jasmine","generous","gold","graham","granulated","fried","skins",\
                            "fire","firm","flaked","flank","family","size","eagle","curd","half","russet","round","vidalia","unbleached","toll",\
                            "morsels","toasted","square","soft","sharp","serrano","seedless","old","provolone","process","premium","pouch","betty",\
                            "piri","chunks","smoked","other","original","breaded","tender","tenders","flowerets","nonfat","no","calorie","tail",\
                            "greek","great","northern","southern","envelope","dill","cold","round","steak","brisket","baby","assorted"]
###################PASTA ??
list_ingredient_to_remove = sorted(list_ingredient_to_remove)
#print(list_ingredient_to_remove)

### Creation of the recipe data:
We have to be aware of certain things:
- we might get a website containing only a list of recipes
- we might find no rating -> we'll have to discard it
- we might not be able to open and read the file

For the pages containing lists of recipes, we tried to get informations about those recipes, but were unsucessful. As some links referenced in those were dead.

### Schema used for the scraping :

We will based our dataset on the 3 first websites as said before. The variables we are interested in are the following :
- The ranking, in order to have a value representing the success of the recipe.
- The number of reviews, in order to have an idea of the confidence interval for the ranking estimation.
- The time of preparation, which is a candidate which will be used to predict the ranking.
- The ingredients : The candidate which will be used to predict the ranking.

We haven't yet decided what to do exactly with the number of reviews, but as we want our ranking estimation to be significant enough, we could simply remove the recipes with a too small number of reviews.

## Pickle part of food.com

The scraping takes a lot of time, then, after each website scraped, we store the results in pickles files.

In [59]:
# Scrap all food.com files and put them in a pickle file
recipe_data_food = pd.DataFrame(columns = ['Website','Recipe','Prepare time', 'Ranking', 'Reviews', 'Ingredients'])  
list_unique_ingredients_food = []
unique_ingredients_data_food = pd.DataFrame(columns = ['Ingredient','Count'])
path_to_food_folder = Path('SortedFiles/food.com/filesName.txt')
print(path_to_food_folder)

with path_to_food_folder.open("r") as file:
    website = "food.com"
    for f in file.readlines():
        file_to_read =Path('recipePages') / f.strip()
        try:
            f = open(file_to_read,'r')
            f.close()
        except:
            #print("We can't read the page: ",file_to_read)
            continue
        recipe_data_food, list_unique_ingredients_food, unique_ingredients_data_food = scrap_food(website, file_to_read,list_ingredient_to_remove,list_unique_ingredients_food, recipe_data_food, website_list_used,unique_ingredients_data_food)
                                            
recipe_data_food


In [199]:
recipe_data_food

Unnamed: 0,Website,Recipe,Prepare time,Ranking,Reviews,Ingredients
0,food.com,Ecuadorean Quinoa and Vegetable Soup,75,4.86,31,"[quinoa, olive oil, onions, salt, potato, red ..."
1,food.com,Authentic Injera (aka Ethiopian Flat Bread),4330,3.13,17,"[teff, water, salt]"
2,food.com,Healthy Vegan Coleslaw,10,5.00,1,"[cabbage, vegan mayonnaise, apple cider vinega..."
3,food.com,Grilled Flatbread,35,4.67,14,"[active yeast, olive oil, flour, salt]"
4,food.com,Baked Margarita Pie,20,5.00,3,"[cracker, milk]"
...,...,...,...,...,...,...
12879,food.com,Healthy Italian Breadsticks or Pizza Crust,75,5.00,11,"[water, sugar, salt, olive oil, garlic powder,..."
12880,food.com,Onion Dip (Raw Vegan),15,4.67,3,"[water, sea salt, garlic clove, onion, chives]"
12881,food.com,Blink of an Eye Bell Pepper Saute,5,5.00,4,"[bell pepper, olive oil, red onion, garlic clo..."
12882,food.com,Frosted Pineapple Cookies,27,4.75,9,"[pineapple, butter, sugar, egg, vanilla, flour..."


Save data with pickle

In [207]:
recipe_data_food.to_pickle("data_pickles/recipe_data_food.pkl")
unique_ingredients_data_food.to_pickle("data_pickles/unique_ingredients_data_food.pkl")
with open('data_pickles/list_uni_ingr_food.pkl', 'wb') as f:
    pickle.dump(list_unique_ingredients_food, f)

Retrieve data with pickle

In [2]:
recipe_data = pd.read_pickle("data_pickles/recipe_data_food.pkl")
unique_ingredients_data = pd.read_pickle("data_pickles/unique_ingredients_data_food.pkl")
with open('data_pickles/list_uni_ingr_food.pkl', 'rb') as f:
    list_unique_ingredients = pickle.load(f)

In [3]:
unique_ingredients_data

Unnamed: 0,Ingredient,Count
0,quinoa,39
1,olive oil,1931
2,onions,1
3,salt,5515
4,potato,106
...,...,...
1634,chicory lettuce,1
1635,parma ham,1
1636,tagliatelle noodles,2
1637,vegan cheese,1


In [236]:
# Scrap all foodnetwork.com files and put them in a pickle file
path_to_foodnet_folder = Path('SortedFiles/foodnetwork.com/filesName.txt')
print(path_to_foodnet_folder)

with path_to_foodnet_folder.open("r") as file:
    website = "foodnetwork.com"
    for f in file.readlines():
        file_to_read =Path('recipePages') / f.strip()
        try:
            f = open(file_to_read,'r')
            f.close()
        except:
            #print("We can't read the page: ",file_to_read)
            continue
        recipe_data,list_unique_ingredients, unique_ingredients_data= scrap_foodnetwork(website, file_to_read,list_ingredient_to_remove,list_unique_ingredients, recipe_data, website_list_used,unique_ingredients_data)


SortedFiles\foodnetwork.com\filesName.txt
We don't care about this page (foodnetwork):  recipePages\1e225a1f2bcb521302e80fb044c0f65d.html
We don't care about this page (foodnetwork):  recipePages\dda0b0e7e5bfc0d6140a20eeb2f9b0bf.html
We don't care about this page (foodnetwork):  recipePages\1c30692699593f746b1d19e463d705f0.html
We don't care about this page (foodnetwork):  recipePages\d70bc1cfdf1faebe7df0c879f5fed6cc.html
We don't care about this page (foodnetwork):  recipePages\3630e8535fcc0b2649bb0b0f88f3f820.html
We don't care about this page (foodnetwork):  recipePages\9dab73ea06c10fbfff3044ecac9e9aa7.html
We don't care about this page (foodnetwork):  recipePages\7c69102ae79c57aaed8f4e32dc565679.html
We don't care about this page (foodnetwork):  recipePages\fef414731312604c42433d8b1ea24ced.html
We don't care about this page (foodnetwork):  recipePages\f35f7423789a11352b091a590eba03e5.html
We don't care about this page (foodnetwork):  recipePages\001ec670080a960a6fbcc5e4dd20b8cc.htm

In [239]:
recipe_data.to_pickle("data_pickles/recipe_data_foodnet.pkl")
unique_ingredients_data.to_pickle("data_pickles/unique_ingredients_data_foodnet.pkl")
with open('data_pickles/list_uni_ingr_foodnet.pkl', 'wb') as f:
    pickle.dump(list_unique_ingredients_food, f)

In [245]:
recipe_data = pd.read_pickle("data_pickles/recipe_data_foodnet.pkl")
unique_ingredients_data = pd.read_pickle("data_pickles/unique_ingredients_data_foodnet.pkl")
with open('data_pickles/list_uni_ingr_foodnet.pkl', 'rb') as f:
    list_unique_ingredients = pickle.load(f)

In [246]:
# Scrap allrecipes.com files and put them in a pickle file
path_to_allrec_folder = Path('SortedFiles/allrecipes.com/filesName.txt')
print(path_to_allrec_folder)

with path_to_allrec_folder.open("r") as file:
    website = "allrecipes.com"
    for f in file.readlines():
        file_to_read =Path('recipePages') / f.strip()
        try:
            f = open(file_to_read,'r')
            f.close()
        except:
            #print("We can't read the page: ",file_to_read)
            continue
        recipe_data, list_unique_ingredients, unique_ingredients_data = scrap_allrecipes(website, file_to_read,list_ingredient_to_remove,list_unique_ingredients, recipe_data, website_list_used,unique_ingredients_data)


SortedFiles\allrecipes.com\filesName.txt
We don't care about this page:  recipePages\c6a40a7de4b506a935093b67bccf4aac.html
We don't care about this page:  recipePages\d51be6ea99b26fec6a95d799df8f6ca6.html
We don't care about this page:  recipePages\b391f6f7b61e781328df944424c413bd.html
Beautifulsoup can't read the page: recipePages\64d2a03e54f71ecc8e6d1d3f1cf34336.html
We don't care about this page:  recipePages\0b21ba76572680e3b4a41f744d3b681f.html
We don't care about this page:  recipePages\6f9710404b888af96f6cc56bebe2ee0b.html
Beautifulsoup can't read the page: recipePages\6a19a34bf98e0669c3577068f10becca.html
Beautifulsoup can't read the page: recipePages\41f7188ce7f4b3b540ea640578d672dd.html
We don't care about this page:  recipePages\bd7bba296cd95856a163d21ed8dd2e83.html
Beautifulsoup can't read the page: recipePages\256d05081c27a092286c077a8cbdefd3.html
We don't care about this page:  recipePages\8b820d04d45ac14a967db8770a992585.html
Beautifulsoup can't read the page: recipePage

In [247]:
recipe_data.to_pickle("data_pickles/recipe_data_complete.pkl")
unique_ingredients_data.to_pickle("data_pickles/unique_ingredients_data_complete.pkl")
with open('data_pickles/list_uni_ingr_complete.pkl', 'wb') as f:
    pickle.dump(list_unique_ingredients_food, f)

# exécuter à partir de là pour éviter de refaire le scraping

In [12]:
recipe_data = pd.read_pickle("data_pickles/recipe_data_complete.pkl")
unique_ingredients_data = pd.read_pickle("data_pickles/unique_ingredients_data_complete.pkl")
with open('data_pickles/list_uni_ingr_complete.pkl', 'rb') as f:
    list_unique_ingredients = pickle.load(f)

In [13]:
recipe_data

Unnamed: 0,Website,Recipe,Prepare time,Ranking,Reviews,Ingredients
0,food.com,Ecuadorean Quinoa and Vegetable Soup,75,4.86,31,"[quinoa, olive oil, onions, salt, potato, red ..."
1,food.com,Authentic Injera (aka Ethiopian Flat Bread),4330,3.13,17,"[teff, water, salt]"
2,food.com,Healthy Vegan Coleslaw,10,5,1,"[cabbage, vegan mayonnaise, apple cider vinega..."
3,food.com,Grilled Flatbread,35,4.67,14,"[active yeast, olive oil, flour, salt]"
4,food.com,Baked Margarita Pie,20,5,3,"[cracker, milk]"
...,...,...,...,...,...,...
43944,allrecipes.com,Oven-Fried Chicken,,4.1,19,"[buttermilk baking, pecan, paprika, salt, poul..."
43945,allrecipes.com,Pumpkin Spice Cake with Cinnamon Cream Cheese ...,50,4.5,83,"[pumpkin puree, cinnamon, clove, nutmeg, yello..."
43946,allrecipes.com,Mom's Pie Crust for a Double Crust Pie,15,5.0,1,"[flour, salt, vegetable shortening, water]"
43947,allrecipes.com,Sopapilla Cheesecake Pie,180,4.8,882,"[cream cheese, white sugar, mexican vanilla, c..."


### Limit number of recipes : (to delete)

In [14]:
#limit = 2000
#recipe_data = recipe_data[:limit].copy()

#### Drop Duplicates

We drop the duplicate recipes. Once the recipes are unique, we drop duplicate ingredients in the ingredients list.

In [15]:
# Ingredient list to single coma-separated string (comparable by "drop_duplicates" function)
recipe_data['Ingredients'] = recipe_data['Ingredients'].apply(lambda ingr_list: ','.join(ingr_list))
recipe_data.drop_duplicates(inplace=True)

# Back to list of ingredients
recipe_data['Ingredients'] = recipe_data['Ingredients'].apply(lambda ingr: ingr.split(','))

In [16]:
def get_list_unique_ingredients(l):
    return list(set(l))

In [17]:
# Clean lists of ingredients : eliminate duplicates [sugar, sugar, onions,...] -> [sugar, onions,...]
recipe_data['Ingredients'] = recipe_data['Ingredients'].apply(lambda ingr: get_list_unique_ingredients(ingr))

In [18]:
len(recipe_data)

28083

### TODO :
recipe names may not be unique, check by hand

### Build Graph

In [10]:
recipe_data['Website'].value_counts()

allrecipes.com     11162
food.com            9823
foodnetwork.com     7098
Name: Website, dtype: int64

In [34]:
# build dictionary recipe -> recipe_idx and recipe_idx -> recipe
recipe_to_idx = {}
idx_to_recipe = {}
idx = 0

for _, r in recipe_data['Recipe'].iteritems():
    if r not in recipe_to_idx.keys():
        recipe_to_idx[r] = idx
        idx_to_recipe[idx] = r
        idx += 1

In [35]:
import networkx as nx

recipes_G = nx.Graph()

# Add nodes
recipes_G.add_nodes_from(list(idx_to_recipe.keys()))

In [36]:
def jaccard_dist(list_1, list_2):
    s_1 = set(list_1)
    s_2 = set(list_2)
    return len(s_1.intersection(s_2)) / len(s_1.union(s_2))

In [37]:
recipes_joined = recipe_data[['Recipe','Ingredients']].copy()
length = len(recipes_joined)

# Add columns with index of recipe
recipes_joined['Idx'] = recipes_joined['Recipe'].apply(lambda recipe: recipe_to_idx[recipe])

############################### Cartesian product of recipes DF with itself ###############################

# Build matrix tri that has triangle of ones above the diagonal and zeros below
tri = np.tri(length,length, -1, dtype=int)
i, j = np.where(tri.T)

# Change DF to have all pairs of different recipes
recipes_joined = recipes_joined.iloc[i].reset_index(drop=True).join(recipes_joined.iloc[j].reset_index(drop=True), \
                                                                    rsuffix='_2')

# Add jaccard distance column
recipes_joined['Distance'] = recipes_joined.apply(lambda row: jaccard_dist(row['Ingredients'], \
                                                                           row['Ingredients_2']), axis=1)

recipes_joined.head()

Unnamed: 0,Recipe,Ingredients,Idx,Recipe_2,Ingredients_2,Idx_2,Distance
0,Ecuadorean Quinoa and Vegetable Soup,"[red bell peppers, zucchini, olive oil, salt, ...",0,Authentic Injera (aka Ethiopian Flat Bread),"[salt, teff, water]",1,0.125
1,Ecuadorean Quinoa and Vegetable Soup,"[red bell peppers, zucchini, olive oil, salt, ...",0,Healthy Vegan Coleslaw,"[cabbage, vegan mayonnaise, pepper, salt, sple...",2,0.1
2,Ecuadorean Quinoa and Vegetable Soup,"[red bell peppers, zucchini, olive oil, salt, ...",0,Grilled Flatbread,"[flour, salt, olive oil, active yeast]",3,0.117647
3,Ecuadorean Quinoa and Vegetable Soup,"[red bell peppers, zucchini, olive oil, salt, ...",0,Baked Margarita Pie,"[cracker, milk]",4,0.0
4,Ecuadorean Quinoa and Vegetable Soup,"[red bell peppers, zucchini, olive oil, salt, ...",0,Lemon Tahini Dressing,"[water, tahini, garlic clove, lemon juice, soy...",5,0.111111


In [38]:
# Add edges to recipe graph
edge_list = []
threshold = 0.00001

for _,row in recipes_joined.iterrows():
    dist = row['Distance']
    
    if dist > threshold:
        edge_list.append((row['Idx'], row['Idx_2'], dist))
    
recipes_G.add_weighted_edges_from(edge_list)

In [39]:
# Save Graph
nx.write_gpickle(recipes_G,"recipes_G.gpickle")

- ajouter la liste des ingrédients comme attribut d'un noeud
- clustering du graph
- déterminer une liste d'ingrédients type de chaque cluster

## Clustering

In [20]:
recipes_G = nx.read_gpickle("recipes_G.gpickle")

In [38]:
adj = nx.adjacency_matrix(recipes_G)

In [39]:
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(n_clusters=20, assign_labels="discretize", random_state=0).fit(adj)



In [42]:
clustering.labels_

1964