In [1]:
import json
import time
import requests
from tqdm import tqdm
import concurrent.futures
from datetime import datetime
from tabulate import tabulate
from IPython.display import clear_output
from database_connect import get_database
from multiprocessing import Process, Lock
from recipe import Recipe, RecipeParseException

import hello_fresh as hf

In [2]:
# Database handle returned by the project's get_database() helper.
db = get_database()

# Category labels, one per line so additions and removals diff cleanly.
# NOTE(review): not referenced in the visible cells - confirm it is still needed.
itm_cats = [
    "Produce",
    "Meat",
    "Seafood",
    "Bakery",
    "Dairy",
    "Frozen",
    "Grains",
    "Canned Goods",
    "Dry Goods",
    "Snacks",
    "Sauces",
    "Oils",
    "Spices",
    "Beverages",
    "Other",
]

In [3]:
# Registry of timing runs, keyed by timing_id. Each entry holds 'start', and
# after end_timing() has been called also 'end' and 'records'.
timing_details = {}


def start_timing(timing_id, header):
    """Start (or restart) the timer stored under ``timing_id`` and print ``header``.

    Any previous entry for the same ``timing_id`` is discarded.
    """
    timing_details[timing_id] = {'start': datetime.now()}

    print(header, flush=True)


def end_timing(timing_id, records, footer):
    """Stop the timer ``timing_id``, then print ``footer`` and a timing summary.

    Parameters:
        timing_id: key previously passed to start_timing().
        records: number of items processed; used to derive the per-item time.
        footer: message printed before the timing summary.

    Raises:
        KeyError: if start_timing() was never called for ``timing_id``.
    """
    entry = timing_details[timing_id]
    entry['end'] = datetime.now()
    entry['records'] = records

    elapsed_time = entry['end'] - entry['start']
    # With no records there is no meaningful per-item time, so report an exact
    # zero duration. Subtracting two separate datetime.now() calls would give a
    # tiny *negative* timedelta that prints as "-1 day, 23:59:59.99...".
    iter_time = elapsed_time * 0 if records == 0 else elapsed_time / records

    print(footer, flush=True)
    print('Elapsed Time: ' + str(elapsed_time) + '\tIter Time: ' + str(iter_time), flush=True)

# New Entry Process

#### Getting All Existing URLs

In [4]:
# Gather the 'url' field of every document currently in db.recipe_details.
start_timing(0, 'Finding All Exisiting URLs')
all_urls_old = [stored_recipe['url'] for stored_recipe in db.recipe_details.find()]
end_timing(
    0,
    len(all_urls_old),
    'Found ' + str(len(all_urls_old)) + ' Existing URLs',
)

Finding All Exisiting URLs
Found 12 Existing URLs
Elapsed Time: 0:00:00.385416	Iter Time: 0:00:00.032118


#### Getting All New URLs

In [5]:
# Scrape the Hello Fresh recipe URLs; all_urls_new starts out as a copy of
# that result so it can be extended independently of all_urls_hf.
start_timing(1, 'Finding all Hello Fresh URLs')
all_urls_hf = hf.find_recipe_urls(verbose=True)
all_urls_new = [*all_urls_hf]
end_timing(
    1,
    len(all_urls_hf),
    'Found ' + str(len(all_urls_hf)) + ' Hello Fresh URLs',
)

Finding all Hello Fresh URLs
0	/recipes
47	/recipes/american-recipes
54	/recipes/italian-recipes
58	/recipes/asian-recipes
58	/recipes/mediterranean-recipes
58	/recipes/mexican-recipes
61	/recipes/korean-recipes
60	/recipes/indian-recipes
59	/recipes/latin-american-recipes
59	/recipes/chinese-recipes
58	/recipes/spanish-recipes
57	/recipes/japanese-recipes
56	/recipes/thai-recipes
56	/recipes/french-recipes
55	/recipes/cuban-recipes
54	/recipes/african-recipes
53	/recipes/cajun-recipes
52	/recipes/middle-eastern-recipes
51	/recipes/vietnamese-recipes
50	/recipes/hawaiian-recipes
49	/recipes/taco-recipes
48	/recipes/burger-recipes
47	/recipes/pasta-recipes
46	/recipes/bowl-recipes
45	/recipes/flatbread-recipes
44	/recipes/stir-fry-recipes
43	/recipes/meatball-recipes
42	/recipes/noodle-recipes
41	/recipes/risotto-recipes
40	/recipes/skillet-recipes
39	/recipes/soup-recipes
38	/recipes/skewer-recipes
37	/recipes/quesadilla-recipes
36	/recipes/meatloaf-recipes
35	/recipes/fajita-recipes
3

#### Consolidating All URLs

In [6]:
# Merge the stored and freshly scraped URLs, dropping duplicates.
# dict.fromkeys() keeps first-seen order, so the resulting list is the same on
# every run; list(set(...)) would reorder string URLs between kernel restarts
# because str hashes are randomized per process.
all_urls = list(dict.fromkeys([*all_urls_old, *all_urls_new]))

#### Getting Recipes for all Good URLs

In [7]:
def get_recipe(url):
    """Fetch the recipe behind ``url``, or return None when none is available.

    Only URLs containing 'www.hellofresh.com' are handled; any other URL
    yields None. A RecipeParseException raised by hf.get_recipe() is treated
    as "no recipe" and also yields None. Other exceptions propagate.
    """
    if 'www.hellofresh.com' not in url:
        return None

    try:
        return hf.get_recipe(url)
    except RecipeParseException:
        # Unparseable pages are skipped rather than aborting the whole run.
        return None

# Fetch every URL on a thread pool; recipe_list has one slot per URL and holds
# None wherever get_recipe() produced no recipe.
start_timing(2, 'Getting Recipes from All URLs')
with concurrent.futures.ThreadPoolExecutor() as pool:
    fetched = pool.map(get_recipe, all_urls)
    recipe_list = list(tqdm(fetched, total=len(all_urls), position=0, leave=True))

# Keep only the URLs that actually yielded a recipe.
final_recipes = [recipe for recipe in recipe_list if recipe is not None]
end_timing(
    2,
    len(recipe_list),
    'Got Recipes from All URLs: ' + str(len(final_recipes)) + '/' + str(len(recipe_list)) + ' URLs have GOOD Recipes',
)

Getting Recipes from All URLs


100%|██████████████████████████████████████████████████████████████████████████████| 821/821 [1:06:34<00:00,  4.87s/it]

Got Recipes from All URLs: 766/10 URLs have GOOD Recipes
Elapsed Time: 1:06:34.950049	Iter Time: 0:06:39.495005





In [10]:
# Snapshot the scraped recipes to disk as JSON under the 'all_recipes' key.
final_recipes_dict = {
    'all_recipes': [scraped.get_recipe_dict() for scraped in final_recipes],
}

with open('temp_recipe_save.json', 'w') as snapshot_file:
    json.dump(final_recipes_dict, snapshot_file)

In [4]:
def dict_clean(items):
    """Build a dict from ``(key, value)`` pairs, blanking out missing values.

    A value that is None, or that does not compare equal to itself (as float
    NaN does), is replaced with the empty string. When a key appears more than
    once, the last value wins.
    """
    return {
        key: ('' if (value is None or value != value) else value)
        for key, value in items
    }

# Placeholder binding so the name exists before the file is read.
all_recipes_dict = []
with open('temp_recipe_save.json', 'r') as snapshot_file:
    # dict_clean runs on every decoded JSON object, blanking None/NaN values.
    all_recipes_dict = json.load(snapshot_file, object_pairs_hook=dict_clean)

# Rebuild Recipe objects from the saved dicts, then index their URLs.
final_recipes = [Recipe(recipe_dict=saved) for saved in all_recipes_dict['all_recipes']]
final_recipes_url = [rebuilt.get_recipe_details()['url'] for rebuilt in final_recipes]

#### Removing All Existing Recipes That No Longer Exist

In [5]:
# Delete every stored recipe whose URL is absent from the freshly loaded set.
# A set gives O(1) membership tests; checking against the list would rescan
# it once per stored document.
# NOTE(review): if final_recipes_url is empty (e.g. a failed scrape), this
# removes every document - confirm that is acceptable before running.
scraped_urls = set(final_recipes_url)
for recipe_details in db.recipe_details.find():
    stored_url = recipe_details['url']
    if stored_url not in scraped_urls:
        db.recipe_details.delete_one({'url': stored_url})
        print('Deleted URL:\t' + stored_url)

#### Manual Check of All NEW Recipes

In [10]:
def update_recipe(recipe):
    """Store ``recipe`` in db.recipe_details, keyed by its URL.

    If a document with the same 'url' already exists it is replaced;
    otherwise the recipe is inserted as a new document.

    Returns:
        Whatever replace_one() or insert_one() returned for the write.
    """
    url_filter = {'url': recipe.get_recipe_details()['url']}
    document = recipe.get_recipe_dict()

    already_stored = db.recipe_details.find_one(url_filter) is not None
    if already_stored:
        return db.recipe_details.replace_one(url_filter, document)
    return db.recipe_details.insert_one(document)

In [11]:
# Write every loaded recipe to the database, with a progress bar.
for loaded_recipe in tqdm(final_recipes):
    update_recipe(loaded_recipe)

100%|████████████████████████████████████████████████████████████████████████████████| 766/766 [01:08<00:00, 11.14it/s]


#### Checking no Duplicates on URL

In [12]:
urls_in_database = [stored_recipe['url'] for stored_recipe in db.recipe_details.find()]

# A set drops repeats, so equal sizes mean every stored URL is unique.
len(set(urls_in_database)) == len(urls_in_database)

True