/
microformats__foodnetwork_hrecipe.py
63 lines (49 loc) · 1.6 KB
/
microformats__foodnetwork_hrecipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
import sys
import urllib2
import json
import HTMLParser
import BeautifulSoup
# Pass in a URL such as
# http://www.foodnetwork.com/recipes/alton-brown/pad-thai-recipe/index.html
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError
    # traceback, which gives the user no hint about what was expected.
    sys.exit('Usage: %s <recipe-url>' % sys.argv[0])
url = sys.argv[1]
# Parse out some of the pertinent information for a recipe
# See http://microformats.org/wiki/hrecipe
def _collect_instructions(container):
    """Return the non-empty text of each direct child of *container*.

    Tags contribute the concatenation of all their nested text nodes; plain
    text nodes contribute themselves. Every entry is whitespace-stripped and
    blank entries are dropped. A missing container (None) yields [].
    """
    if container is None:
        return []

    steps = []
    for node in container:
        if isinstance(node, BeautifulSoup.Tag):
            text = ''.join(node.findAll(text=True)).strip()
        elif type(node) is BeautifulSoup.NavigableString:
            # Deliberately an exact-type check: NavigableString subclasses
            # (presumably comments, CDATA, declarations -- confirm against
            # the BeautifulSoup version in use) must keep being skipped.
            text = node.string.strip()
        else:
            continue
        if text:
            steps.append(text)
    return steps


def parse_hrecipe(url):
    """Fetch *url* and extract the hRecipe microformat data found on the page.

    Args:
        url: Address of a page expected to contain an element with the
            'hrecipe' class.

    Returns:
        A dict with the keys 'name', 'author', 'ingredients' and
        'instructions', or an empty dict when the page has no hrecipe
        element (or that element does not have more than one child).
        'name' and 'author' are None when the corresponding sub-element is
        absent; 'ingredients' and 'instructions' are then empty lists.

    Raises:
        urllib2.URLError: the page could not be fetched.
        HTMLParser.HTMLParseError: the page could not be parsed.
    """
    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        print('Failed to fetch ' + url)
        raise  # bare raise keeps the original traceback intact

    try:
        soup = BeautifulSoup.BeautifulSoup(page)
    except HTMLParser.HTMLParseError:
        print('Failed to parse ' + url)
        raise
    finally:
        # The response is fully consumed (or unusable) by now; release it.
        page.close()

    hrecipe = soup.find(True, 'hrecipe')
    # len() of the found element is kept from the original check; it
    # presumably counts the element's direct children -- TODO confirm.
    if not hrecipe or len(hrecipe) <= 1:
        return {}

    # Each sub-element is optional on real pages, so never chain attribute
    # access directly onto the result of find().
    fn_tag = hrecipe.find(True, 'fn')
    fn = fn_tag.string if fn_tag is not None else None

    author_tag = hrecipe.find(True, 'author')
    author = author_tag.find(text=True) if author_tag is not None else None

    ingredients = [
        tag.string
        for tag in hrecipe.findAll(True, 'ingredient')
        if tag.string is not None
    ]

    instructions = _collect_instructions(hrecipe.find(True, 'instructions'))

    return {
        'name': fn,
        'author': author,
        'ingredients': ingredients,
        'instructions': instructions,
    }
# Scrape the recipe at the requested URL and write it to stdout as
# JSON indented by four spaces.
recipe = parse_hrecipe(url)
print(json.dumps(recipe, indent=4))