-
Notifications
You must be signed in to change notification settings - Fork 1
/
word-images.py
72 lines (60 loc) · 2.27 KB
/
word-images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from stat_parser import Parser
from nltk.tree import *
import urllib2
import simplejson
from collections import OrderedDict
# This flag toggles whether to go out and do the Google image search or not.
# Set it to False to skip the network calls and test the NLP stuff on its own.
RETRIEVE_IMAGES = True
# This function came from http://www.monlp.com/2012/01/20/extracting-noun-phrases-from-parsed-trees/
# I modified it a bit for readability
# I'm just using it to extract the noun phrases from the parse tree
def extractTaggedPhrases(tree, tag):
    """Collect a copy of every subtree of ``tree`` whose label equals ``tag``.

    The walk is depth-first and pre-order: ``tree`` itself is tested first,
    then each child subtree from left to right. Nested matches are all
    reported, so a matching phrase inside another matching phrase appears
    as its own entry after its parent.

    Args:
        tree: an ``nltk.tree.Tree`` (or subclass) to search.
        tag: the node label to look for, e.g. ``'NP'``.

    Returns:
        A list of copies (``tree.copy()``) of the matching subtrees; empty
        when nothing matches.
    """
    # NLTK 3 replaced the ``node`` attribute with a ``label()`` method (and
    # accessing ``node`` there raises), while older releases only have
    # ``node``. Probe for ``label`` first so both generations work.
    labelGetter = getattr(tree, 'label', None)
    label = labelGetter() if callable(labelGetter) else tree.node

    phrases = []
    if label == tag:
        phrases.append(tree.copy())

    for child in tree:
        # Leaves are plain tokens; only subtrees can contain further phrases.
        # isinstance() also accepts Tree subclasses, which an exact type
        # comparison would silently skip.
        if isinstance(child, Tree):
            phrases.extend(extractTaggedPhrases(child, tag))
    return phrases
# This function takes in a search term and returns four Google image URLs for it
def buildImageURLs(searchTerm):
    """Return the image URLs Google's AJAX image search reports for a term.

    Args:
        searchTerm: the phrase to search for (``str`` or ``unicode``).

    Returns:
        A list with the ``unescapedUrl`` of every result in the response.
        The list is empty when ``RETRIEVE_IMAGES`` is off or when the
        response carries no usable ``responseData``.

    Raises:
        urllib2.URLError: if the HTTP request itself fails.
    """
    # Just return an empty list if we don't need to bother retrieving images.
    # This ensures code that depends on this function continues to work.
    if not RETRIEVE_IMAGES:
        return []

    # Function-scope import: only this function needs urllib.quote (Python 2).
    import urllib

    # urllib.quote() chokes on non-ASCII unicode objects, so give it UTF-8 bytes.
    if isinstance(searchTerm, unicode):
        searchTerm = searchTerm.encode('utf-8')

    # Percent-encode the whole term rather than only spaces; otherwise
    # characters such as '&', '#' or '+' would corrupt the query string.
    query = urllib.quote(searchTerm, safe='')
    url = 'https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=' + query

    # NOTE(review): Google has retired this AJAX search endpoint - confirm it
    # still answers before relying on the results.
    response = urllib2.urlopen(url)
    try:
        results = simplejson.load(response)
    finally:
        # Always release the connection, even if the body is not valid JSON.
        response.close()

    # 'responseData' may be missing or null (presumably on API errors - verify
    # against the service docs); treat that as "no images", not a TypeError.
    data = results.get('responseData') or {}
    hits = data.get('results') or []
    return [hit['unescapedUrl'] for hit in hits]
# This function takes in an array of phrases and calls buildImageURLs on each one, returning the results in an OrderedDict with the phrase as the key
def buildImagesDict(phrases):
    """Map each phrase to the image URLs ``buildImageURLs`` returns for it.

    Args:
        phrases: an iterable of search phrases.

    Returns:
        An ``OrderedDict`` keyed by phrase, in the order the phrases were
        given, whose values are the URL lists from ``buildImageURLs``. A
        phrase that occurs more than once keeps its first position and the
        value from its last lookup.
    """
    # The generator is consumed in order, so the lookups happen one phrase
    # at a time exactly as they appear in ``phrases``.
    lookups = ((term, buildImageURLs(term)) for term in phrases)
    return OrderedDict(lookups)
# I've stuffed everything else in here for the time being
def main():
    """Parse a sample headline, extract its noun phrases and list images for each.

    Prints the parse tree, then the extracted 'NP' subtrees, then one section
    per phrase: the phrase followed by a colon, its image URLs one per line,
    and a blank line.
    """
    headline = "Smoking Mothers May Alter the DNA of Their Children."
    parseTree = Parser().parse(headline)
    print("Parse Tree:\n" + str(parseTree) + "\n")

    nounPhraseTrees = extractTaggedPhrases(parseTree, 'NP')
    print("Extracted Phrases:\n" + str(nounPhraseTrees) + "\n")

    # Flatten each noun-phrase subtree back into its plain-text words.
    phrases = [" ".join(subtree.leaves()) for subtree in nounPhraseTrees]

    for phrase, images in buildImagesDict(phrases).items():
        print(phrase + ":")
        print("\n".join(images))
        # Blank separator line between phrases.
        print("")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()