Browse files

Add a script to scan for favicons

  • Loading branch information...
1 parent 70810ff commit 68d8233010efd671f4c94b128b761132fac6a366 @rubys committed Feb 10, 2011
Showing with 79 additions and 0 deletions.
  1. +79 −0 favicon.py
View
79 favicon.py
@@ -0,0 +1,79 @@
+import sys, socket
+from planet import config, feedparser
+from planet.spider import filename
+from urllib2 import urlopen
+from urlparse import urljoin
+from html5lib import html5parser, treebuilders
+from ConfigParser import ConfigParser
+
# Load every config file named on the command line; when the script is run
# with no arguments at all, fall back to config.ini.
if len(sys.argv) == 1:
    config_files = ['config.ini']
else:
    config_files = sys.argv[1:]
for config_file in config_files:
    config.load(config_file)
+
+from Queue import Queue
+from threading import Thread
+
# Queue one (subscription, page URL) pair for every subscription whose cached
# feed has no icon of its own but does link to an HTML page.
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    feed = feedparser.parse(filename(sources, sub)).feed
    if feed.get('icon') or not feed.get('links'):
        continue
    html_pages = (
        link.href for link in feed.links
        if link.rel == 'alternate' and link.type in html
    )
    # only the first matching alternate link is queued
    for href in html_pages:
        fetch_queue.put((sub, href))
        break
+
# find the favicon for a given webpage
def favicon(page):
    """Return the favicon URL for the web page at `page`, or None.

    The page is fetched and parsed; the first <link> element whose `rel`
    token list contains `icon` wins, with its `href` resolved against
    `page`.  If there is no such element, `/favicon.ico` on the same host
    is requested and returned unless the server reports it as empty.

    Errors raised by `urlopen` or the parser (unreachable host, HTTP error
    status, timeout, ...) propagate to the caller.
    """
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    response = urlopen(page)
    try:
        doc = parser.parse(response)
    finally:
        # release the connection even when parsing fails
        response.close()

    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            # rel is a case-insensitive, whitespace-separated token list,
            # e.g. "Shortcut Icon"
            if 'icon' in link.getAttribute('rel').lower().split():
                return urljoin(page, link.getAttribute('href'))

    fallback = urljoin(page, '/favicon.ico')
    response = urlopen(fallback)
    try:
        # .get(): the header is absent on e.g. chunked responses, which
        # should not disqualify the icon
        length = response.info().get('content-length')
    finally:
        response.close()
    if length != '0':
        return fallback
    return None
+
# thread worker that fills in the dictionary which maps subs to favicon
icons = {}
def fetch(thread_index, fetch_queue, icons):
    """Drain `fetch_queue`, recording the favicon found for each subscription.

    thread_index -- number of this worker, used only in diagnostics
    fetch_queue  -- queue of (sub, page_url) pairs; a pair whose page_url is
                    None tells the worker to stop
    icons        -- dictionary updated in place: sub -> favicon URL

    A failed lookup is reported on stderr and skipped, so one bad page
    never stops the worker.
    """
    while True:
        sub, page = fetch_queue.get()
        if page is None:
            # stop sentinel
            break
        if not page:
            # An empty URL is not the sentinel: skip it instead of shutting
            # the worker down and stranding the rest of the queue.
            continue
        try:
            icon = favicon(page)
        except Exception as error:
            # %r keeps the message ASCII-safe whatever the URL contains;
            # stdout is reserved for the generated config, so use stderr.
            sys.stderr.write('favicon worker %r: lookup failed for %r: %r\n'
                             % (thread_index, page, error))
            continue
        if icon:
            icons[sub] = icon
+
# set timeout
try:
    socket.setdefaulttimeout(float(config.feed_timeout()))
except Exception as error:
    # Best effort: carry on without a default timeout, but say so, since a
    # stalled server can then block a fetch indefinitely.  Catching
    # Exception rather than everything lets Ctrl-C still abort the script.
    sys.stderr.write('could not apply feed_timeout: %r\n' % (error,))
+
# Fetch the pages: inline when spider_threads is 0, otherwise on that many
# worker threads.  One (None, None) stop marker is queued per worker so each
# of them exits once the real work is drained.
worker_count = int(config.spider_threads())
threads = {}
if not worker_count:
    fetch_queue.put((None, None))
    fetch(0, fetch_queue, icons)
else:
    for index in range(worker_count):
        worker = Thread(target=fetch, args=(index, fetch_queue, icons))
        threads[index] = worker
        fetch_queue.put((None, None))
        worker.start()
    for index in range(worker_count):
        threads[index].join()
+
# produce config file: one section per subscription, holding its favicon URL
# (a separate name is used so the planet `config` module stays usable)
favicon_config = ConfigParser()
# sorted so that the sections are added in the same order on every run
for sub, icon in sorted(icons.items()):
    favicon_config.add_section(sub)
    favicon_config.set(sub, 'favicon', icon)
favicon_config.write(sys.stdout)

0 comments on commit 68d8233

Please sign in to comment.