Permalink
Browse files

refactoring

1. add images to article
2. add vim tags and footline
3. refactoring for more efficiency
  • Loading branch information...
1 parent 8cd3045 commit df9e7f6202b8c073a1295fe1cf6633b2b40dcb01 @stoneyrh committed Mar 9, 2012
Showing with 40 additions and 18 deletions.
  1. +40 −18 web2py_online_book.py
View
@@ -3,14 +3,24 @@
class WebArticle(object):
def __init__(self):
- self.title = ""
- self.content = ""
+ self.title_ = ""
+ self.content_ = ""
+ self.images_ = []
+
+ def title(self):
+ return self.title_
+
+ def content(self):
+ return self.content_
def set_title(self, title):
- self.title = title
+ self.title_ = title
def append(self, content):
- self.content += content
+ self.content_ += content
+
+ def add_image(self, url):
+ self.images_.append(url)
class WebDocParser(HTMLParser, object):
def __init__(self, url):
@@ -21,28 +31,29 @@ def __init__(self, url):
self.levels_ = 0
def article(self):
- return self.article_ if self.article_.title and self.article_.content else None
+ return self.article_ if self.article_.title() and self.article_.content() else None
def handle_starttag(self, tag, attrs):
- # If the title is still empty, then try to get it
- if not self.article_.title and tag == 'a':
- href, url = attrs[0]
- if self.url_.endswith(url):
- self.consumer_ = self.article_.set_title
- if self.levels_ == 0 and tag == 'div':
- for attr, value in attrs:
- if attr == 'class' and value == 'article':
- self.consumer_ = self.article_.append
- self.levels_ = 1
- if self.levels_ > 0:
+ if self.levels_ == 0:
+ if tag == 'div':
+ for attr, value in attrs:
+ if attr == 'class' and value == 'article':
+ self.consumer_ = self.article_.append
+ self.levels_ = 1
+ # If the title is still empty, then try to get it
+ elif tag == 'a' and not self.article_.title():
+ href, url = attrs[0]
+ if self.url_.endswith(url):
+ self.consumer_ = self.article_.set_title
+ elif self.levels_ > 0:
self.levels_ = self.levels_ + 1
if tag == 'p':
self.article_.append("\n")
def handle_endtag(self, tag):
if self.levels_ > 0:
self.levels_ = self.levels_ - 1
- if self.levels_ == 0:
+ elif self.levels_ == 0:
self.consumer_ = None
def handle_startendtag(self, tag, attrs):
@@ -53,6 +64,7 @@ def handle_startendtag(self, tag, attrs):
for attr, value in attrs:
if attr == 'src':
self.article_.append("\n**image** %s\n" % value)
+ self.article_.add_image(value)
break
def handle_data(self, data):
@@ -81,10 +93,20 @@ def main():
article = article_from(url)
if article:
articles.append(article)
+ print ' ' * 10, article.title()
if articles:
+ headline = 'web2py online book'
+ footline = 'vim:tw=78:fo=tcq2:isk=!-~,^*,^\|,^\":ts=8:ft=help:norl:'
+ index = '\n'.join(['|' + article.title().replace(' ', '-') + '|' for article in articles])
book = open('web2py_online_book.txt', 'w')
+ book.write(headline)
+ book.write('\n')
+ book.write('\n')
+ book.write(index)
for article in articles:
- book.write("\n".join([article.title, article.content]))
+ book.write('\n'.join(['*' + article.title().replace(' ', '-') + '*', article.content()]))
+ book.write('\n')
+ book.write(footline)
book.close()
if __name__ == '__main__':

0 comments on commit df9e7f6

Please sign in to comment.