Updated to calibre 0.8.32

readbeam · Dec 23, 2011 · fca9bfb · fca9bfb
1 parent 3403377
commit fca9bfb
Show file tree

Hide file tree

Showing 33 changed files with 1,933 additions and 649 deletions.
diff --git a/adventure_zone_pl.recipe b/adventure_zone_pl.recipe
@@ -1,19 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
 class Adventure_zone(BasicNewsRecipe):
     title          = u'Adventure Zone'
     __author__        = 'fenuks'
     description   = 'Adventure zone - adventure games from A to Z'
     category       = 'games'
     language       = 'pl'
-    oldest_article = 15
-    max_articles_per_feed = 100
     no_stylesheets = True
+    oldest_article = 20
+    max_articles_per_feed = 100
+    use_embedded_content=False
+    preprocess_regexps     = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
     remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
-    remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'})
+    remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
+    remove_tags_after= dict(id='comments')
     extra_css              = '.main-bg{text-align: left;}  td.capmain{ font-size: 22px; }'
     feeds          = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
 
+    def parse_feeds(self):
+        """Return the parsed feeds with article titles re-read from the raw RSS.
+
+        The regular parsing is delegated to ``BasicNewsRecipe.parse_feeds``.
+        Afterwards the news feed is fetched again through ``index_to_soup`` and
+        the n-th article of every feed receives the text of the n-th ``<item>``
+        title found there.
+        """
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
+        channel = soup.find(name='channel')
+        if channel is None:
+            # No channel to read titles from: keep what the base parser produced.
+            return feeds
+        # Remove <image> elements before the items are collected.
+        for image in channel.findAll(name='image'):
+            image.extract()
+        # One entry per <item>, None where an item has no usable title, so the
+        # positions stay aligned with the article lists.
+        titles = [item.title.string if item.title is not None else None
+                  for item in channel.findAll(name='item')]
+        for feed in feeds:
+            # Pair by position. zip() stops at the shorter sequence, so a feed
+            # with more articles than re-read titles no longer raises IndexError,
+            # and the per-article list.index() scan is gone.
+            for article, title in zip(feed.articles, titles):
+                if title:
+                    article.title = title
+        return feeds
+
+
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover=soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
 
 
     def skip_ad_pages(self, soup):
-        skip_tag = soup.body.findAll(name='a')
-        if skip_tag is not None:
-            for r in skip_tag:
-                 if 'articles.php?' in r['href']:
-                     if r.strong is not None:
-                         word=r.strong.string
-                         if ('zapowied' or 'recenzj') in word:
-                             return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
-        else:
-            None
-
-    def print_version(self, url):
-        return url.replace('news.php?readmore', 'print.php?type=N&item_id')
-
+        """Return the raw print page of a linked article, or None.
+
+        Searches the ``td`` with class ``main-bg`` for the first link whose
+        ``<strong>`` text contains 'zapowied', 'recenzj' or 'solucj' and whose
+        href carries an ``article_id``; the matching print.php page is fetched
+        with ``index_to_soup(..., raw=True)`` and returned. None is returned
+        when the page has no such cell or no such link.
+        """
+        body = soup.body
+        if body is None:
+            return None
+        main_cell = body.find(name='td', attrs={'class':'main-bg'})
+        if main_cell is None:
+            return None
+        # href=True skips anchors without an href attribute.
+        for link in main_cell.findAll(name='a', href=True):
+            if not link.strong:
+                continue
+            word = link.strong.string
+            if not word or not any(key in word for key in ('zapowied', 'recenzj', 'solucj')):
+                continue
+            href = link['href']
+            pos = href.find('article_id')
+            if pos == -1:
+                # Without an article id no print URL can be built.
+                continue
+            # Skipping the 7 characters of 'article' leaves '_id...', which
+            # completes the 'item' prefix below to 'item_id...'.
+            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item' + href[pos+7:], raw=True)
+        return None
diff --git a/astro_news_pl.recipe b/astro_news_pl.recipe
@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class AstroNEWS(BasicNewsRecipe):
     title          = u'AstroNEWS'
     __author__        = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
     language       = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    auto_cleanup = True
+    #extra_css= 'table {text-align: left;}'
+    no_stylesheets=True
     cover_url='http://news.astronet.pl/img/logo_news.jpg'
-   # no_stylesheets= True
+    remove_tags=[dict(name='hr')]
     feeds          = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
 
     def print_version(self, url):
         return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
 
+    def preprocess_html(self, soup):
+        """Delete the ``align`` attribute from every tag that has one and return the soup."""
+        aligned_tags = soup.findAll(align=True)
+        for tag in aligned_tags:
+            del tag['align']
+        return soup
diff --git a/biolog_pl.recipe b/biolog_pl.recipe
@@ -0,0 +1,19 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class Biolog_pl(BasicNewsRecipe):
+    # --- recipe metadata ---
+    title = u'Biolog.pl'
+    __author__ = 'fenuks'
+    description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
+    category = 'biology'
+    language = 'pl'
+    cover_url = 'http://www.biolog.pl/naukowy,portal,biolog.png'
+
+    # --- download limits ---
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+
+    # --- page clean-up selectors ---
+    no_stylesheets = True
+    remove_tags_before = dict(id='main')
+    remove_tags_after = dict(name='a', attrs={'name':'komentarze'})
+    remove_tags = [dict(name='img', attrs={'alt':'Komentarze'})]
+
+    # --- (feed title, feed URL) pairs ---
+    feeds = [
+        (u'Wszystkie', u'http://www.biolog.pl/backend.php'),
+        (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'),
+        (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'),
+        (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'),
+        (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'),
+        (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'),
+        (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php'),
+    ]
diff --git a/birmingham_post.recipe b/birmingham_post.recipe
@@ -0,0 +1,44 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+class AdvancedUserRecipe1306097511(BasicNewsRecipe):
+    # Recipe for the Birmingham Post RSS feeds listed in ``feeds``.
+    title = u'Birmingham post'
+    description = 'News for Birmingham UK'
+    timefmt = ''
+    __author__ = 'Dave Asbury'
+    language = 'en_GB'
+    cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
+    masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
+
+    oldest_article = 1
+    max_articles_per_feed = 20
+    remove_empty_feeds = True
+    remove_javascript = True
+    auto_cleanup = True
+
+    # Both selector lists are intentionally empty; the entries the author
+    # experimented with are kept as notes.
+    keep_only_tags = [
+        #dict(name='h1',attrs={'id' : 'article-headline'}),
+        #dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
+        #dict(name='p')
+        #dict(attrs={'id' : 'three-col'})
+    ]
+    remove_tags = [
+        # dict(name='div',attrs={'class' : 'span-33 last header-links'})
+    ]
+
+    feeds = [
+        #(u'News',u'http://www.birminghampost.net/news/rss.xml'),
+        (u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
+        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
+        (u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
+        (u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
+    ]
+
+    # Fixes relative to the first version of this stylesheet: the stray quote
+    # after the ``body`` rule is removed, and the ``font`` shorthand now lists
+    # the size before the family as the shorthand requires.
+    extra_css = '''
+        body {font: medium sans-serif;}
+        h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
+        h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
+        span{ font-size:9.5px; font-weight:bold;font-style:italic}
+        p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+    '''
diff --git a/computerworld_pl.recipe b/computerworld_pl.recipe
@@ -0,0 +1,22 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class Computerworld_pl(BasicNewsRecipe):
+    # Recipe for the Computerworld.pl news feed.
+    title          = u'Computerworld.pl'
+    __author__        = 'fenuks'
+    description   = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
+    category       = 'IT'
+    language       = 'pl'
+    no_stylesheets=True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    keep_only_tags=[dict(name='div', attrs={'id':'s'})]
+    remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
+    remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
+    feeds          = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
+
+    def get_cover_url(self):
+        """Return the cover image URL taken from the site's front page.
+
+        The front page is searched for an ``<img class="prawo">``; its ``src``
+        is stored in ``self.cover_url``. When no such image (or no ``src``) is
+        found, whatever ``cover_url`` already holds is returned, or None.
+        """
+        soup = self.index_to_soup('http://www.computerworld.pl/')
+        cover = soup.find(name='img', attrs={'class':'prawo'})
+        # soup.find() yields None when the page layout changes; subscripting
+        # that used to abort the whole download with a TypeError.
+        if cover is not None and cover.get('src'):
+            self.cover_url = cover['src']
+        return getattr(self, 'cover_url', None)
diff --git a/cosmopolitan_uk.recipe b/cosmopolitan_uk.recipe
@@ -7,6 +7,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
 
     __author__ = 'Dave Asbury'
+    #last update 21/12/11
     # greyscale code by Starson
     cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
     no_stylesheets = True
@@ -31,8 +32,9 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
                               dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
                               dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
                               dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
-                              dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']})
-                            ]
+                              dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
+                              dict(name='li',attrs={'class' : 'thumb'})
+              ]
 
     feeds          = [
         (u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
@@ -48,4 +50,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
             img.type = "GrayscaleType"
             img.save(iurl)
         return soup
-
diff --git a/datasport.recipe b/datasport.recipe
@@ -0,0 +1,15 @@
+__license__   = 'GPL v3'
+__author__    = 'faber1971'
+description   = 'Italian soccer news website - v1.00 (17, December 2011)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1324114272(BasicNewsRecipe):
+    # Recipe for the Datasport soccer RSS feed.
+    title          = u'Datasport'
+    # The module-level ``description`` above is only a module global and never
+    # becomes an attribute of this class. Re-bind it here so the recipe carries
+    # it; in a class body the right-hand side resolves to the module global.
+    description = description
+    language = 'it'
+    __author__ = 'faber1971'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    feeds          = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]
diff --git a/dziennik_pl.recipe b/dziennik_pl.recipe
@@ -0,0 +1,58 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+class Dziennik_pl(BasicNewsRecipe):
+    # Recipe for the Dziennik.pl RSS feeds; multi-page articles are stitched
+    # together by append_page().
+    title          = u'Dziennik.pl'
+    __author__        = 'fenuks'
+    description   = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
+    category       = 'newspaper'
+    language       = 'pl'
+    cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
+    no_stylesheets = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_javascript=True
+    remove_empty_feeds=True
+    preprocess_regexps     = [(re.compile("Komentarze:"), lambda m: '')]
+    keep_only_tags=[dict(id='article')]
+    # append_page() re-applies these selectors to every appended page and reads
+    # both the 'name' and the 'attrs' key of each entry.
+    remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
+    feeds          = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
+                      (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
+                      (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
+                      (u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
+                      (u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
+                      (u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
+                      (u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
+                      (u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
+                      (u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
+                      (u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
+                      (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
+                      (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
+
+    def _drop_paginator(self, container):
+        """Remove the 'article_paginator' div from *container* if it is there."""
+        paginator = container.find('div', attrs={'class':'article_paginator'})
+        if paginator is not None:
+            paginator.extract()
+
+    def append_page(self, soup, appendtag):
+        """Append the body of every follow-up page of an article to *appendtag*.
+
+        soup      -- parsed first page; searched for the 'page_next' link
+        appendtag -- tag that receives the 'article_body' div of each further page
+
+        Pages are followed through their 'page_next' links until a page has
+        none. Missing paginator or body elements are skipped instead of
+        raising AttributeError.
+        """
+        tag = soup.find('a', attrs={'class':'page_next'})
+        if tag:
+            self._drop_paginator(appendtag)
+        while tag:
+            soup2 = self.index_to_soup(tag['href'])
+            # Look up the next link first so the loop advances even when this
+            # page turns out to have no usable body.
+            tag = soup2.find('a', attrs={'class':'page_next'})
+            if not tag:
+                # Last page reached: drop the 'art_src' divs collected so far
+                # before the final page is appended.
+                for r in appendtag.findAll('div', attrs={'class':'art_src'}):
+                    r.extract()
+            pagetext = soup2.find(name='div', attrs={'class':'article_body'})
+            if pagetext is None:
+                continue
+            for dictionary in self.remove_tags:
+                for delete in pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs']):
+                    delete.extract()
+            appendtag.insert(len(appendtag.contents), pagetext)
+            self._drop_paginator(appendtag)
+
+    def preprocess_html(self, soup):
+        """Merge all pages of the article into ``soup.body`` and return the soup."""
+        self.append_page(soup, soup.body)
+        return soup
diff --git a/echo_online.recipe b/echo_online.recipe
@@ -0,0 +1,47 @@
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
+'''
+Fetch echo-online.de
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Echo_Online(BasicNewsRecipe):
+    # --- recipe metadata ---
+    title = u' Echo Online'
+    description = '-Echo Online-'
+    publisher = 'Echo Online GmbH'
+    category = 'News, Germany'
+    __author__ = 'Armin Geller' # 2011-12-17
+    language = 'de'
+    lang = 'de-DE'
+    encoding = 'iso-8859-1'
+    timefmt = ' [%a, %d %b %Y]'
+    # 2011-12-16 AGe; earlier candidate:
+    # 'http://profile.ak.fbcdn.net/hprofile-ak-ash2/41801_145340745513489_893927_n.jpg'
+    cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'
+
+    # --- download limits ---
+    oldest_article = 7
+    max_articles_per_feed = 2
+
+    # --- page clean-up ---
+    no_stylesheets = True
+    auto_cleanup = True
+    remove_javascript = True
+    remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
+    auto_cleanup_keep = '//div[@class="bild_gross w270"]'
+
+    # --- (feed title, feed URL) pairs ---
+    feeds = [
+        (u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
+        (u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
+        (u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
+        (u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
+        (u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
+        (u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
+        (u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
+        (u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
+        (u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
+        (u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
+        (u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
+    ]
+
+    def print_version(self, url):
+        """Open *url* and return the URL reported by the response plus the print query string."""
+        response = self.browser.open_novisit(url)
+        resolved_url = response.geturl()
+        return resolved_url + '?_FRAME=33&_FORMAT=PRINT'
+
diff --git a/elet_es_irodalom.recipe b/elet_es_irodalom.recipe
@@ -0,0 +1,48 @@
+################################################################################
+#Description:	  http://es.hu/ RSS channel
+#Author: 	  Bigpapa (bigpapabig@hotmail.com)
+#Date:	  2010.12.01. - V1.0
+################################################################################
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class elet_es_irodalom(BasicNewsRecipe):
+    # Recipe for the es.hu feeds listed in ``feeds``.
+    title                  = u'Elet es Irodalom'
+    __author__             = 'Bigpapa'
+    oldest_article         = 7
+    max_articles_per_feed  = 20  # Maximum number of articles per feed stored in the e-book.
+    no_stylesheets         = True
+    #delay                  = 1
+    use_embedded_content   = False
+    encoding               = 'iso-8859-2'
+    category               = 'Cikkek'
+    language               = 'hu'
+    publication_type       = 'newsportal'
+    extra_css              = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
+    ]
+
+    # The style selector used to be listed twice with the identical value; one
+    # entry is enough.
+    remove_tags = [
+        dict(name='a', attrs={'target':['_TOP']}),
+        dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;']}),
+    ]
+
+    feeds = [
+        (u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
+        (u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
+        (u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
+        (u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
+        (u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
+        (u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
+        (u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
+        (u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
+        (u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
+    ]
diff --git a/emuzica_pl.recipe b/emuzica_pl.recipe
@@ -0,0 +1,16 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class eMuzyka(BasicNewsRecipe):
+    # --- recipe metadata ---
+    title = u'eMuzyka'
+    __author__ = 'fenuks'
+    description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
+    category = 'music'
+    language = 'pl'
+    cover_url = 'http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
+
+    # --- download limits ---
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    # --- page clean-up selectors ---
+    no_stylesheets = True
+    keep_only_tags = [
+        dict(name='div', attrs={'id':'news_container'}),
+        dict(name='h3'),
+        dict(name='div', attrs={'class':'review_text'}),
+    ]
+    remove_tags = [dict(name='span', attrs={'id':'date'})]
+
+    # --- (feed title, feed URL) pairs ---
+    feeds = [
+        (u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'),
+        (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2'),
+    ]