# corriere_della_sera_en.recipe

#!/usr/bin/env  python
__license__   = 'GPL v3'
__author__    = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__     = 'v1.02'
__date__        = '14, March 2010'
__description__ = 'Italian daily newspaper (english version)'
# NOTE: the article URLs in the feed are broken on the main site because the permalink structure was changed erroneously, e.g.
# link as published in the feed: http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
# this needs to be changed to
# the real article URL: http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml
'''
http://www.corriere.it/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ilCorriereEn(BasicNewsRecipe):
    '''
    Recipe for the English edition of the Italian daily "Corriere della Sera".

    The RSS feed publishes article links using a legacy permalink layout;
    get_article_url() rewrites each of them into the layout under
    /International/english/articoli/ before the article is fetched.
    '''
    author        = 'Lorenzo Vigentini, based on Darko Miletic'
    description   = 'Italian daily newspaper (english version)'

    cover_url      = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title          = u'Il Corriere della sera (english) '
    publisher      = 'RCS Digital'
    category       = 'News, politics, culture, economy, general interest'

    language       = 'en'
    timefmt        = '[%a, %d %b, %Y]'

    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content  = False
    # NOTE(review): calibre documents this option as `recursions`; this
    # spelling is presumably ignored. Confirm before renaming, since enabling
    # link recursion would change what gets downloaded.
    recursion             = 10

    remove_javascript = True
    no_stylesheets = True

    # Italian month names as they appear in the legacy permalinks, mapped to
    # the two-digit month number used by the real article URLs.
    _MONTH_NUMBERS = {
        'gennaio': '01', 'febbraio': '02', 'marzo': '03', 'aprile': '04',
        'maggio': '05', 'giugno': '06', 'luglio': '07', 'agosto': '08',
        'settembre': '09', 'ottobre': '10', 'novembre': '11', 'dicembre': '12',
    }

    def get_article_url(self, article):
        '''
        Rewrite the legacy permalink found in the feed into the real URL.

        A feed link such as
            http://host/english/10_marzo_11/some_title_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
        becomes
            http://host/International/english/articoli/2010/03/11/some_title.shtml

        :param article: feed entry exposing ``get('link')``.
        :return: the rewritten URL, or the feed's link unchanged (possibly
                 None) when it does not follow the legacy layout.
        '''
        link = article.get('link')
        if not link:
            return link

        segments = link.split('/')
        try:
            # segments[4] is the date folder, e.g. '10_marzo_11'
            # (two-digit year, Italian month name, day).
            year, month_name, day = segments[4].split('_')[:3]
            month = self._MONTH_NUMBERS[month_name]
            page = segments[5]
        except (IndexError, ValueError, KeyError):
            # Not a legacy permalink (too few path segments, no date folder,
            # or an unknown month name): leave the link as published.
            return link

        # Scheme and host of the original link plus the new fixed path.
        base = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
        date_path = '20' + year + '/' + month + '/' + day + '/'

        # Drop the trailing identifier from the page name. In the known
        # example it is '_' + a five-group hex id: rsplit removes the last
        # four groups (and the extension) without touching hyphens that may
        # occur in the title itself, and [:-9] removes '_' plus the
        # eight-character first group.
        page_name = page.rsplit('-', 4)[0][:-9] + '.shtml'

        return base + date_path + page_name

    keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]

    remove_tags = [
                   dict(name=['base','object','link','embed']),
                   dict(name='div', attrs={'class':'news-goback'}),
                   dict(name='ul', attrs={'class':'toolbar'})
                  ]

    remove_tags_after = dict(name='p', attrs={'class':'footnotes'})

    feeds = [
             (u'News'  , u'http://www.corriere.it/rss/english.xml'  )
            ]