/
el_pais_babelia.recipe
49 lines (44 loc) · 1.95 KB
/
el_pais_babelia.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from calibre.web.feeds.news import BasicNewsRecipe
class ElPaisBabelia(BasicNewsRecipe):
    """Recipe that builds its feeds from the El Pais Babelia index page.

    The index at ``INDEX`` is scraped by :meth:`parse_index`; the remaining
    class attributes are tag filters and switches read by the base class.
    """

    title = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    INDEX = 'http://www.elpais.com/suple/babelia/'
    language = 'es'

    remove_tags_before = dict(name='div', attrs={'class': 'estructura_2col'})
    # NOTE(review): BasicNewsRecipe documents ``keep_only_tags``; confirm
    # whether ``keep_tags`` is honoured at all before relying on it.
    keep_tags = [dict(name='div', attrs={'class': 'estructura_2col'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'votos estirar'}),
        dict(name='div', attrs={'id': 'utilidades'}),
        dict(name='div', attrs={'class': 'info_relacionada'}),
        dict(name='div', attrs={'class': 'mod_apoyo'}),
        dict(name='div', attrs={'class': 'contorno_f'}),
        dict(name='div', attrs={'class': 'pestanias'}),
        dict(name='div', attrs={'class': 'otros_webs'}),
        dict(name='div', attrs={'id': 'pie'}),
    ]
    # no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        """Build the feed list from the links on the index page.

        Every ``div`` with class ``contenedor_nuevo`` is treated as one
        section, titled with the text of its first ``h1``. Within a section,
        an ``<a href>`` link becomes an article only when its href is
        site-relative (starts with ``/``) and the anchor itself carries a
        non-empty ``class`` attribute. Each accepted link is also written to
        the recipe log.

        NOTE(review): this nesting of the filters was reconstructed from a
        copy of the recipe that had lost its indentation -- confirm it matches
        the intended selection of links.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a non-empty list of ``{'title': ..., 'url': ...}`` dicts.
            Sections without any accepted link are omitted.
        """
        feeds = []
        soup = self.index_to_soup(self.INDEX)
        for section in soup.findAll('div', attrs={'class': 'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                if not url.startswith('/'):
                    continue
                # NOTE(review): links are prefixed with the elpais.es host
                # although INDEX lives on elpais.com -- confirm which host is
                # intended.
                url = 'http://www.elpais.es' + url

                # Ask the anchor for its own attribute rather than searching
                # the serialized markup for 'class=': a class on a child tag
                # would match the text search and then post['class'] would
                # raise KeyError.
                klass = post.get('class')
                if not klass:
                    continue

                title = self.tag_to_string(post)
                self.log()
                self.log('--> post: ', post)
                self.log('--> url: ', url)
                self.log('--> title: ', title)
                self.log('--> class: ', klass)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section_title, articles))
        return feeds