-
Notifications
You must be signed in to change notification settings - Fork 15
/
iol.py
51 lines (39 loc) · 1.77 KB
/
iol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
from .sitemap import SitemapSpider
from scrapenews.items import ScrapenewsItem
from datetime import datetime
import pytz
# South African Standard Time zone; IOLSpider.parse uses it to attach a
# timezone to the naive publication timestamps parsed from article pages.
SAST = pytz.timezone('Africa/Johannesburg')
class IOLSpider(SitemapSpider):
    """Spider that scrapes news articles from www.iol.co.za.

    The class attributes below configure the project-local ``SitemapSpider``
    base class (presumably with the same meaning as in Scrapy's own
    ``SitemapSpider`` -- confirm against ``.sitemap``). ``parse`` turns an
    article page into a ``ScrapenewsItem``.
    """

    name = 'iol'
    allowed_domains = ['www.iol.co.za']
    sitemap_urls = ['https://www.iol.co.za/robots.txt']
    # Patterns selecting which sitemaps are followed. The first one accepts
    # URLs under /news/ only when the rest of the URL does not contain
    # "eish" (negative lookahead applied at every character).
    sitemap_follow = [
        '^https://www.iol.co.za/news/((?!eish).)*$',
        'www.iol.co.za/business-report',
        'www.iol.co.za/politics',
        'www.iol.co.za/personal-finance',
    ]
    publication_name = 'IOL News'

    def parse(self, response):
        """Yield a ``ScrapenewsItem`` for the article in ``response``.

        Nothing is yielded when the page has no ``articleBody`` element or
        when its ``datePublished`` value is missing or cannot be parsed
        (the latter is logged as a warning instead of raising).

        :param response: the downloaded page; must offer ``xpath()`` and
            ``url``.
        """
        title = response.xpath('//header/h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)

        article_body = response.xpath('//div[@itemprop="articleBody"]')
        if not article_body:
            # Not an article page (or the markup changed): nothing to emit.
            return

        publication_date = self._parse_publication_date(response)
        if publication_date is None:
            return

        body_html = article_body.extract_first()
        byline = response.xpath(
            '//span[@itemprop="author"]/strong/text()'
        ).extract_first()

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = publication_date.isoformat()
        # Naive UTC timestamp, kept as-is so the stored format is unchanged.
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = response.url
        # Last path segment of the URL.
        item['file_name'] = response.url.split('/')[-1]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item

    def _parse_publication_date(self, response):
        """Return the article's publication time localized to SAST.

        Returns ``None`` (after logging a warning) when the page has no
        ``datePublished`` span or its content does not start with a
        ``YYYY-MM-DDTHH:MM`` timestamp.
        """
        raw_date = response.xpath(
            '//span[@itemprop="datePublished"]/@content'
        ).extract_first()
        if raw_date is None:
            self.logger.warning(
                'No publication date found on %s', response.url)
            return None

        # Only the first 16 characters (date plus hours and minutes) are
        # used; anything after that is discarded.
        trimmed = raw_date.strip()[:16]
        try:
            naive_date = datetime.strptime(trimmed, '%Y-%m-%dT%H:%M')
        except ValueError:
            self.logger.warning(
                'Unparseable publication date %r on %s',
                raw_date, response.url)
            return None
        return SAST.localize(naive_date)