-
Notifications
You must be signed in to change notification settings - Fork 27
/
article_extractor.py
268 lines (226 loc) · 12.4 KB
/
article_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# -*- coding: utf-8 -*-
'''
Created on 2017-11-27 16:41
---------
@summary: 正文提取
---------
@author: Boris
'''
import sys
sys.path.append('..')
import init
import re
import utils.tools as tools
from utils.log import log
from extractor.config import *
class ArticleExtractor():
    '''
    @summary: Extract the title, body text, author and release time of an
              article-style web page using a text-density heuristic.
    ---------
    The class relies on module-level names that are not defined in this
    block: ``tools`` (utils.tools) and several constants such as
    ``USEFUL_TAG``, ``SPECIAL_TITLE`` and ``DAY_TIME_REGEXS``
    (presumably provided by ``from extractor.config import *`` -- confirm).

    Typical usage: call ``get_content()`` first; ``get_release_time()`` only
    searches the paragraphs recorded by a successful ``get_content()`` call.
    '''

    # Matches one tag, including tags whose attributes span several lines.
    # ``[\s\S]`` means "any character"; it replaces the former ``(.|\n)``,
    # which matches the same text but needs a capture group per character.
    _ANY_TAG_REGEX = r'<[\s\S]*?>'

    def __init__(self, url, html=None, language='zh'):
        '''
        @summary: Store the page and pre-compute its tag-reduced text.
        ---------
        @param url: address of the page; used to download it when no html is
                    given and to pick a site specific title regex
        @param html: page source; when empty it is fetched with tools.get_html
        @param language: accepted for compatibility, not used by this class
        ---------
        @result:
        '''
        self._url = url
        # A failed download is normalised to '' so the regex helpers below
        # never receive None.
        self._html = html or tools.get_html(url) or ''

        # Filled in by get_content(). None means "not located yet"; unlike the
        # former '' sentinel it cannot be confused with paragraph index 0.
        self._content_start_pos = None
        self._content_end_pos = None
        self._content_center_pos = None
        self._paragraphs = None

        self._text = self.__del_html_tag(self._html, save_useful_tag=True)

    def __replace_str(self, source_str, regex, replace_str=''):
        '''
        @summary: Replace every match of a regular expression in a string.
        ---------
        @param source_str: original string
        @param regex: regular expression to search for
        @param replace_str: replacement text, '' by default
        ---------
        @result: the string after replacement
        '''
        return re.sub(regex, replace_str, source_str)

    def __del_html_tag(self, html, save_useful_tag=False):
        '''
        @summary: Remove scripts, styles, comments, character entities and tags.
        ---------
        @param html: markup to clean; None or '' yields ''
        @param save_useful_tag: keep the tags matched by USEFUL_TAG
                                (e.g. img and p tags, per the original note)
        ---------
        @result: cleaned text, stripped of leading and trailing whitespace
        '''
        if not html:
            return ''

        html = self.__replace_str(html, r'(?i)<script[\s\S]*?</script>')  # (?i): ignore case
        html = self.__replace_str(html, r'(?i)<style[\s\S]*?</style>')
        html = self.__replace_str(html, r'<!--[\s\S]*?-->')
        # Turn entities such as &nbsp; into a space, but leave "&name=" alone
        # because that form is a URL query parameter.
        html = self.__replace_str(html, r'(?!&[a-z]+=)&[a-z]+;?', ' ')

        if save_useful_tag:
            useful_tag = '|'.join(USEFUL_TAG)
            html = self.__replace_str(html, r'(?!{useful_tag})<[\s\S]+?>'.format(useful_tag=useful_tag))
        else:
            html = self.__replace_str(html, self._ANY_TAG_REGEX)

        # Drop whitespace other than spaces and newlines; newlines are kept
        # because get_content() splits paragraphs on them.
        html = self.__replace_str(html, r'[\f\r\t\v]')
        return html.strip()

    def __del_unnecessary_character(self, content):
        '''
        @summary: Remove superfluous spaces and line breaks.
        ---------
        @param content: extracted content, may still contain useful tags
        ---------
        @result: cleaned content
        '''
        content = content.strip()
        if content.startswith('</'):
            # Drop a dangling closing tag at the very beginning.
            content = content[content.find('>') + 1:]
        # Runs of two or more spaces are removed entirely.
        content = self.__replace_str(content, r' {2,}', '')
        # A whitespace run that does not start with a space becomes one newline.
        return self.__replace_str(content, r'(?! )\s+', '\n')

    def get_title(self):
        '''
        @summary: Extract the page title.
        ---------
        Order of attempts: the site specific regex from SPECIAL_TITLE whose
        domain occurs in the url, then the <title> tag cut at the first '_',
        '-' and '|' (in that order), then the first h1..h4 heading.
        ---------
        @result: title passed through tools.del_html_tag
        '''
        title = ''

        # Sites with irregular titles have their own regex.
        for domain, regex in SPECIAL_TITLE.items():
            if domain in self._url:
                title = tools.get_info(self._html, regex, fetch_one=True) or ''
                break

        if not title:
            title = tools.get_info(self._html, '(?i)<title.*?>(.*?)</title>', fetch_one=True) or ''
            # Keep only the part before each separator; what follows is
            # presumably the site name.
            for separator in ('_', '-', '|'):
                title = title.partition(separator)[0]

        if not title:
            heading_regexs = ['<h1.*?>(.*?)</h1>', '<h2.*?>(.*?)</h2>', '<h3.*?>(.*?)</h3>', '<h4.*?>(.*?)</h4>']
            title = tools.get_info(self._html, heading_regexs, fetch_one=True) or ''

        return tools.del_html_tag(title)

    @staticmethod
    def _block_lengths(paragraph_lengths, window):
        '''
        @summary: Sum the lengths of each run of ``window`` consecutive paragraphs.
        ---------
        @param paragraph_lengths: text length of every paragraph
        @param window: number of consecutive paragraphs per block
        ---------
        @result: one sum per paragraph; blocks near the end are shorter, e.g.
                 [0, 1, 2, 3, 4] with window 3 gives [3, 6, 9, 7, 4]
        '''
        return [sum(paragraph_lengths[pos:pos + window]) for pos in range(len(paragraph_lengths))]

    @staticmethod
    def _expand_bounds(block_lengths, center, threshold):
        '''
        @summary: Grow a [start, end) range around ``center`` while blocks stay dense.
        ---------
        @param block_lengths: block sums as produced by _block_lengths
        @param center: index of the densest block
        @param threshold: a block counts as content only when it is longer than this
        ---------
        @result: (start, end) slice bounds into the paragraph list
        '''
        start = end = center
        # Walk upwards while still inside dense text and inside the list.
        while start > 0 and block_lengths[start] > threshold:
            start -= 1
        # Walk downwards under the same condition.
        while end < len(block_lengths) and block_lengths[end] > threshold:
            end += 1
        return start, end

    def get_content(self):
        '''
        @summary:
            Locate the article body by text density.
            1. Strip the tags from the html and drop whitespace other than
               spaces and newlines (done once in __init__).
            2. Sum the text length of n consecutive paragraphs; this describes
               the text density of a region.
            3. Take the densest region as the initial start and end of the body.
            4. Search upwards until the density is not above the threshold;
               that is the start of the body.
            5. Search downwards the same way for the end of the body.
               The threshold is MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE.
            Filtering index pages and similar noise:
            1. A body usually contains p tags, so the share of text inside
               p tags must exceed MIN_PARAGRAPH_AND_CONTENT_PROPORTION.
            Open issue:
               paginated articles, e.g. http://mini.eastday.com/a/171205202028050-3.html
        ---------
        ---------
        @result: the body (useful tags kept), or '' when no body is recognised.
                 On success the located positions and paragraphs are stored for
                 get_release_time(); the densest position is stored in any case.
        '''
        paragraphs = self._text.split('\n')

        # Text density of every run of MAX_PARAGRAPH_DISTANCE paragraphs.
        paragraph_lengths = [len(self.__del_html_tag(paragraph)) for paragraph in paragraphs]
        paragraph_block_lengths = self._block_lengths(paragraph_lengths, MAX_PARAGRAPH_DISTANCE)

        # The body is assumed to lie around the densest block.
        center_pos = paragraph_block_lengths.index(max(paragraph_block_lengths))
        self._content_center_pos = center_pos

        min_paragraph_block_length = MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE
        content_start_pos, content_end_pos = self._expand_bounds(
            paragraph_block_lengths, center_pos, min_paragraph_block_length)

        # Join the selected paragraphs and tidy up spaces and line breaks.
        content = '\n'.join(paragraphs[content_start_pos:content_end_pos])
        content = self.__del_unnecessary_character(content)

        # Share of the text that sits inside p tags.
        paragraphs_text_len = len(self.__del_html_tag(''.join(tools.get_info(content, '<p.*?>(.*?)</p>'))))
        content_text_len = len(self.__del_html_tag(content))

        if content_text_len and content_text_len > MIN_COUNTENT_WORDS and ((paragraphs_text_len / content_text_len) > MIN_PARAGRAPH_AND_CONTENT_PROPORTION):
            self._content_start_pos = content_start_pos
            self._content_end_pos = content_end_pos
            self._paragraphs = paragraphs
            return content

        return ''

    def get_author(self):
        '''
        @summary: Extract the author.
        ---------
        @result: first match found in, in order: the text with useful tags,
                 the same text with every tag replaced by a space, the raw html
        '''
        # First try with the useful tags still in place.
        author = tools.get_info(self._text, AUTHOR_REGEXS_TEXT, fetch_one=True)
        if not author:
            # Some pages put a tag between the label and the name, so retry
            # with the tags replaced by spaces.
            author = tools.get_info(self.__replace_str(self._text, self._ANY_TAG_REGEX, ' '), AUTHOR_REGEXS_TEXT, fetch_one=True)
        if not author:
            # Still nothing: look at the author markup in the raw html.
            author = tools.get_info(self._html, AUTHOR_REGEX_TAG, fetch_one=True)
        return author

    def get_release_time_old(self):
        '''
        @summary: Older release time lookup, kept for compatibility.
        ---------
        Searches the paragraphs from RELEASE_TIME_OFFSET before the body to
        RELEASE_TIME_OFFSET after it when get_content() has located a body,
        and the whole text otherwise or when that window holds no date.
        ---------
        @result: whatever tools.format_date returns for the match
        '''
        release_time = ''

        if self._paragraphs and self._content_start_pos is not None and self._content_end_pos is not None:
            # Clamp at 0: a negative slice start would wrap around to the end
            # of the paragraph list and select the wrong region.
            window_start = max(self._content_start_pos - RELEASE_TIME_OFFSET, 0)
            window_end = self._content_end_pos + RELEASE_TIME_OFFSET
            window = '\n'.join(self._paragraphs[window_start:window_end])
            release_time = tools.get_info(self.__replace_str(window, self._ANY_TAG_REGEX, '<>'), DAY_TIME_REGEXS, fetch_one=True)

        if not release_time:
            release_time = tools.get_info(self.__replace_str(self._text, self._ANY_TAG_REGEX, '<>'), DAY_TIME_REGEXS, fetch_one=True)

        return tools.format_date(release_time)

    def _release_time_at_or_above(self, paragraph_pos):
        '''
        @summary: Search for a date from paragraph_pos upwards to the first paragraph.
        ---------
        @param paragraph_pos: index to start at, or None when unknown
        ---------
        @result: formatted date of the first paragraph that contains one, else None
        '''
        if not self._paragraphs or paragraph_pos is None:
            return None

        for pos in range(paragraph_pos, -1, -1):
            # Every tag is replaced by the literal '<>' before matching.
            paragraph = self.__replace_str(self._paragraphs[pos], self._ANY_TAG_REGEX, '<>')
            release_time = tools.get_info(paragraph, DAY_TIME_REGEXS, fetch_one=True)
            if release_time:
                return tools.format_date(release_time)
        return None

    def get_release_time(self):
        '''
        @summary: Extract the release time near the article body.
        ---------
        Looks upwards from the start of the body first and, when that finds
        nothing, upwards from the densest paragraph. Requires a previous
        successful get_content() call; without it there are no paragraphs to
        search.
        ---------
        @result: formatted release time, or None when none is found
        '''
        release_time = self._release_time_at_or_above(self._content_start_pos)
        if not release_time:
            release_time = self._release_time_at_or_above(self._content_center_pos)
        return release_time
if __name__ == '__main__':
    # Manual smoke test: download each sample page, run every extractor and
    # print the results. Pages tried earlier are kept below, commented out.
    sample_urls = [
        # 'http://news.cctv.com/2017/11/30/ARTIvCEUIYEZx9HTsTypXySQ171130.shtml',
        # 'http://www.sohu.com/a/208214795_115178',
        # 'http://mini.eastday.com/a/171201210623679.html',
        # 'http://e.gmw.cn/2017-12/04/content_26998661.htm',
        # 'http://www.sohu.com/a/208241102_570245',
        # 'http://news.163.com/17/1201/21/D4JN5JRE0001875P.html', # garbled text problem
        # 'http://mini.eastday.com/a/171204173416401.html',
        # 'http://www.sohu.com/a/208241102_570245',
        # # # # 'http://www.sohu.com/'
        # # # # 'http://www.southcn.com/'
        # # # # 'http://kb.southcn.com/default.htm'
        # 'http://www.sohu.com/a/207186412_104421',
        # 'http://kb.southcn.com/content/2017-12/05/content_179364393.htm',
        # 'http://yn.people.com.cn/n2/2017/1129/c372315-30976586.html', # garbled text
        # 'http://www.sohu.com/a/207186412_104421',
        # 'http://www.sohu.com/a/208209445_603687',
        # 'http://mini.eastday.com/a/171204173416401.html',
        # 'http://yn.people.com.cn/n2/2017/1129/c372315-30976586.html',
        # 'http://news.eastday.com/eastday/13news/auto/news/society/20171206/u7ai7256226.html',
        # 'http://cnews.chinadaily.com.cn/2017-12/06/content_35230092.htm',
        # 'http://e.gmw.cn/2017-12/04/content_26998661.htm',
        # 'http://www.sohu.com/a/208241102_570245',
        # 'http://cnews.chinadaily.com.cn/2017-12/06/content_35230092.htm',
        # 'http://news.eastday.com/eastday/13news/auto/news/society/20171206/u7ai7256226.html',
        # 'http://cj.sina.com.cn/article/detail/6185269244/510492',
        # 'http://0575gwy.com/index.php/Index/show/id/2130',
        # 'http://hdmedicine.com.cn/News_info.aspx?News_Id=787&CateId=24',
        # 'http://www.qz001.gov.cn/info/view/86ec076d71a44869ab71e00e5707f89e',
        # 'http://payh.gov.cn/Art/Art_2/Art_2_795.aspx',
        # 'http://qiushi.nbgxedu.com/show.aspx?id=d479b45a-1747-4f60-83f3-f1e2dc85a0d2',
        # 'http://31ly.com/show/10117/product-13846.html'
        # 'http://www.jawin.com.cn/news/show-247708.html'
        # 'http://pjsl.cn/Item/5845.aspx'
        # 'http://news.sina.com.cn/sf/news/flfg/2017-12-04/doc-ifypikwt7105025.shtml'
        # 'http://cq.people.com.cn/n2/2018/0327/c365403-31387318.html'
        # 'http://www.zjgrrb.com/zjzgol/system/2018/03/28/030796013.shtml',
        # 'http://tech.ifeng.com/a/20180116/44847498_0.shtml'
        # 'http://tech.ifeng.com/a/20171228/44825006_0.shtml'
        # 'http://news.ifeng.com/a/20180514/58297302_0.shtml'
        # 'https://baijiahao.baidu.com/s?id=1601748269934604720'
        # 'http://big5.cntv.cn/gate/big5/news.cctv.com/special/sx/index.shtml'
        'http://finance.youth.cn/finance_cyxfgsxw/201806/t20180611_11641628.htm'
    ]

    separator = '---------------------------'

    for page_url in sample_urls:
        page_html = tools.get_html(page_url)
        extractor = ArticleExtractor(page_url, page_html)

        # The body is extracted first: get_release_time() searches the
        # paragraphs that get_content() records.
        body = extractor.get_content()
        headline = extractor.get_title()
        published = extractor.get_release_time()
        writer = extractor.get_author()

        print(separator)
        print(page_url)
        print('title : ', headline)
        print('release_time: ', published)
        print('author', writer)
        print('content : ', body)
        print(separator)