In [1]:
import sys
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup

In [2]:
def get_index_pages(timeout=10):
    """
    Collect the topic index pages linked from the news.cn navigation bar.

    The home page is downloaded and the anchors found under the two
    navigation containers (addressed by absolute XPath) are inspected.
    Anchors labelled 视频, 图片 or 高层 are left out.

    Args:
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the notebook indefinitely.

    Returns:
        list[str]: The ``href`` values of the remaining anchors, in page order.
    """
    r = requests.get('http://www.news.cn/', timeout=timeout)
    tree = etree.HTML(r.text)

    excluded_labels = {'视频', '图片', '高层'}
    nav_xpaths = (
        '/html/body/div[1]/div/div[2]/div/div[3]/div[1]/a',
        '/html/body/div[1]/div/div[2]/div/div[3]/div[2]/a',
    )

    index_pages = []
    for nav_xpath in nav_xpaths:
        for anchor in tree.xpath(nav_xpath):
            # .text is None when the anchor has no leading text; surrounding
            # whitespace must not defeat the label filter.
            label = (anchor.text or '').strip()
            href = anchor.get('href')
            # An anchor without href would put None into the result and make
            # the later page fetch fail, so it is skipped here.
            if href is None or label in excluded_labels:
                continue
            index_pages.append(href)

    return index_pages

In [3]:
def get_text(url, timeout=10):
    """
    Download one article page and return its title and body as one string.

    The result has the form ``<title>!!!!<body>``: the stripped text of the
    page's ``<title>``, the literal marker ``!!!!``, then the stripped text of
    every ``<p>`` inside ``<div id="detail">`` concatenated without separators.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.

    Returns:
        The combined string, or ``None`` when the page cannot be fetched or
        parsed (a failure notice is printed in that case).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.encoding = r.apparent_encoding

        soup = BeautifulSoup(r.text, 'html.parser')
        # Title first, then the marker that separates it from the body.
        parts = [soup.title.text.strip(), '!!!!']
        # Gather the paragraphs of every div whose id is "detail".
        for detail in soup.find_all('div', {'id': ['detail']}):
            parts.extend(p.text.strip() for p in detail.find_all('p'))
        return ''.join(parts)
    except Exception:
        # Exception (not a bare except) keeps the best-effort behaviour for
        # network/parse errors while letting KeyboardInterrupt stop the crawl.
        print("爬取失败")
        return None

In [4]:
def get_all_url(url, timeout=10):
    """
    Return the distinct link targets found on a page.

    The page is downloaded, parsed with BeautifulSoup, and the ``href`` of
    every ``<a>`` element is collected after stripping surrounding whitespace.

    Args:
        url: Address of the seed page to scan.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.

    Returns:
        list[str]: De-duplicated href values in no particular order. An empty
        list is returned (and a failure notice printed) when the page cannot
        be fetched or parsed, so callers can always iterate the result.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')

        hrefs = set()
        for tag in soup.find_all('a'):
            href = tag.get('href')
            # Anchors without an href used to be recorded as the text 'None'.
            if href is not None:
                hrefs.add(str(href).strip())
        return list(hrefs)
    except Exception:
        # Exception (not a bare except) keeps the best-effort behaviour for
        # network/parse errors while letting KeyboardInterrupt stop the crawl.
        print("爬取失败")
        return []

In [5]:
def clean_urls(urls):
    """
    Keep only the www.news.cn article links from a list of hrefs.

    A URL is kept when it contains ``www.news.cn`` and ends in ``.htm`` or
    ``.html``. Duplicates are dropped.

    Args:
        urls: Iterable of href strings.

    Returns:
        list[str]: The matching URLs, each once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # crawl order (and the file written from it) is the same on every run;
    # iterating a set of strings is not.
    return [
        url
        for url in dict.fromkeys(urls)
        if 'www.news.cn' in url and url.endswith(('.htm', '.html'))
    ]

In [6]:
def clean_text(text):
    """
    Normalise scraped article text, or reject it.

    Returns None when the text is shorter than 30 characters or ends with the
    ``!!!!`` marker; otherwise returns the text with every run of whitespace
    (spaces, newlines, tabs, ...) removed.
    """
    if len(text) < 30:
        return None
    if text[-4:] == '!!!!':
        return None
    return re.sub(r'\s+', '', text)

In [7]:
# Fetch the navigation-bar topic pages and display the resulting list of URLs.
index_pages = get_index_pages()
index_pages

['http://www.news.cn/politics/xxjxs/index.htm',
 'http://www.news.cn/politics/leaders/index.htm',
 'http://www.news.cn/politics/index.html',
 'http://www.news.cn/politics/xhrs/index.html',
 'http://www.news.cn/world/index.html',
 'http://www.news.cn/fortune/index.htm',
 'http://www.news.cn/comments/index.html',
 'http://www.news.cn/gangao/index.html',
 'http://www.news.cn/tw/index.html',
 'http://www.news.cn/sikepro/index.html',
 'http://www.news.cn/world/globalink/index.html',
 'http://education.news.cn/index.htm',
 'http://www.news.cn/tech/index.html',
 'http://www.news.cn/science/index.htm',
 'http://sports.news.cn/index.htm',
 'http://www.news.cn/culture/',
 'http://www.news.cn/health/index.html',
 'http://www.news.cn/milpro/index.htm',
 'http://www.news.cn/talking/index.html',
 'http://www.news.cn/politics/zywj/index.htm',
 'http://www.news.cn/money/index.html',
 'http://www.news.cn/auto/index.html',
 'http://www.news.cn/food/index.html',
 'http://www.news.cn/house/index.html',
 '

In [8]:
# Gather every link found on the topic index pages, then keep only the
# article URLs accepted by clean_urls and show how many remain.
urls = []
for page in index_pages:
    # A failed fetch may yield no list (None); fall back to an empty list so
    # one bad page does not abort the whole collection with a TypeError.
    urls.extend(get_all_url(page) or [])
valid_urls = clean_urls(urls)
len(valid_urls)

1047

In [9]:
# Fetch every article and write one "<url> <cleaned text>" line per usable page.
# The with-block closes foo.txt even if a fetch or write raises part-way through.
with open("foo.txt", "w", encoding="UTF-8") as f:
    for i in valid_urls:
        text = get_text(i)
        # get_text gives None on failure; clean_text gives None for pages that
        # are too short or have no body after the title marker.
        text = clean_text(text) if text else None
        if text:
            f.write(f"{i} {text}\n")

爬取失败


In [13]:
# Read links.txt in full; the file only needs to stay open for the read itself.
with open('links.txt', 'r') as f:
    links_str = f.read()

# One entry per newline-separated line of the file.
links = links_str.split('\n')

In [14]:
# Display the list of links read from links.txt.
links

['https://www.globaltimes.cn/page/202310/1299397.shtml',
 'https://www.globaltimes.cn/page/202310/1299377.shtml',
 'https://www.globaltimes.cn/page/202310/1299376.shtml',
 'https://www.globaltimes.cn/page/202310/1299295.shtml',
 'https://www.globaltimes.cn/page/202310/1299285.shtml',
 'https://www.globaltimes.cn/page/202310/1299180.shtml',
 'https://www.globaltimes.cn/page/202310/1299162.shtml',
 'https://www.globaltimes.cn/page/202309/1299152.shtml',
 'https://www.globaltimes.cn/page/202309/1299144.shtml',
 'https://www.globaltimes.cn/page/202309/1299062.shtml',
 'https://www.globaltimes.cn/page/202309/1299061.shtml',
 'https://www.globaltimes.cn/page/202309/1299002.shtml',
 'https://www.globaltimes.cn/page/202309/1299001.shtml',
 'https://www.globaltimes.cn/page/202309/1298977.shtml',
 'https://www.globaltimes.cn/page/202309/1298939.shtml',
 'https://www.globaltimes.cn/page/202309/1298897.shtml',
 'https://www.globaltimes.cn/page/202309/1298867.shtml',
 'https://www.globaltimes.cn/pa