# 第三节 静态网页内容爬取与解析

## 一、urllib基本应用

### 1.读取并显示网页内容

In [1]:
import urllib.request
fp = urllib.request.urlopen(r'http://www.python.org')
print(fp.read(100))              #读取100个字节
print(fp.read(100).decode())     #使用UTF8进行解码
fp.close()                       #关闭连接

b'<!doctype html>\n<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->\n<!-'
-[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <h


### 2.提交网页参数

#### (1)下面的代码演示了如何使用GET方法读取并显示指定url的内容。

In [2]:
import urllib.request
import urllib.parse
url = "http://tianqihoubao.com/weather/province.aspx?id=340000"
with urllib.request.urlopen(url) as f:
    print(f.read(100))

b'\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xh'


#### (2)下面的代码演示了如何使用POST方法提交参数并读取指定页面内容。

In [3]:
#使用POST方法读取并显示指定url的内容
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
data = data.encode('ascii')
#随着网站改版，可能会出错
urlstr="http://www.csdn.net"
with urllib.request.urlopen(urlstr, data) as f:
    print(f.read().decode('utf-8'))

<!doctype html><html lang="zh" data-server-rendered="true"><head><title>CSDN - 专业开发者社区</title> <meta name="keywords" content="CSDN博客,CSDN学院,CSDN论坛,CSDN直播"> <meta name="description" content="CSDN是全球知名中文IT技术交流平台,创建于1999年,包含原创博客、精品问答、职业培训、技术论坛、资源下载等产品服务,提供原创、优质、完整内容的专业IT技术开发社区."> <meta http-equiv="content-type" content="text/html;charset=utf-8"> <meta name="viewport" content="initial-scale=1, maximum-scale=1, user-scalable=no, minimal-ui"> <meta name="referrer" content="always"> <!----> <!----> <!----> 
        <script src="https://g.csdnimg.cn/tingyun/tingyun.js"></script>
       <!----> <!----> <!----> <link rel="shortcut icon" href="https://g.csdnimg.cn/static/logo/favicon32.ico" type="image/x-icon"> <link rel="canonical" href="https://www.csdn.net"> <!----> 
          <meta name="toolbar" content={"type":"0","fixModel":"1"} />
       
          <meta name="report" content={"spm":"1000.2115"} />
       <script src="https://g.csdnimg.cn/??lib/jquery/1.12.4/jquery.min.js,user-tooltip/2.7

### 3.使用HTTP代理访问页面

In [None]:
import urllib.request
#注意：代理网址需要自行寻找设置，此处只是示例代理网址，运行会出错
proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.request.FancyURLopener(proxies)
with opener.open("http://www.python.org") as f:
    f.read().decode('utf-8')

## 二、BeautifulSoup基本应用

### 2.BeautifulSoup应用

In [4]:
from bs4 import BeautifulSoup
BeautifulSoup('hello world!', 'lxml')      #自动添加标签
BeautifulSoup('<span>hello world!', 'lxml') #自动补全标签
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""                   
soup = BeautifulSoup(html_doc, 'html.parser') #可指定lxml或其他解析器                                   
print(soup.prettify())                        #以优雅的方式显示出来

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



操作soup对象示例代码如下：

In [5]:
soup.title   #访问<title>标签的内容，返回结果：<title>The Dormouse's story</title>
soup.title.name      #查看标签的名字，返回结果：'title'
soup.title.text      #查看标签的文本，返回结果："The Dormouse's story"
soup.title.string    #查看标签的文本，返回结果："The Dormouse's story"
soup.title.parent    #查看上一级标签
#返回结果：<head><title>The Dormouse's story</title></head>
soup.head       #返回结果：<head><title>The Dormouse's story</title></head>
soup.b          #访问<b>标签的内容，返回结果：<b>The Dormouse's story</b>
soup.body.b     #访问<body>中<b>标签的内容，返回结果：<b>The Dormouse's story</b>
soup.name       #把整个BeautifulSoup对象看作标签对象，返回结果：'[document]'
soup.find_all('a')    #查找所有<a>标签，返回结果如下：
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""
soup.find_all(['a', 'b'])        #同时查找<a>和<b>标签，返回结果如下：
"""
[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""
print(soup.get_text())     #返回所有文本，返回结果如下：
"""
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
soup.a['id'] = 'test_link1'  #修改标签属性的值
soup.a  
#返回结果：<a class="sister" href="http://example.com/elsie" id="test_link1">Elsie</a>


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



<a class="sister" href="http://example.com/elsie" id="test_link1">Elsie</a>

遍历子标签示例代码如下：

In [6]:
for child in soup.body.children:     #遍历直接子标签
    print(child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="test_link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




## 三、requests基本应用

### 2.requests基本操作

#### (1)增加头部并设置访问代理

In [7]:
import requests
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)

#### (2)访问网页并提交数据

In [8]:
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=payload)
print(r.text)        #查看网页信息，略去输出结果
url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
r = requests.post(url, json=payload)
print(r.text)        #查看网页信息，略去输出结果
print(r.headers)     #查看头部信息，略去输出结果
print(r.headers['Content-Type'])
print(r.headers['Content-Encoding'])

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Content-Length": "23", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.31.0", 
    "X-Amzn-Trace-Id": "Root=1-662116b7-158b07aa78a99de46b1ac4da"
  }, 
  "json": null, 
  "origin": "124.207.151.79", 
  "url": "http://httpbin.org/post"
}

{"message":"Not Found","documentation_url":"https://docs.github.com/rest"}
{'Server': 'GitHub.com', 'Date': 'Thu, 18 Apr 2024 12:48:57 GMT', 'Content-Type': 'application/json; charset=utf-8', 'X-GitHub-Media-Type': 'github.v3; format=json', 'x-github-api-version-selected': '2022-11-28', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-

#### (3)获取和设置cookies

下面的代码演示了使用get()方法获取网页信息时cookies属性的用法：

In [9]:
r = requests.get("http://www.baidu.com/")
r.cookies

<RequestsCookieJar[Cookie(version=0, name='BDORZ', value='27315', port=None, port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1713530941, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False)]>

下面的代码演示了使用get()方法获取网页信息时设置cookies参数的用法：

In [10]:
url = 'http://httpbin.org/cookies'
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)  #设置cookies
print(r.text)

{
  "cookies": {
    "cookies_are": "working"
  }
}



### 3.网页JSON数据爬取与解析

下面我们使用request库来获取国际空间站ISS的当前位置。

In [11]:
import requests 
# 通过opennotify api请求获取国际空间站ISS的最新位置.
response = requests.get("http://api.open-notify.org/iss-now.json")
response    #显示结果：<Response [200]>

<Response [200]>

In [12]:
response.content

b'{"message": "success", "iss_position": {"latitude": "21.6288", "longitude": "-129.9335"}, "timestamp": 1713444552}'

In [13]:
response.headers['content-type']

'application/json'

In [14]:
response_j = response.content.decode("utf-8")
print(response_j)

{"message": "success", "iss_position": {"latitude": "21.6288", "longitude": "-129.9335"}, "timestamp": 1713444552}


上述是JSON读取内容，看起来像个字典，有key-value对，可以使用json 库将JSON转换为对象：

In [15]:
import json
response_d = json.loads(response_j)
print(type(response_d))
print(response_d)
response_d["iss_position"]
#或者使用pandas模块直接读入,pandas也可以读入json
import pandas as pd 

df = pd.read_json(response_j)
df

<class 'dict'>
{'message': 'success', 'iss_position': {'latitude': '21.6288', 'longitude': '-129.9335'}, 'timestamp': 1713444552}


  df = pd.read_json(response_j)


Unnamed: 0,message,iss_position,timestamp
latitude,success,21.6288,2024-04-18 12:49:12
longitude,success,-129.9335,2024-04-18 12:49:12


## 四、Pandas读取Table标签内容

对于HTML的Table标签组织的表格数据，还可以是使用Pandas的read_html函数直接读取，装载为DataFrame对象。下面示例程序展示了如何读取新浪股票网页中的表格数据。

运行结果如下：

In [16]:
import pandas as pd
# 由于网站更新，链接可能无效
pd.set_option('display.width', None)
url='https://finance.sina.com.cn/stock/'
df = pd.read_html(url)[6]  #返回值为DataFram数组，我们取第7个
df.head()

Unnamed: 0,股票名称,申购代码,日期,申购价格
0,欧莱新材,787530,04-25,--
1,宏鑫科技,301539,04-01,10.64
2,灿芯股份,787691,03-29,19.86
3,无锡鼎邦,889900,03-27,6.20
4,中瑞股份,301587,03-25,21.73


## 五、正则表达式与网页内容解析

### 2.正则表达式模块re

正则表达式解析示例：

In [17]:
import re
example = 'Beautiful is better than ugly.'
re.findall('\\b\w.+?\\b', example)           #所有单词

['Beautiful', 'is', 'better', 'than', 'ugly']

In [18]:
re.findall('\w+', example)                   #所有单词

['Beautiful', 'is', 'better', 'than', 'ugly']

In [19]:
re.findall(r'\b\w.+?\b', example)            #使用原始字符串

['Beautiful', 'is', 'better', 'than', 'ugly']

In [20]:
re.split('\s', example)               #使用任何空白字符分隔字符串

['Beautiful', 'is', 'better', 'than', 'ugly.']

In [21]:
re.findall('\d+\.\d+\.\d+', 'Python 2.7.13') #查找x.x.x形式的数字

['2.7.13']

In [22]:
re.findall('\d+\.\d+\.\d+', 'Python 2.7.13,Python 3.6.0')

['2.7.13', '3.6.0']

In [23]:
s = '<html><head>This is head.</head><body>This is body.</body></html>'

In [24]:
pattern = r'<html><head>(.+)</head><body>(.+)</body></html>'

In [25]:
result = re.search(pattern, s)

In [26]:
result.group(1)                              #第一个子模式

'This is head.'

In [27]:
result.group(2)                              #第二个子模式

'This is body.'

### 3.match对象

match对象应用示例：

In [28]:
m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds")
m.group('first_name')      #使用命名的子模式

'Malcolm'

In [29]:
m.group('last_name')

'Reynolds'

In [30]:
m = re.match(r"(\d+)\.(\d+)", "24.1632")
m.groups()                 #返回所有匹配的子模式（不包括第0个）

('24', '1632')

In [31]:
m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds")
m.groupdict()              #以字典形式返回匹配的结果

{'first_name': 'Malcolm', 'last_name': 'Reynolds'}

# 第四节 动态网页内容爬取

## 二、Selenium基本应用

### 3.Seleniums实践

下面是Selenium动态网页内容爬取指定城市当前天气的示例：

In [32]:
import re
from selenium import webdriver
import time
driver = webdriver.Edge()		#指定引擎，或者Chrome()等
city = input('请输入要查询的城市：').lower()
#获取指定URL的信息，并进行渲染
driver.get(r'http://openweathermap.org/find?q={0}'.format(city))
time.sleep(2)
#网页内容渲染结束之后获取网页源代码，并转换成小写
content = driver.page_source.lower()
matchResult = re.search(r'<a href="(.+?)">\s+'+city+'.+?]', content)
if matchResult:
    print(matchResult.group(0))
else:
    print('查不到，请检查城市名字。')
driver.close()  # 关闭当前页面，如果只有一个页面，会关闭浏览器
driver.quit() 	# 关闭浏览器

请输入要查询的城市：Beijing
<a href="/city/1816670"> beijing, cn</a></b> <img src="http://openweathermap.org/images/flags/cn.png"><b><i> broken clouds</i></b><p><span class="badge badge-info">21.9°с </span> temperature from 21.9 to 21.9 °с, wind 4.3 m/s. clouds 56 %, 1009 hpa</p><p>geo coords <a href="/weathermap?zoom=12&amp;lat=39.9075&amp;lon=116.3972">[39.9075, 116.3972]


### 4.Selenium结合浏览器驱动的应用

Selenium结合headless形式Chrome浏览器驱动的爬取网页示例：

In [35]:
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options) 
city = input('请输入要查询的城市：').lower()
#获取指定URL的信息，并进行渲染
driver.get(r'http://openweathermap.org/find?q={0}'.format(city))
#网页内容渲染结束之后获取网页源代码，并转换成小写
content = driver.page_source.lower()
matchResult = re.search(r'<a href="(.+?)">\s+'+city+'.+?]', content)
#后面代码与前述示例代码相同，略
if matchResult:
    print(matchResult.group(0))
else:
    print('查不到，请检查城市名字。')
# 获取当前url
print(driver.current_url)
# 关闭浏览器
driver.quit()

请输入要查询的城市：Shanghai
查不到，请检查城市名字。
https://openweathermap.org/find?q=shanghai


## 三、网站操作模拟

### 1.网站模拟登陆

通过driver.find_element_by_name（Selenium 4.0之后版本对应方法是find_element函数）方法找到对应元素，然后调用send_keys()方法模拟手工输入，或者执行click()方法模拟鼠标点击。

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options) 
#注意：selenium最新版本已不支持PhantomJS的webdriver
#driver = webdriver.PhantomJS()
driver.get("http://www.douban.com")
# 输入账号密码
# 注意：selenium 4.0版本之前的定位元素方法类似于“find_element_by_xx”，但4.0及之后版本:使用find_element函数配合By.ID、By.NAME等参数
#driver.find_element_by_name("form_email").send_keys("xxxxx@xxxx.com")
#driver.find_element_by_name("form_password").send_keys("xxxxxxxx")
driver.find_element(By.ID,"username").send_keys("xxxxx@xxxx.com")
driver.find_element(By.ID,"password").send_keys("xxxxxxxx")
# 模拟点击登录
#driver.find_element_by_xpath("//input[@class='bn-submit']").click()
driver.find_element(By.XPATH,"//input[@class='bn-submit']").click()
# 等待3秒
time.sleep(3)
# 生成登陆后快照
driver.save_screenshot("douban.png")
with open("douban.html", "w") as file:
    file.write(driver.page_source)
driver.quit()

In [None]:
#百度文心一言生成代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
 
# 确保设置了ChromeDriver的路径
#driver_path = 'path/to/chromedriver'
 
# 初始化WebDriver
#driver = webdriver.Chrome(executable_path=driver_path)
driver = webdriver.Chrome()
 
# 打开豆瓣网首页
driver.get('https://www.douban.com/')
 
# 找到登录按钮并点击
login_button = driver.find_element(By.LINK_TEXT, '登录')
login_button.click()
 
# 输入用户名和密码
username_input = driver.find_element(By.ID, 'username')
password_input = driver.find_element(By.ID, 'password')
username_input.send_keys('your_username')
password_input.send_keys('your_password')
 
# 提交登录信息
submit_button = driver.find_element(By.CLASS_NAME, 'bn-submit')
submit_button.click()
 
# 确保登录成功
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.LINK_TEXT, '账户名')))
 
# 关闭浏览器
driver.quit()

### 2.执行 JavaScript 语句

通过driver的 execute()方法执行JavaScript语句。

In [None]:
from selenium import webdriver
driver = webdriver.PhantomJS()
driver.get("https://www.baidu.com/")
# 给搜索输入框标红的javascript脚本
js = "var q=document.getElementById(\"kw\");q.style.border=\"2px solid red\";"
# 调用给搜索输入框标红js脚本
driver.execute_script(js)
#查看页面快照
driver.save_screenshot("redbaidu.png")
#js隐藏元素，将获取的图片元素隐藏
img = driver.find_element_by_xpath("//*[@id='lg']/img")
driver.execute_script('$(arguments[0]).fadeOut()',img)
# 向下滚动到页面底部
driver.execute_script("$('.scroll_top').click(function(){$('html,body').animate({scrollTop: '0px'}, 800);});")
#查看页面快照
driver.save_screenshot("nullbaidu.png")
driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options) 
 
# 配置webdriver路径，根据实际情况修改
url = 'https://movie.douban.com/typerank?type_name=喜剧&type=13&interval_id=100:90&action='
# 初始化webdriver
driver.get(url)
 
# 等待页面加载完成
wait = WebDriverWait(driver, 40)
#wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'paginator')))
 
# 滚动到页面底部加载更多数据
for i in range(5):  # 假设只需加载5次
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # 等待加载，可能需要根据实际网络情况调整
 
# 提取数据
movies = driver.find_elements(By.NAME,'item')
for movie in movies:
    rank = movie.find_element_by_css_selector('.rank').text
    name = movie.find_element_by_css_selector('.title').text
    score = movie.find_element_by_css_selector('.rating_num').text
    print(f'排名: {rank}, 名称: {name}, 评分: {score}')
 
# 关闭浏览器
driver.quit()