#  - httpx支持http2.0

In [2]:
import httpx

In [3]:
# Issue a plain GET through the module-level helper, then show the status
# code, the response headers and the raw body, one per line.
response = httpx.get('https://www.httpbin.org/get')
print(response.status_code, response.headers, response.text, sep='\n')

200
Headers({'date': 'Sun, 19 Nov 2023 12:34:45 GMT', 'content-type': 'application/json', 'content-length': '316', 'connection': 'keep-alive', 'server': 'gunicorn/19.9.0', 'access-control-allow-origin': '*', 'access-control-allow-credentials': 'true'})
{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Host": "www.httpbin.org", 
    "User-Agent": "python-httpx/0.25.1", 
    "X-Amzn-Trace-Id": "Root=1-655a00e5-48f25cd238cb89540f17c701"
  }, 
  "origin": "54.254.235.25", 
  "url": "https://www.httpbin.org/get"
}



In [10]:
url = 'https://www.httpbin.org/get'
headers = {'User-Agent': 'my-app/0.0.1'}

# Headers passed to the Client constructor are used for the request made
# through it; httpbin echoes the request headers back in its JSON body,
# which lets the cell confirm the custom User-Agent was sent.
with httpx.Client(headers=headers) as client:
    response = client.get(url)
    echoed_headers = response.json()['headers']
    print(echoed_headers['User-Agent'])

my-app/0.0.1


In [8]:
# Manual equivalent of the `with httpx.Client() as client:` form:
# the client is created up front and closed explicitly.
client = httpx.Client()

try:
    response = client.get('https://www.httpbin.org/get')
finally:
    # Runs whether or not the request raised, so the client is always closed.
    client.close()

# - xpath 实例引入

In [16]:
from lxml import etree
# Sample markup used as parser input. Note that the last <li> has no
# closing </li> tag.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
# Build an element tree from the HTML string.
html = etree.HTML(text)
# Serialize the tree back out; the disabled line below would decode and
# print the serialized markup.
result = etree.tostring(html)
# print(result.decode('utf-8'))

In [19]:
# Parse an HTML file from disk with an explicit HTMLParser. The relative
# path means ./test.html must already exist in the working directory.
html = etree.parse('./test.html', etree.HTMLParser())
# Serialize the parsed tree as HTML; the result is decoded from UTF-8
# before printing.
result = etree.tostring(html, method='html')
print(result.decode('utf-8'))

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>
  <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link3.html">third item</a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div></body></html>


## 所有节点

In [21]:
# `//*` matches every element in the document, at any depth.
result = html.xpath('//*')
result

[<Element html at 0x15bf047c0>,
 <Element body at 0x15d05a9c0>,
 <Element div at 0x15be6e1c0>,
 <Element ul at 0x15be65900>,
 <Element li at 0x15d07ee40>,
 <Element a at 0x15d07db80>,
 <Element li at 0x15d07e980>,
 <Element a at 0x15d07f100>,
 <Element li at 0x15d07d5c0>,
 <Element a at 0x15d07d640>,
 <Element li at 0x15d085900>,
 <Element a at 0x15d087d00>,
 <Element li at 0x15d087580>,
 <Element a at 0x15d084f00>]

In [32]:
# `//li` matches every <li> element, wherever it sits in the tree.
result = html.xpath('//li')
result

[<Element li at 0x15d07ee40>,
 <Element li at 0x15d07e980>,
 <Element li at 0x15d07d5c0>,
 <Element li at 0x15d085900>,
 <Element li at 0x15d087580>]

In [29]:
# Child nodes: a single `/` steps to direct children, so this reads the
# text of each <a> that sits directly under an <li>.
result = html.xpath('//li/a/text()')
result

['first item', 'second item', 'third item', 'fourth item', 'fifth item']

In [35]:
# Parent node: start at the <a> whose href is link4.html, step up to its
# parent, and read the parent's class attribute.
# Alternative spelling using the `..` shorthand for the parent step:
# result = html.xpath('//a[@href="link4.html"]/../@class')
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
result

['item-1']

In [37]:
# Attribute matching: the predicate keeps only the <li> elements whose
# class attribute equals "item-0".
result = html.xpath('//li[@class="item-0"]')
result

[<Element li at 0x15d07ee40>, <Element li at 0x15d087580>]

In [41]:
# Text extraction, two ways: `/a/text()` reads the text of the <a> directly
# under each matching <li>, while `//text()` collects every text node
# anywhere beneath the matching <li>.
result = html.xpath('//li[@class="item-0"]/a/text()')
result1 = html.xpath('//li[@class="item-0"]//text()')
print(result, result1, sep='\n')

['first item', 'fifth item']
['first item', 'fifth item']


注意，此处和属性匹配的方法不同，属性匹配是中括号加属性名和值来限定某个属性，
如 [@href="link1.html"]，

而此处的 @href 指的是获取节点的某个属性，二者需要做好区分。

In [42]:
# A trailing `@href` step returns the attribute values themselves (the href
# of each <a> under an <li>), unlike `[@href="..."]`, which only filters.
result = html.xpath('//li/a/@href')
result

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']

In [43]:
# 属性多值匹配

In [46]:
# The class attribute below carries two space-separated values
# ("li" and "li-first").
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
# Rebinds `html` to a tree built from this one-line snippet.
html = etree.HTML(text)
# Exact-match variant, left disabled for comparison:
# result = html.xpath('//li[@class="li"]/a/text()')
# contains() checks whether the attribute string contains "li", so the
# element matches even though the attribute holds a second value.
result = html.xpath('//li[contains(@class, "li")]/a/text()')
result

['first item']

In [47]:
# 多属性匹配

In [49]:
# The <li> below has both a two-valued class attribute and a name attribute.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
# Rebinds `html` to a tree built from this one-line snippet.
html = etree.HTML(text)
# Conditions inside one predicate are combined with `and`: the class must
# contain "li" and the name must equal "item".
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
result

['first item']

In [50]:
# 按序选择

In [59]:
# Positional predicates: pick <li> elements by their position among siblings.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)

# Each expression is evaluated and printed in turn; `result` ends up holding
# the outcome of the last one.
ordered_queries = (
    '//li[1]/a/text()',             # first <li> (positions start at 1)
    '//li[last()]/a/text()',        # last <li>
    '//li[position()<3]/a/text()',  # the first two <li> elements
    '//li[last()-2]/a/text()',      # third <li> counting from the end
)
for query in ordered_queries:
    result = html.xpath(query)
    print(result)

['first item']
['fifth item']
['first item', 'second item']
['third item']


In [None]:
# Node axis selection.
# Reference notes on the XPath axes, held in a bare string literal:
#   ancestor          - ancestors of the current node
#   attribute         - attribute values of the current node
#   child             - direct child nodes
#   descendant        - child nodes and their descendants
#   following         - all nodes after the current node
#   following-sibling - all sibling nodes after the current node
# The axis names were previously misspelled as "follwing"; the real axes,
# as used in actual queries, are `following` and `following-sibling`.
'''

ancestor:   祖先
attribute:  属性值
child:      子节点
descendant: 子孙节点
following:  当前节点之后的所有节点
following-sibling: 获取当前节点之后的所有同级节点

'''

In [78]:
# Axis demo: run one query per XPath axis against the first <li> and print
# each result, with a dashed rule between consecutive results.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)

axis_queries = (
    '//li[1]/ancestor::*',                   # every ancestor element
    '//li[1]/ancestor::div',                 # only the <div> ancestor
    '//li[1]/attribute::*',                  # all attribute values
    '//li[1]/child::a[@href="link1.html"]',  # direct <a> child with that href
    '//li[1]/descendant::span',              # <span> anywhere below
    '//li[1]/following::*',                  # every element after it
    '//li[1]/following-sibling::*',          # later siblings only
)
for index, query in enumerate(axis_queries):
    # The rule goes between results, so none is printed before the first.
    if index:
        print('-------------')
    result = html.xpath(query)
    print(result)

[<Element html at 0x15d07c580>, <Element body at 0x15ce5ef40>, <Element div at 0x15d2c78c0>, <Element ul at 0x15ce64680>]
-------------
[<Element div at 0x15d2c78c0>]
-------------
['item-0']
-------------
[<Element a at 0x15b3dff80>]
-------------
[<Element span at 0x15d296840>]
-------------
[<Element li at 0x15b3dff80>, <Element a at 0x15d28b4c0>, <Element li at 0x15b620840>, <Element a at 0x15ce64680>, <Element li at 0x15ce66100>, <Element a at 0x15ce66480>, <Element li at 0x15ce665c0>, <Element a at 0x15ce64300>]
-------------
[<Element li at 0x15b3dff80>, <Element li at 0x15b620840>, <Element li at 0x15ce66100>, <Element li at 0x15ce665c0>]


In [79]:
from lxml import etree

# Sample markup; the first <li> wraps its link text in a <span>, and the
# last <li> has no closing </li> tag.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''

# Parse the HTML text with lxml.
html = etree.HTML(text)

# Select every element that comes after the first <li class="item-0">
# (the `following` axis with `*` yields elements only).
result = html.xpath('//li[@class="item-0"][1]/following::*')

# Print each matched element as pretty-printed markup (a str, because of
# encoding='unicode').
for elem in result:
    print(etree.tostring(elem, pretty_print=True, encoding='unicode'))


<li class="item-1">
  <a href="link2.html">second item</a>
</li>
         

<a href="link2.html">second item</a>

<li class="item-inactive">
  <a href="link3.html">third item</a>
</li>
         

<a href="link3.html">third item</a>

<li class="item-1">
  <a href="link4.html">fourth item</a>
</li>
         

<a href="link4.html">fourth item</a>

<li class="item-0"><a href="link5.html">fifth item</a>
     </li>

<a href="link5.html">fifth item</a>
     

