In [None]:
# Multi-result output: have IPython display the value of every top-level
# expression in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# PyQuery的使用
`PyQuery` 的CSS选择器功能比 `BeautifulSoup` 强大

## 1. 初始化
像 `BeautifulSoup` 一样，`PyQuery` 初始化的时候也需要传入 `HTML` 数据源来初始化一个操作对象，它的初始化方式有多种，比如直接传入字符串，传入 `URL`，传文件名
### 1.1 字符串初始化

In [13]:
# PyQuery is imported under its conventional short alias `pq`.
from pyquery import PyQuery as pq

# Sample markup: a #container div wrapping ul.list with five <li> entries.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''

# Build the document straight from the string, then select every <li> by tag name.
doc = pq(html)
li_nodes = doc('li')
print(li_nodes)

<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     


### 1.2 URL初始化
指定参数为 `url` 即可

In [8]:
from pyquery import PyQuery as pq

# Initialise the document from a URL (keyword `url`) with an explicit utf-8 encoding,
# then print the page's <title> element.
doc = pq(
    url="http://www.baidu.com",
    encoding='utf-8',
)
page_title = doc('title')
print(page_title)

<title>百度一下，你就知道</title>


### 1.3 文件初始化
指定参数为 `filename` 即可

In [11]:
# File-based initialisation example (keyword `filename`), left disabled in this notebook
# — presumably because ./temp/maoyan.html is not available; TODO confirm before enabling.
# doc = pq(filename='./temp/maoyan.html', encoding='gbk')
# print(doc('title'))

## 2. 基本CSS选择器

In [16]:
doc = pq(html)
# Descendant selector: <li> elements under .list inside #container.
matched = doc('#container .list li')
# Show the type of the selection first, then its serialised markup.
print(type(matched), matched, sep='\n')

<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     


## 3. 查找节点
### 3.1 子节点
- `find()` 所有子孙节点
- `children()` 子节点

In [22]:
# Pick the .list container, then search its subtree for <li> elements with find().
items = doc('.list')
lis = items.find('li')
# For each selection, print its type followed by its markup.
print(type(items), items, sep='\n')
print(type(lis), lis, sep='\n')

<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     


In [24]:
# children() without an argument: all child nodes of the .list element.
lis = items.children()
print(type(lis), lis, sep='\n')
# children(selector): only the children that match `.active`.
lis = items.children('.active')
print(type(lis), lis, sep='\n')

<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         


### 3.2 父节点
- `parent()`  获取父节点
- `parents()`  获取所有祖先节点（父节点、祖父节点……），可传入选择器进行筛选

### 3.3 兄弟节点
- `siblings()`  获取所有兄弟节点，可传入选择器进行筛选

## 4. 遍历
- 单节点可直接输出
- 多个节点需要接 `items()` 方法

In [30]:
doc = pq(html)
# items() is what makes a multi-node selection iterable node by node.
li_selection = doc('li')
lis = li_selection.items()
print('type(lis): ', type(lis))
# Print each yielded node together with its type.
for li in lis:
    node_type = type(li)
    print(li, node_type)

type(lis):  <class 'generator'>
<li class="item-0">first item</li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
      <class 'pyquery.pyquery.PyQuery'>


## 5. 获取信息
### 5.1 获取属性
- `attr('href')`
- `attr.href`

In [34]:
doc = pq(html)
# Compound class selector `.item-0.active`, then the <a> inside the match.
a = doc('.item-0.active a')
print(a, type(a))
# Two spellings for reading the href attribute: method call and attribute access.
print(a.attr('href'), a.attr.href, sep='\n')

<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
link3.html
link3.html


#### 注意: `attr()` 只会返回第一个节点的属性值，匹配到多个节点时需要用 `items()` 遍历

In [36]:
a = doc('a')
# Calling attr() directly on the multi-node selection.
print(a.attr('href'))
# Walking the nodes one at a time via items() to read each href.
for item in a.items():
    href = item.attr('href')
    print(href)

link2.html
link2.html
link3.html
link4.html
link5.html


### 5.2 获取文本
- `text()`
- `html()`

In [40]:
doc = pq(html)
a = doc('.item-0.active a')
# One line each, in order: the node itself, its text(), and its html().
print(a, a.text(), a.html(), sep='\n')

<a href="link3.html"><span class="bold">third item</span></a>
third item
<span class="bold">third item</span>


#### 注意: `text()` 会返回所有节点的文本（以空格拼接成一个字符串），`html()` 只会返回第一个节点内部的 HTML 文本

In [47]:
a = doc('a')
# Evaluate text() once and reuse it for both the value and its type.
joined_text = a.text()
print('a: ', a, end='\n\n')
print('a.text(): ', joined_text, '\ntype(a.text()): ', type(joined_text), end='\n\n')
print('a.html(): ', a.html())

a:  <a href="link2.html">second item</a><a href="link3.html"><span class="bold">third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a>

a.text():  second item third item fourth item fifth item 
type(a.text()):  <class 'str'>

a.html():  second item


## 6. 节点操作
- 节点操作
  - `addClass()` 为节点添加 class
  - `removeClass()` 移除节点的 class
- 节点信息操作
  - `attr('name', 'attr changed')`    修改 `name` 属性为 `'attr changed'`
  - `text('text changed')`    修改 `text` 为 `text changed`
  - `html('<span>html changed</span>')`    修改节点内部的 `html` 为 `<span>html changed</span>`

## 7. 伪类选择器
支持顺序结构选择比如: 第n个，倒数第n个，奇数节点...  
详细见CSS选择器参考手册: http://www.w3school.com.cn/cssref/css_selectors.asp