# 数据采集（四）：HTML解析利器BeautifulSoup实战

####  1. BeautifulSoup的安装

##### pip install lxml
##### pip install beautifulsoup4

####  2. BeautifulSoup基本用法

In [7]:
from bs4 import BeautifulSoup 

In [8]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [10]:
soup = BeautifulSoup(html_doc,"lxml")

In [13]:
soup.head

<head><title>The Dormouse's story</title></head>

In [11]:
soup.title

<title>The Dormouse's story</title>

In [12]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [15]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [14]:
soup.p.get_text()

"The Dormouse's story"

In [16]:
soup.a['href']

'http://example.com/elsie'

In [17]:
soup.a['class']

['sister']

In [18]:
soup.head.contents

[<title>The Dormouse's story</title>]

In [19]:
soup.find_all('title', limit=1)

[<title>The Dormouse's story</title>]

In [20]:
soup.find('title')

<title>The Dormouse's story</title>

In [21]:
soup.find_all('title')

[<title>The Dormouse's story</title>]

In [22]:
soup.find_all('p','title')

[<p class="title"><b>The Dormouse's story</b></p>]

In [23]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [24]:
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [25]:
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

#### css选择器部分

In [26]:
soup.select("title")

[<title>The Dormouse's story</title>]

In [28]:
soup.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [29]:
soup.select("html head title")

[<title>The Dormouse's story</title>]

In [31]:
soup.select("p")

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [32]:
soup.select("head > title")

[<title>The Dormouse's story</title>]

In [33]:
soup.select("p > a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [34]:
soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [35]:
soup.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [36]:
soup.select('a[href]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

####  3. BeautifulSoup实战

In [220]:
#导入库
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd


# 设置列表页面URL的固定部分
BASE_URL_U1 = "http://www.cyzone.cn/event/list-0-"
BASE_URL_U2 = "-0-0-0-0/0"

#最好在http请求中设置一个头部信息，否则很容易被封ip
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept':'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding':'gzip',
'Connection':'close',
'Referer':'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&amp;amp;wd=&amp;amp;eqid=c3435a7d00146bd600000003582bfd1f'
}

#存储页面html内容
html=''
# 循环抓取列表页信息 ，分页
for i in range(1,11):
    i=str(i)
    var_url=(BASE_URL_U1 + i + BASE_URL_U2)
    r=requests.get(url=var_url,headers=headers)
    html2= r.content.decode()
    html = html + html2
    # 每次间隔1秒
    time.sleep(1)
    # 解析抓取的页面内容

In [221]:
res = BeautifulSoup(html, 'html.parser')
# 获取感兴趣目标信息：
# 提取公司名称
# table>tbody>tr.table-plate3>td.tp2>span.tp2_tit>a     
table_plate3 = res.find_all('tr', 'table-plate3')
print(len(table_plate3))

180


In [222]:
urls = []        #详情页
companies = []   #公司名称
money = []       #融资额
frounds = []     #融资轮次
investors = []   #融资方
businesses = []  #行业
update_times = [] #跟新时间

for table_plate in table_plate3:
    all_td = table_plate.find_all('td')
    href = "http"+ all_td[0].a['href']
    company = all_td[1].span.text
    mon = all_td[2].find_all("div")[-1].text
    fround = all_td[8].text
    if all_td[9].a is not None:
        investor = all_td[9].a.text
    else:
        investor = '未知'
    business = all_td[10].a.text
    update_time = all_td[11].text
    
    companies.append(company)
    money.append(mon)
    frounds.append(fround)
    investors.append(investor)
    businesses.append(business)
    update_times.append(update_time)
    urls.append(href)

In [223]:
import pandas as pd

table_dict = {'公司名称':companies,'融资金额':money,'融资轮次':frounds,'投资方':investors,'行业':businesses,'更新时间':update_times,'详情URL':urls}
# 创建数据表
resultsDatas = pd.DataFrame(table_dict)

In [224]:
resultsDatas.head(10)

Unnamed: 0,公司名称,融资金额,融资轮次,投资方,行业,更新时间,详情URL
0,MyTaiwanTour,150万美元,战略投资,投资方未披露,旅游,2018-11-21,http//www.cyzone.cn/company/480955.html
1,Stilla Technologies,1800万美元,A轮,投资方未披露,医疗健康,2018-11-21,http//www.cyzone.cn/company/480954.html
2,中建材信息,5.87亿人民币,定向增发,投资方未披露,企业服务,2018-11-21,http//www.cyzone.cn/company/480953.html
3,晶准生物,数百万人民币,天使轮,投资方未披露,医疗健康,2018-11-21,http//www.cyzone.cn/company/480960.html
4,百布易卖,1亿美元,C+轮,老虎基金,电子商务,2018-11-21,http//www.cyzone.cn/company/480914.html
5,亮动科技,1000万人民币,天使轮,南京虎眼,医疗健康,2018-11-21,http//www.cyzone.cn/company/480913.html
6,QM Scientific,未公开,并购,Ulta Beauty,人工智能,2018-11-21,http//www.cyzone.cn/company/480906.html
7,GlamST,未公开,并购,Ulta Beauty,ARVR,2018-11-21,http//www.cyzone.cn/company/480905.html
8,Voi Technology,5000万美元,A轮,Balderton Capital,汽车交通,2018-11-21,http//www.cyzone.cn/company/480892.html
9,ClearDATA,2600万美元,E轮,Norwest Venture Partners,企业服务,2018-11-21,http//www.cyzone.cn/company/480891.html
