In [1]:
from bs4 import BeautifulSoup

In [2]:
# Minimal sample page: three links nested inside <div><span>.
# The href values are written without quotes here; the parsed output
# printed in the following cells shows them as quoted attributes.
html = """<html><body><div><span>
        <a href=http://www.naver.com>naver</a>
        <a href=https://www.google.com>google</a>
        <a href=http://www.daum.net/>daum</a>
        </span></div></body></html>"""

In [4]:
# Parse the sample markup into a navigable tree using the lxml parser.
soup = BeautifulSoup(html, features='lxml')

In [6]:
# Printing the soup object shows the parsed document as markup.
print(soup)

<html><body><div><span>
<a href="http://www.naver.com">naver</a>
<a href="https://www.google.com">google</a>
<a href="http://www.daum.net/">daum</a>
</span></div></body></html>


In [8]:
# prettify() renders the tree with each tag and text node on its own line,
# indented according to nesting depth.
print(soup.prettify())

<html>
 <body>
  <div>
   <span>
    <a href="http://www.naver.com">
     naver
    </a>
    <a href="https://www.google.com">
     google
    </a>
    <a href="http://www.daum.net/">
     daum
    </a>
   </span>
  </div>
 </body>
</html>


In [9]:
# find() returns only the first tag that matches.
print(soup.find('a'))

<a href="http://www.naver.com">naver</a>


In [13]:
# find_all() returns a bs4 ResultSet. The snake_case name is used because
# findAll is only a legacy camelCase alias kept for backward compatibility.
print(type(soup.find_all('a')))

<class 'bs4.element.ResultSet'>


In [14]:
# get_text() extracts the text contained in the first <a> tag.
print(soup.find('a').get_text())

naver


In [17]:
# find_all() returns a ResultSet of matches rather than a single tag.
print(type(soup.find_all('a')))

<class 'bs4.element.ResultSet'>


In [19]:
# Gather every <a> tag, then print each link's text on its own line.
site_names = soup.find_all(name='a')
for link_text in (site.get_text() for site in site_names):
    print(link_text)

naver
google
daum


In [22]:
# Build the list of link texts in a single pass and display it.
texts = [anchor.get_text() for anchor in soup.find_all(name='a')]
print(texts)

['naver', 'google', 'daum']


In [24]:
# Sample book list parsed with lxml. The ids "book_title" and "author" are each
# repeated three times in this markup, so a lookup by id yields several tags
# (HTML itself expects an id to be unique within a document).
soup2 = BeautifulSoup("""
	<html>
	 <head>
	  <title>작품과 작가 모음</title>
	 </head>
	 <body>
	  <h1>책 정보</h1>
	  <p id="book_title">토지</p>
	  <p id="author">박경리</p>
	
	  <p id="book_title">태백산맥</p>
	  <p id="author">조정래</p>
	
	  <p id="book_title">감옥으로부터의 사색</p>
	  <p id="author">신영복</p>
	 </body>
	</html>
""", 'lxml')

In [25]:
print(soup2.title) # attribute access by tag name; <title> occurs only once in this document

<title>작품과 작가 모음</title>


In [26]:
# Attribute access by tag name; this prints the whole <body> subtree.
print(soup2.body)

<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>


In [33]:
# Filter <p> tags by attribute: keep only those whose id is "book_title".
print(soup2.find_all('p', attrs={'id': 'book_title'}))

[<p id="book_title">토지</p>, <p id="book_title">태백산맥</p>, <p id="book_title">감옥으로부터의 사색</p>]


In [28]:
# Filter <p> tags by attribute: keep only those whose id is "author".
print(soup2.find_all('p', attrs={'id': 'author'}))

[<p id="author">박경리</p>, <p id="author">조정래</p>, <p id="author">신영복</p>]


In [38]:
# Title and author tags alternate in the document, so the two result lists
# line up by position and zip() pairs each book with its author.
titles = soup2.find_all('p', attrs={'id': 'book_title'})
authors = soup2.find_all('p', attrs={'id': 'author'})
for book, writer in zip(titles, authors):
    print(f"{book.get_text()}/{writer.get_text()}")

토지/박경리
태백산맥/조정래
감옥으로부터의 사색/신영복


In [41]:
# Using a CSS selector: select() returns a list, even when only one tag matches.
print(soup2.select('body'))

[<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>]


In [42]:
# Using a CSS selector (same query as the previous cell, repeated).
print(soup2.select('body'))

[<body>
<h1>책 정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>]


In [44]:
# Using a CSS selector: "body p" matches every <p> that is a descendant of <body>.
print(soup2.select('body p'))

[<p id="book_title">토지</p>, <p id="author">박경리</p>, <p id="book_title">태백산맥</p>, <p id="author">조정래</p>, <p id="book_title">감옥으로부터의 사색</p>, <p id="author">신영복</p>]


In [45]:
# Using a CSS selector: "p#book_title" matches <p> tags whose id is "book_title".
print(soup2.select('p#book_title')) # '#' introduces an id selector

[<p id="book_title">토지</p>, <p id="book_title">태백산맥</p>, <p id="book_title">감옥으로부터의 사색</p>]


In [46]:
# Second sample page: each link carries both a class and an id attribute.
# NOTE: this rebinds the name `html`, which previously held the first sample
# document; cells that parse `html` after this point see this page instead.
html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>사이트 모음</title>
</head>
<body>
<p id="title">
<b>자주 가는 사이트 모음</b>
</p>
<p id="contents">이곳은 자주 가는 사이트를 모아둔 곳입니다.</p>
<a href="http://www.naver.com" class="portal" id="naver">네이버</a> <br>
<a href="https://www.google.com" class="search" id="google">구글</a> <br>
<a href="http://www.daum.net" class="portal" id="daum">다음</a> <br>
<a href="http://www.nl.go.kr" class="government" id="nl">국립중앙도서관</a>
</body>
</html>
"""

In [47]:
# Parse the site-list page into its own tree using the lxml parser.
soup3 = BeautifulSoup(html, features='lxml')

In [48]:
# "a.portal" matches <a> tags whose class is "portal".
print(soup3.select('a.portal'))

[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>, <a class="portal" href="http://www.daum.net" id="daum">다음</a>]


In [49]:
# A chain of descendant selectors: <a> tags inside <body> inside <html>.
print(soup3.select('html body a'))

[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>, <a class="search" href="https://www.google.com" id="google">구글</a>, <a class="portal" href="http://www.daum.net" id="daum">다음</a>, <a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]


In [50]:
 print(soup3.select('html a'))

[<a class="portal" href="http://www.naver.com" id="naver">네이버</a>, <a class="search" href="https://www.google.com" id="google">구글</a>, <a class="portal" href="http://www.daum.net" id="daum">다음</a>, <a class="government" href="http://www.nl.go.kr" id="nl">국립중앙도서관</a>]
