In [9]:
# Sample HTML document held in a string; the cells below parse it and query it.
html_example = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>BeautifulSoup 활용</title>
</head>
<body>
    <h1 id="heading">Heading 1</h1>
    <p>Paragraph</p>
    <span class="red">BeautifulSoup Library Examples!</span>
    <div id="link">
        <a class="external_link" href="www.google.com">google</a>
        <div id="class1">
            <p id="first">class1's first paragraph</p>
            <a class="external_link" href="www.naver.com">naver</a>

            <p id="second">class1's second paragraph</p>
            <a class="internal_link" href="/pages/page1.html">Page1</a>
            <p id="third">class1's third paragraph</p>
        </div>
    </div>
    <div id="text_id2">
        Example page
        <p>g</p>
    </div>
    <h1 id="footer">Footer</h1>
</body>
</html>
'''


In [10]:
from bs4 import BeautifulSoup
# Parse the sample markup using the 'html.parser' parser; `soup` is the
# document object every later cell queries.
soup = BeautifulSoup(html_example, 'html.parser')

# Select 문법

In [11]:
# Grab the <head> element and show its full markup.
head = soup.select_one('head')
print(head)

# get_text() is the method behind the .text property; strip() trims the
# surrounding whitespace so only the title text is printed.
print('head.text', head.get_text().strip())

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>BeautifulSoup 활용</title>
</head>
head.text BeautifulSoup 활용


In [12]:
# select_one() returns a single element; the document has two <h1> tags and
# the one printed here is the first (id="heading").
h1 = soup.select_one('h1')
print(h1)

<h1 id="heading">Heading 1</h1>


In [None]:
# Extract the <h1> tag whose id is 'footer' ('tag#id' selector form).
footer = soup.select_one('h1#footer')
print(footer)

<h1 id="footer">Footer</h1>


In [15]:
# 'tag.class' selector form: an <a> tag whose class is 'internal_link'.
class_link = soup.select_one('a.internal_link')
print(class_link)

<a class="internal_link" href="/pages/page1.html">Page1</a>


In [16]:
# .text gives the text inside the tag; ['href'] reads the tag's href attribute.
print(class_link.text)
print(class_link['href'])

Page1
/pages/page1.html


부모 태그 > 자식 태그 형식으로 접근: 태그가 단계적으로(직계 부모-자식 관계로) 존재할 때

In [18]:
# 'parent > child' form: an <a class="external_link"> that sits directly
# under <div id="link">.
link1 = soup.select_one('div#link > a.external_link')
print(link1)

<a class="external_link" href="www.google.com">google</a>


In [19]:
# find()-based lookup: first locate the container <div id="link"> ...
link_find = soup.find('div', id='link')

# ... then search inside it; class_ (trailing underscore) filters on the
# tag's class attribute.
external_link = link_find.find('a', class_='external_link')
print('find external_link:', external_link)


find external_link: <a class="external_link" href="www.google.com">google</a>


공백으로 하위 태그 선언
(상위 태그 하위태그) 형식으로 접근 : 자손 관계의 하위 태그

In [20]:
# A space between selectors targets a descendant: <p id="second"> somewhere
# inside <div id="class1">.
link2 = soup.select_one('div#class1 p#second')
print(link2)
print(link2.text)

<p id="second">class1's second paragraph</p>
class1's second paragraph


In [21]:
# The descendant form also reaches tags nested more than one level down:
# this <a> sits inside <div id="class1">, which is itself inside <div id="link">.
internal_link = soup.select_one('div#link a.internal_link')
print(internal_link['href'])
print(internal_link.text)

/pages/page1.html
Page1


In [22]:
# select() collects every match (both <h1> tags here), unlike select_one().
h1_all = soup.select('h1')
print('h1_all', h1_all)

h1_all [<h1 id="heading">Heading 1</h1>, <h1 id="footer">Footer</h1>]


모든 a 태그의 url 링크 검색

In [23]:
# Select only anchors that actually carry an href attribute, so the
# subscript below cannot raise KeyError on an <a> tag without one.
url_link = soup.select('a[href]')
for link in url_link:
    print(link['href'])

www.google.com
www.naver.com
/pages/page1.html


In [24]:
# Find every <a> tag that is a direct child of <div id='class1'>
# ('>' child combinator), then extract the URL of the first match.

div_urls = soup.select('div#class1 > a')

print(div_urls)
print(div_urls[0]['href'])

[<a class="external_link" href="www.naver.com">naver</a>, <a class="internal_link" href="/pages/page1.html">Page1</a>]
www.naver.com


In [25]:
# Descendant form of the query; in this document it matches the same two
# links because every <a> under <div id="class1"> is a direct child of it.
div_urls2 = soup.select('div#class1 a')
print(div_urls2)

[<a class="external_link" href="www.naver.com">naver</a>, <a class="internal_link" href="/pages/page1.html">Page1</a>]


여러 항목 검색하기: id가 heading 또는 footer인 'h1' 태그를 모두 검색

In [26]:
# A comma-separated selector list matches elements satisfying either selector.
# The selectors match on id alone; they do not themselves require an <h1> tag.
# NOTE(review): this rebinds `h1`, which an earlier cell set to a single
# element; from here on it holds a collection of results.
h1 = soup.select('#heading,#footer')
print(h1)

[<h1 id="heading">Heading 1</h1>, <h1 id="footer">Footer</h1>]


In [27]:
# Combine two class selectors to collect both external and internal links.
# NOTE(review): `url_link` is also assigned by an earlier cell; this overwrites it.
url_link = soup.select('a.external_link, a.internal_link')
print(url_link)

[<a class="external_link" href="www.google.com">google</a>, <a class="external_link" href="www.naver.com">naver</a>, <a class="internal_link" href="/pages/page1.html">Page1</a>]


In [None]:
|