## 데이터 분석을 위한 파이썬 철저 입문

In [1]:
import requests

# Fetch the Google Korea home page.
# An explicit timeout is passed because requests does not time out on its
# own: without one, the call can block forever if the server never answers.
r = requests.get('https://www.google.co.kr', timeout=10)
r

<Response [200]>

In [2]:
# Preview only the first 1000 characters of the response body.
r.text[0:1000]

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2022/dr-michiaki-takahashis-94th-birthday-6753651837109359-l.png" itemprop="image"><meta content="Dr. Michiaki Takahashi\'s 94th Birthday" property="twitter:title"><meta content="Dr. Michiaki Takahashi\'s 94th Birthday! #GoogleDoodle" property="twitter:description"><meta content="Dr. Michiaki Takahashi\'s 94th Birthday! #GoogleDoodle" property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content="https://www.google.com/logos/doodles/2022/dr-michiaki-takahashis-94th-birthday-6753651837

### 예제1

In [3]:
from bs4 import BeautifulSoup

# Sample HTML used for testing.
# Adjacent string literals are joined into one single-line document
# (no newlines between the tags).
html = (
    '<html><body><div><span>'
    '<a href=http://www.naver.com>naver</a>'
    '<a href=https://www.google.com>google</a>'
    '<a href=http://www.daum.net/>daum</a>'
    '</span></div></body></html>'
)

# Parse the HTML source with BeautifulSoup, using the lxml parser.
soup = BeautifulSoup(html,'lxml')
soup

<html><body><div><span><a href="http://www.naver.com">naver</a><a href="https://www.google.com">google</a><a href="http://www.daum.net/">daum</a></span></div></body></html>

In [4]:
# Print the parse tree with each tag and text node on its own indented line.
print(soup.prettify())

<html>
 <body>
  <div>
   <span>
    <a href="http://www.naver.com">
     naver
    </a>
    <a href="https://www.google.com">
     google
    </a>
    <a href="http://www.daum.net/">
     daum
    </a>
   </span>
  </div>
 </body>
</html>


In [5]:
# find() returns only the first matching tag.
soup.find('a')

<a href="http://www.naver.com">naver</a>

In [6]:
# get_text() returns just the text inside the tag, without the markup.
soup.find('a').get_text()

'naver'

In [7]:
# find_all() returns every matching tag, in document order.
soup.find_all('a')

[<a href="http://www.naver.com">naver</a>,
 <a href="https://www.google.com">google</a>,
 <a href="http://www.daum.net/">daum</a>]

In [8]:
# Collect all <a> tags, then print the text of each one on its own line.
site_names = soup.find_all('a')
for site_name in site_names:
    print(site_name.get_text())

naver
google
daum


### 예제2

In [9]:
from bs4 import BeautifulSoup

# Sample HTML used for testing.
# NOTE(review): the id values "book_title" and "author" are each repeated on
# several <p> tags. HTML expects an id to be unique within a document, so a
# class attribute would be the conventional choice; kept as-is because the
# lookups in the following cells select by id.
html2 = '''
<html>
    <head>
        <title>작품과 작가 모음</title>
    </head>
    <body>
        <h1>책정보</h1>
        <p id="book_title">토지</p>
        <p id="author">박경리</p>

        <p id="book_title">태백산맥</p>
        <p id="author">조정래</p>

        <p id="book_title">감옥으로부터의 사색</p>
        <p id="author">신영복</p>
    </body>
</html>
'''

# Parse the sample document with the lxml parser.
soup2 = BeautifulSoup(html2, 'lxml')

In [10]:
# Personal note: BeautifulSoup seems like a good module for parsing HTML.
# A tag can be reached directly as an attribute of the soup object.
soup2.title

<title>작품과 작가 모음</title>

In [11]:
# The whole <body> element, including the tags nested inside it.
soup2.body

<body>
<h1>책정보</h1>
<p id="book_title">토지</p>
<p id="author">박경리</p>
<p id="book_title">태백산맥</p>
<p id="author">조정래</p>
<p id="book_title">감옥으로부터의 사색</p>
<p id="author">신영복</p>
</body>

In [12]:
# Attribute access can be chained to walk down the tree: <body> -> <h1>.
soup2.body.h1

<h1>책정보</h1>

In [13]:
# Every <p> tag in the document, regardless of its id.
soup2.find_all('p')

[<p id="book_title">토지</p>,
 <p id="author">박경리</p>,
 <p id="book_title">태백산맥</p>,
 <p id="author">조정래</p>,
 <p id="book_title">감옥으로부터의 사색</p>,
 <p id="author">신영복</p>]

In [14]:
# With BeautifulSoup, results can be narrowed down by tag name and attributes.
# The bare string below is a usage memo; as the last expression in the cell
# it is echoed back as the cell's output.
'''
BeautifulSoup.find_all("태그","속성")
BeautifulSoup.find("태그","속성")
'''

'\nBeautifulSoup.find_all("태그","속성")\nBeautifulSoup.find("태그","속성")\n'

In [16]:
# First <p> tag whose id attribute is "book_title".
soup2.find('p',{'id':'book_title'})

<p id="book_title">토지</p>

In [18]:
# First <p> tag whose id attribute is "author".
soup2.find('p',{'id':'author'})

<p id="author">박경리</p>

In [19]:
# Every <p> tag whose id attribute is "book_title".
soup2.find_all('p',{'id':'book_title'})

[<p id="book_title">토지</p>,
 <p id="book_title">태백산맥</p>,
 <p id="book_title">감옥으로부터의 사색</p>]

In [20]:
# Every <p> tag whose id attribute is "author".
soup2.find_all('p',{'id':'author'})

[<p id="author">박경리</p>, <p id="author">조정래</p>, <p id="author">신영복</p>]

In [21]:
from bs4 import BeautifulSoup

# Re-parse the sample document so this cell can run on its own.
soup2 = BeautifulSoup(html2, 'lxml')

# Titles and authors are collected separately, each in document order.
book_titles = soup2.find_all('p', {'id': 'book_title'})
authors = soup2.find_all('p', {'id': 'author'})

# Pair each title with the author at the same position and print
# them as "title/author", one pair per line.
for book_title, author in zip(book_titles, authors):
    print(f'{book_title.get_text()}/{author.get_text()}')

토지/박경리
태백산맥/조정래
감옥으로부터의 사색/신영복


In [22]:
# select() takes a CSS selector: 'body h1' matches <h1> tags inside <body>.
# The result is a list even when there is a single match.
soup2.select('body h1')

[<h1>책정보</h1>]

In [23]:
# 'body p' matches the <p> tags inside <body>.
soup2.select('body p')

[<p id="book_title">토지</p>,
 <p id="author">박경리</p>,
 <p id="book_title">태백산맥</p>,
 <p id="author">조정래</p>,
 <p id="book_title">감옥으로부터의 사색</p>,
 <p id="author">신영복</p>]

In [24]:
# A bare tag name as the selector matches every <p> tag in the document.
soup2.select('p')

[<p id="book_title">토지</p>,
 <p id="author">박경리</p>,
 <p id="book_title">태백산맥</p>,
 <p id="author">조정래</p>,
 <p id="book_title">감옥으로부터의 사색</p>,
 <p id="author">신영복</p>]

In [25]:
# 'p#book_title' matches <p> tags whose id is "book_title".
soup2.select('p#book_title')

[<p id="book_title">토지</p>,
 <p id="book_title">태백산맥</p>,
 <p id="book_title">감옥으로부터의 사색</p>]

In [26]:
# 'p#author' matches <p> tags whose id is "author".
soup2.select('p#author')

[<p id="author">박경리</p>, <p id="author">조정래</p>, <p id="author">신영복</p>]