# 웹 데이터를 가져오는 BeautifulSoup 익히기

In [1]:
from bs4 import BeautifulSoup

# Parse the local sample page with Python's built-in HTML parser.
# The `with` block closes the file even if parsing raises, and the explicit
# encoding matches the <meta charset="utf-8"> the document declares instead of
# relying on the platform's default encoding.
with open('example2.html', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [2]:
# Echo the parsed document; the notebook displays the value of a cell's last expression.
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<title>Very Simple HTML Code by PinkWink</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is Funny
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [4]:
# prettify() returns the document as a string with each tag and text node on
# its own line, indented by nesting depth.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Very Simple HTML Code by PinkWink
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://www.pinkwink.kr" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is Funny
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>


In [5]:
# Direct children of the document root: the doctype, a newline string and the <html> tag.
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <meta charset="utf-8"/>
 <title>Very Simple HTML Code by PinkWink</title>
 </head>
 <body>
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is Funny
             </b>
 </p>
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>
 </body>
 </html>]

In [8]:
# Iterating a tag yields its direct children; the whitespace between the
# child tags of <body> shows up as separate '\n' strings.
body = list(soup.body)
body

['\n',
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>,
 '\n',
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is Funny
             </b>
 </p>,
 '\n',
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>,
 '\n']

In [11]:
# find_all() returns every <p> tag in document order, including those nested inside <div>.
all_ps = soup.find_all('p')
all_ps

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is Funny
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [20]:
# find() returns only the first matching tag. get_text() joins the text of the
# tag and its descendants, keeping the original whitespace and newlines.
first_p = soup.find('p')
first_p.get_text()

'\n                Happy PinkWink.\n                PinkWink\n'

In [14]:
# Searching from a tag looks only inside that tag: the <a> within the first <p>.
first_p.find('a').get_text()

'PinkWink'

In [15]:
# `class_` (trailing underscore) is used because `class` is a Python keyword.
# A tag matches when 'inner-text' is one of its space-separated classes.
soup.find_all(class_='inner-text')

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>]

In [21]:
# Same class-based search, this time for the paragraphs carrying 'outer-text'.
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 Data Science is Funny
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [22]:
# Look a tag up by its id attribute.
soup.find(id='second')

<p class="outer-text first-item" id="second">
<b>
                Data Science is Funny
            </b>
</p>

In [23]:
# First <a> tag in the document.
soup.find('a')

<a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>

In [25]:
# Tag attributes are read with dict-style subscripting; here the link target.
a = soup.find('a')
a['href']

'http://www.pinkwink.kr'

In [26]:
# Print every hyperlink in the document as "<href>\t<link text>", one per line.
links = soup.find_all('a')
for link in links:
    url, site = link['href'], link.get_text()
    print("%s\t%s" % (url, site))

http://www.pinkwink.kr	PinkWink
https://www.python.org	Python


## 크롬 개발자 도구를 이용해서 원하는 태그 찾기

In [29]:
from urllib.request import urlopen

In [30]:
# Download Naver Finance's market index page and parse it.
url = 'https://finance.naver.com/marketindex/'
# The context manager closes the HTTP response once parsing is finished, and
# the timeout (seconds) keeps the cell from hanging forever if the server
# never answers.
with urlopen(url, timeout=10) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [31]:
# `soup` now refers to the fetched market index page, not the local sample file.
soup


<script language="javascript" src="/template/head_js.nhn?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market"></script>
<script src="/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"></script>
<script src="/js/jindo.1.5.3.element-text-patch.js" type="text/javascript"></script>
<div id="container" style="padding-bottom:0px;">
<script language="JavaScript" src="/js/flashObject.js?20200609205513"></script>
<div class="market_include">
<div class="market_data">
<div class="market1">
<div class="title">
<h2 class="h_market1"><span>환전 고시 환율</span></h2>
</div>
<!-- data -->
<div class="data">
<ul class="data_lst" id="exchangeList">
<li class="on">
<a class="head usd" href="/marketindex/exchangeDetail.nhn?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
<h3 class="h_lst"><span class="blind">미국 USD</span></h3>
<div class="head_info point_dn">
<span class="value">1,196.60</span>
<span class="txt_krw"><span class="blind">원</span></span>


In [32]:
# Every element with class 'value'; on this page these are the quoted figures.
soup.find_all(class_='value')

[<span class="value">1,196.60</span>,
 <span class="value">1,110.79</span>,
 <span class="value">1,357.12</span>,
 <span class="value">169.15</span>,
 <span class="value">107.7100</span>,
 <span class="value">1.1353</span>,
 <span class="value">1.2722</span>,
 <span class="value">96.3200</span>,
 <span class="value">38.94</span>,
 <span class="value">1307.71</span>,
 <span class="value">1714.7</span>,
 <span class="value">66040.8</span>]

In [37]:
# Narrow the search to the first element with class 'market_data', then
# collect every <li> beneath it; each <li> holds one market item.
market_data = soup.find(class_ = 'market_data')
lis = market_data.find_all('li')
lis

[<li class="on">
 <a class="head usd" href="/marketindex/exchangeDetail.nhn?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
 <h3 class="h_lst"><span class="blind">미국 USD</span></h3>
 <div class="head_info point_dn">
 <span class="value">1,196.60</span>
 <span class="txt_krw"><span class="blind">원</span></span>
 <span class="change"> 4.40</span>
 <span class="blind">하락</span>
 </div>
 </a>
 <a class="graph_img" href="/marketindex/exchangeDetail.nhn?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);">
 <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/>
 </a>
 <div class="graph_info">
 <span class="time">2020.06.10 11:10</span>
 <span class="source">하나은행 기준</span>
 <span class="count">고시회차<span class="num">159</span>회</span>
 </div>
 </li>,
 <li class="">
 <a class="head jpy" href="/marketindex/exchangeDetail.nhn?marketindexCd=FX_JPYKRW" onclick="clickcr(this, 'fr1.jpyt', ''

In [42]:
# For each market item, print its name (the first 'blind' element) next to its
# quoted figure (the first 'value' element).
for li in lis:
    label, value = li.find(class_='blind'), li.find(class_='value')
    print(label.get_text(), '\t', value.get_text())

미국 USD 	 1,196.60
일본 JPY(100엔) 	 1,110.79
유럽연합 EUR 	 1,357.12
중국 CNY 	 169.15
일본 엔/달러 	 107.7100
달러/유로 	 1.1353
달러/영국파운드 	 1.2722
달러인덱스 	 96.3200
WTI 	 38.94
휘발유 	 1307.71
국제 금 	 1714.7
국내 금 	 66040.8


In [43]:
import pandas as pd

In [44]:
# Gather the name and the quoted figure of every market item into two
# parallel lists, then print both lists.
labels, values = [], []
for li in lis:
    label = li.find(class_='blind')
    labels.append(label.get_text())
    value = li.find(class_='value')
    values.append(value.get_text())
print(labels)
print(values)

['미국 USD', '일본 JPY(100엔)', '유럽연합 EUR', '중국 CNY', '일본 엔/달러', '달러/유로', '달러/영국파운드', '달러인덱스', 'WTI', '휘발유', '국제 금', '국내 금']
['1,196.60', '1,110.79', '1,357.12', '169.15', '107.7100', '1.1353', '1.2722', '96.3200', '38.94', '1307.71', '1714.7', '66040.8']


In [51]:
# Build a two-column table: item name ('항목') and quoted figure ('값').
# The scraped figures are strings with thousands separators (e.g. '1,196.60');
# strip the commas and convert to float so the column is numeric and can be
# sorted or used in arithmetic.
eco_index = pd.DataFrame({
    '항목': labels,
    '값': [float(raw.replace(',', '')) for raw in values],
})
eco_index

Unnamed: 0,항목,값
0,미국 USD,1196.6
1,일본 JPY(100엔),1110.79
2,유럽연합 EUR,1357.12
3,중국 CNY,169.15
4,일본 엔/달러,107.71
5,달러/유로,1.1353
6,달러/영국파운드,1.2722
7,달러인덱스,96.32
8,WTI,38.94
9,휘발유,1307.71
