# BeautifulSoup

In [1]:
%%writefile example.html
<!DOCYTYPE HTML>
<html>
  <head>
  <title>HTML 기본구조</title>
  <meta charset="UTF-8">
  </head>
  <body>
      <h1>HTML 5 구조를 공부합시다.</h1>
  </body>
</html>

Writing example.html


In [2]:
from bs4 import BeautifulSoup
import urllib.request

with open("example.html") as fp :
  soup = BeautifulSoup(fp, "html.parser")

In [3]:
soup

<!--DOCYTYPE HTML-->
<html>
<head>
<title>HTML 기본구조</title>
<meta charset="utf-8"/>
</head>
<body>
<h1>HTML 5 구조를 공부합시다.</h1>
</body>
</html>

In [4]:
print(soup.prettify())

<!--DOCYTYPE HTML-->
<html>
 <head>
  <title>
   HTML 기본구조
  </title>
  <meta charset="utf-8"/>
 </head>
 <body>
  <h1>
   HTML 5 구조를 공부합시다.
  </h1>
 </body>
</html>


In [5]:
soup.title

<title>HTML 기본구조</title>

In [6]:
soup.title.name

'title'

In [7]:
soup.title.string

'HTML 기본구조'

In [8]:
soup.title.parent

<head>
<title>HTML 기본구조</title>
<meta charset="utf-8"/>
</head>

In [9]:
soup.title.parent.name

'head'

In [10]:
soup.h1

<h1>HTML 5 구조를 공부합시다.</h1>

In [11]:
soup.p

In [12]:
soup.div

In [13]:
soup.a

# HTML 태그 검색
다 알 수 없다.
- find() : 해당 조건에 맞는 하나의 태그를 가져옴
- find_all() : 해당 조건에 맞는 모든 태그를 가져옴
- select() : css 선택자와 같은 형식으로 선택 가능

In [14]:
soup_find = soup.find("h1")
soup_find

<h1>HTML 5 구조를 공부합시다.</h1>

In [15]:
soup.find_all("div")

[]

In [17]:
soup_find_all = soup.find_all("head")

In [18]:
soup_find_all

[<head>
 <title>HTML 기본구조</title>
 <meta charset="utf-8"/>
 </head>]

In [21]:
soup_find_all[:]

[<head>
 <title>HTML 기본구조</title>
 <meta charset="utf-8"/>
 </head>]

In [22]:
find_by_id = soup.find_all('div', {'id':'id1'})

find_by_id

[]

In [None]:
find_by_class = soup.find_all('div', {'class':'class1'}) # class가 class1인 것만

find_by_class

In [None]:
soup.find('a').get('href')
soup.find('a').get_text()

## 원하는 html 가져오는 법

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

res_comic = requests.get('https://comic.naver.com/index')
soup_comic = BeautifulSoup(res_comic.text, 'lxml')

soup_comic

In [24]:
comic_all = soup_comic.find("ol", attrs = {"id":"realTimeRankFavorite"}).find_all("li")

list_comic_all = []
for rank, comic in enumerate(comic_all) :
  list_comic = []
  list_comic.append(rank+1)
  list_comic.append(comic.a.text.strip())
  list_comic.append('https://comic.naver.com' + comic.a['href'])
  list_comic_all.append(list_comic)

df_comic = pd.DataFrame(list_comic_all, columns = ['순위', '제목', '링크'])
df_comic

Unnamed: 0,순위,제목,링크
0,1,참교육-93화,https://comic.naver.com/webtoon/detail?titleId...
1,2,퀘스트지상주의-43화 너 이런 거 하지마,https://comic.naver.com/webtoon/detail?titleId...
2,3,소녀의 세계-2부 113화,https://comic.naver.com/webtoon/detail?titleId...
3,4,윈드브레이커-4부 - 23화 HOT남,https://comic.naver.com/webtoon/detail?titleId...
4,5,백수세끼-139화 녹두전,https://comic.naver.com/webtoon/detail?titleId...
5,6,신의 탑-3부 133화,https://comic.naver.com/webtoon/detail?titleId...
6,7,팔이피플-44화 - 폭로의 홍수,https://comic.naver.com/webtoon/detail?titleId...
7,8,뷰티풀 군바리-341화_신임 중대장 (4),https://comic.naver.com/webtoon/detail?titleId...
8,9,신화급 귀속 아이템을 손에 넣었다-11화,https://comic.naver.com/webtoon/detail?titleId...
9,10,버림받은 왕녀의 은밀한 침실-7화. 낙인 부활,https://comic.naver.com/webtoon/detail?titleId...
