# 실제 웹페이지를 가져와서 BeautifulSoup 사용해보기
- 웹페이지의 HTML 코드를 가져오는 과정은 `requests` 모듈이 해줌

In [1]:
# 라이브러리 불러오기
from bs4 import BeautifulSoup
import requests

# 예제 사이트
> https://statkim.github.io/stats-summer-2021/

## 1. requests 모듈로 url의 HTML 코드 가져오기
- `requests.get(url)`

In [2]:
# Fetch the example page. requests applies no timeout by default, so an
# explicit one keeps the call from blocking forever on an unresponsive server.
req = requests.get("https://statkim.github.io/stats-summer-2021/", timeout=10)
req

<Response [200]>

In [3]:
# Pull just the raw HTML out of the response object (it comes back as bytes)
req.content

b'<!DOCTYPE html>\n<html lang="en-US">\n  <head>\n    <meta charset="UTF-8">\n\n<!-- Begin Jekyll SEO tag v2.7.1 -->\n<title>\xed\x86\xb5\xea\xb3\x84\xed\x8a\xb9\xea\xb0\x95 with Python | \xeb\x8f\x99\xec\x95\x84\xeb\x8c\x80\xed\x95\x99\xea\xb5\x90 \xed\x8c\xa8\xec\x85\x98\xeb\x94\x94\xec\x9e\x90\xec\x9d\xb8\xed\x95\x99\xea\xb3\xbc</title>\n<meta name="generator" content="Jekyll v3.9.0" />\n<meta property="og:title" content="\xed\x86\xb5\xea\xb3\x84\xed\x8a\xb9\xea\xb0\x95 with Python" />\n<meta property="og:locale" content="en_US" />\n<meta name="description" content="\xeb\x8f\x99\xec\x95\x84\xeb\x8c\x80\xed\x95\x99\xea\xb5\x90 \xed\x8c\xa8\xec\x85\x98\xeb\x94\x94\xec\x9e\x90\xec\x9d\xb8\xed\x95\x99\xea\xb3\xbc" />\n<meta property="og:description" content="\xeb\x8f\x99\xec\x95\x84\xeb\x8c\x80\xed\x95\x99\xea\xb5\x90 \xed\x8c\xa8\xec\x85\x98\xeb\x94\x94\xec\x9e\x90\xec\x9d\xb8\xed\x95\x99\xea\xb3\xbc" />\n<link rel="canonical" href="https://statkim.github.io/stats-summer-2021/" />\n<me

## 2. BeautifulSoup object로 만들기

In [4]:
# Parse the downloaded bytes with Python's built-in "html.parser" backend
doc = BeautifulSoup(markup=req.content, features="html.parser")
doc

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<!-- Begin Jekyll SEO tag v2.7.1 -->
<title>통계특강 with Python | 동아대학교 패션디자인학과</title>
<meta content="Jekyll v3.9.0" name="generator">
<meta content="통계특강 with Python" property="og:title"/>
<meta content="en_US" property="og:locale"/>
<meta content="동아대학교 패션디자인학과" name="description"/>
<meta content="동아대학교 패션디자인학과" property="og:description"/>
<link href="https://statkim.github.io/stats-summer-2021/" rel="canonical"/>
<meta content="https://statkim.github.io/stats-summer-2021/" property="og:url"/>
<meta content="통계특강 with Python" property="og:site_name"/>
<meta content="summary" name="twitter:card"/>
<meta content="통계특강 with Python" property="twitter:title"/>
<script type="application/ld+json">
{"description":"동아대학교 패션디자인학과","url":"https://statkim.github.io/stats-summer-2021/","@type":"WebSite","headline":"통계특강 with Python","name":"통계특강 with Python","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<met

In [5]:
# Print the parsed document re-indented, one tag per line, so nesting is visible
print(doc.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <!-- Begin Jekyll SEO tag v2.7.1 -->
  <title>
   통계특강 with Python | 동아대학교 패션디자인학과
  </title>
  <meta content="Jekyll v3.9.0" name="generator">
   <meta content="통계특강 with Python" property="og:title"/>
   <meta content="en_US" property="og:locale"/>
   <meta content="동아대학교 패션디자인학과" name="description"/>
   <meta content="동아대학교 패션디자인학과" property="og:description"/>
   <link href="https://statkim.github.io/stats-summer-2021/" rel="canonical"/>
   <meta content="https://statkim.github.io/stats-summer-2021/" property="og:url"/>
   <meta content="통계특강 with Python" property="og:site_name"/>
   <meta content="summary" name="twitter:card"/>
   <meta content="통계특강 with Python" property="twitter:title"/>
   <script type="application/ld+json">
    {"description":"동아대학교 패션디자인학과","url":"https://statkim.github.io/stats-summer-2021/","@type":"WebSite","headline":"통계특강 with Python","name":"통계특강 with Python","@context":"https://schem

## 3. 페이지 제목 가져오기

In [6]:
# The page's <title> element, returned with its surrounding tag
doc.title

<title>통계특강 with Python | 동아대학교 패션디자인학과</title>

In [7]:
# Only the text inside <title>, without the tag itself
doc.title.get_text()

'통계특강 with Python | 동아대학교 패션디자인학과'

## 4. 목차 내용 가져오기

In [8]:
# Collect every <li> element on the page (the table-of-contents entries)
doc.find_all("li")

[<li><a href="https://github.com/statKim/stats-summer-2021/blob/main/Lecture_note/Installation/intro_python.pdf">Python 소개 및 Anaconda 설치</a></li>,
 <li>기초 Python 프로그래밍 1 (<a href="https://github.com/statKim/stats-summer-2021/blob/main/Lecture_note/Day1/1.자료형.ipynb">자료형</a>, <a href="https://github.com/statKim/stats-summer-2021/blob/main/Lecture_note/Day1/2.조건문과%20반복문.ipynb/">조건문과 반복문</a>)</li>,
 <li><a href="https://github.com/statKim/stats-summer-2021/blob/main/Files/Day1.zip?raw=T"><strong>실습자료 다운로드</strong></a></li>,
 <li>기초 Python 프로그래밍 2 (<a href="https://github.com/statKim/stats-summer-2021/blob/main/Lecture_note/Day2/3.함수와%20모듈.ipynb/">함수와 모듈</a>)</li>,
 <li>데이터 다루기 1 (<a href="https://github.com/statKim/stats-summer-2021/blob/main/Lecture_note/Day2/NumPy.ipynb/">NumPy</a>)</li>,
 <li><a href="https://github.com/statKim/stats-summer-2021/blob/main/Files/Day2.zip?raw=T"><strong>실습자료 다운로드</strong></a></li>,
 <li>데이터 다루기 2 (<a href="https://github.com/statKim/stats-summer-2021/blob

In [9]:
# How many <li> elements were found
len(doc.find_all("li"))

14

In [10]:
# Text of each table-of-contents entry, with all nested tags stripped
[item.get_text() for item in doc.find_all("li")]

['Python 소개 및 Anaconda 설치',
 '기초 Python 프로그래밍 1 (자료형, 조건문과 반복문)',
 '실습자료 다운로드',
 '기초 Python 프로그래밍 2 (함수와 모듈)',
 '데이터 다루기 1 (NumPy)',
 '실습자료 다운로드',
 '데이터 다루기 2 (Pandas1, Pandas2)',
 '실습자료 다운로드',
 '데이터 시각화 (matplotlib1, matplotlib2, seaborn)',
 '실습자료 다운로드',
 '파이썬 가상환경 만들기',
 '웹 크롤링 1 (Beautifulsoup1, Beautifulsoup2)',
 '실습자료 다운로드',
 '웹 크롤링 2 (Selenium1, Selenium2)']

# 예제) 구글 관련 검색어 가져오기

## 1. URL 분석
- 검색에 이용되는 최소한의 url만을 가져와야함
- 실제로 구글에서 "python" 검색 시, 주소창에는 https://www.google.com/search?q=python&newwindow=1&ei=tzbhYOirLpLcmAWmnLyoCQ&oq=python&gs_lcp=Cgdnd3Mtd2l6EAMyBwgAELEDEEMyBAgAEEMyBwgAELEDEEMyBAgAEEMyBAgAEEMyBAgAEEMyBwgAELEDEEMyBAgAEEMyBAgAEEMyBAgAEEM6BQgAELEDOgIIADoICAAQsQMQgwFKBAhBGABQwtYEWOzlBGDl5wRoBHACeACAAYMBiAHmCpIBBDAuMTGYAQCgAQGqAQdnd3Mtd2l6sAEAwAEB&sclient=gws-wiz&ved=0ahUKEwio47Xpx8jxAhUSLqYKHSYOD5UQ4dUDCA4&uact=5 와 같이 나타남
- 하지만, 실제로 검색을 할 때에는 https://www.google.com/search?q=python 만 있어도 정상적으로 검색이 됨
- 여기서 `search?q=`가 검색어 옵션일 것으로 추측이 가능하고, `python`이 검색어라는 것을 유추할 수 있음

In [11]:
# Imported locally so this cell runs on its own.
from urllib.parse import quote_plus

# Base search URL: everything up to and including the `q=` parameter
base_url = "https://www.google.com/search?q="
# Search term
search = "python"

# Percent-encode the term before appending it, so that spaces, '&', '#', '+'
# or non-ASCII keywords cannot break or truncate the query string.
url = base_url + quote_plus(search)
url

'https://www.google.com/search?q=python'

## 2. URL에 접근하여 HTML 코드 가져오기

In [12]:
# Download the search-result page for `url` and parse it.
# The timeout prevents an indefinite hang, and raise_for_status() turns HTTP
# error responses into exceptions instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
req = response.content
# The response bytes are decoded explicitly as UTF-8.
doc = BeautifulSoup(req, "html.parser", from_encoding="utf-8")
print(doc.prettify())

<!DOCTYPE html>
<html lang="ko">
 <head>
  <meta charset="utf-8"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   python - Google 검색
  </title>
  <script nonce="Cn1SIDBq8+Mn6hFj/oGDGw==">
   (function(){
document.documentElement.addEventListener("submit",function(b){var a;if(a=b.target){var c=a.getAttribute("data-submitfalse");a="1"==c||"q"==c&&!a.elements.q.value?!0:!1}else a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var a;a:{for(a=b.target;a&&a!=document.documentElement;a=a.parentElement)if("A"==a.tagName){a="1"==a.getAttribute("data-nohref");break a}a=!1}a&&b.preventDefault()},!0);}).call(this);(function(){
var a=window.performance;window.start=Date.now();a:{var b=window;if(a){var c=a.timing;if(c){var d=c.navigationStart,f=c.responseStart;if(f>d&&f<=window.start){window.start=f;b.wsrt=f-d;break a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.google=w

## 3. 관련 검색어 가져오기

In [13]:
# Find every <div> whose class attribute is this string; on this page those
# are the related-search entries
doc.find_all("div", "BNeawe s3v9rd AP7Wnd lRVwie")

[<div class="BNeawe s3v9rd AP7Wnd lRVwie">python // 연산자</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">파이썬 강좌</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">Python</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">파이썬 설치</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">파이썬 문법</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">python :</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">python -m 뜻</div>,
 <div class="BNeawe s3v9rd AP7Wnd lRVwie">파이썬이란</div>]

In [14]:
# Keep only the visible text of each related-search <div>
[div.get_text() for div in doc.find_all("div", "BNeawe s3v9rd AP7Wnd lRVwie")]

['python // 연산자',
 '파이썬 강좌',
 'Python',
 '파이썬 설치',
 '파이썬 문법',
 'python :',
 'python -m 뜻',
 '파이썬이란']

# 예제) 코로나 일일확진자 수 가져오기

## 1. URL 분석
> http://ncov.mohw.go.kr/

In [27]:
# COVID-19 site, described in the original note as the Korea Disease Control
# and Prevention Agency (KDCA) homepage
url = "http://ncov.mohw.go.kr/"

## 2. URL에 접근하여 HTML 코드 가져오기

In [29]:
# Download the page at `url` and parse it (no search term is involved here;
# the URL is fetched as-is).
# The timeout prevents an indefinite hang, and raise_for_status() turns HTTP
# error responses into exceptions instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
req = response.content
doc = BeautifulSoup(req, "html.parser")
# Uncomment to inspect the parsed HTML:
# print(doc.prettify())

## 3. 확진자 수 가져오기
- 이번엔 CSS selector로 가져오기

In [31]:
# Full CSS path to span.data inside the first <li> (li:nth-child(1)) of the
# "today" live-number list
doc.select("body > div > div.mainlive_container > div.container > div > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new > div > ul > li:nth-child(1) > span.data")

[<span class="data">690</span>]

In [32]:
# Same path, but targeting the second <li> (li:nth-child(2))
doc.select("body > div > div.mainlive_container > div.container > div > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new > div > ul > li:nth-child(2) > span.data")

[<span class="data">56</span>]

In [33]:
# Without :nth-child(), the selector matches span.data in every <li>,
# so all counts come back in a single list.
num_patient = doc.select(
    "body > div > div.mainlive_container > div.container > div"
    " > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new"
    " > div > ul > li > span.data"
)
num_patient

[<span class="data">690</span>, <span class="data">56</span>]

In [34]:
# Reduce each matched <span> to the text it contains
n_patient = [span.get_text() for span in num_patient]
n_patient

['690', '56']

## 4. 국내발생, 해외유입 글자 가져오기

In [35]:
# Label (span.subtit) of the first <li> in the same list
doc.select("body > div > div.mainlive_container > div.container > div > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new > div > ul > li:nth-child(1) > span.subtit")

[<span class="subtit">국내발생</span>]

In [36]:
# Label (span.subtit) of the second <li> in the same list
doc.select("body > div > div.mainlive_container > div.container > div > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new > div > ul > li:nth-child(2) > span.subtit")

[<span class="subtit">해외유입</span>]

In [37]:
# Without :nth-child(), the selector matches span.subtit in every <li>,
# so all labels come back in a single list.
wh = doc.select(
    "body > div > div.mainlive_container > div.container > div"
    " > div.liveboard_layout > div.liveNumOuter > div.liveNum_today_new"
    " > div > ul > li > span.subtit"
)
wh

[<span class="subtit">국내발생</span>, <span class="subtit">해외유입</span>]

In [38]:
# Reduce each matched label <span> to the text it contains
whe = [span.get_text() for span in wh]
whe

['국내발생', '해외유입']

In [39]:
# Build a one-row DataFrame: one column per label, holding that label's count
import pandas as pd

# The scraped counts are strings; convert them to int so the columns are
# numeric. Commas are stripped in case the page formats large counts
# like "1,234".
counts = [int(text.replace(",", "")) for text in n_patient]
pd.DataFrame([counts], columns=whe)

Unnamed: 0,국내발생,해외유입
0,690,56


# 예제) 네이버 음원차트 TOP 10 가져오기
- 네이버에서 "음원차트" 검색했을 때 나오는 음원 차트
> https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=%EC%9D%8C%EC%9B%90%EC%B0%A8%ED%8A%B8&oquery=%EC%9D%8C%EC%9B%90%EC%B0%A8%ED%8A%B8&tqi=hMnzqwprvmZssj4tlECssssst84-276548

## 1. URL 분석

In [40]:
# Naver search URL reduced to just the `query` parameter
# (the query text "음원차트" means "music chart")
url = "https://search.naver.com/search.naver?query=음원차트"

## 2. URL에 접근하여 HTML 코드 가져오기

In [41]:
# Download the search-result page for `url` and parse it.
# The timeout prevents an indefinite hang, and raise_for_status() turns HTTP
# error responses into exceptions instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
req = response.content
doc = BeautifulSoup(req, "html.parser")
# Uncomment to inspect the parsed HTML:
# print(doc.prettify())

## 3. 음원차트 TOP 10 제목 가져오기

In [42]:
# Full CSS path to the title link of the first chart entry
# (first <ol>, first <li>)
doc.select("#main_pack > section.sc_new.sp_pmusic._au_music_collection._prs_mus_sen > div > div.group_music.type_chart > ol:nth-child(1) > li:nth-child(1) > div > div.album_info > div > span > a")

[<a class="tit" href="https://vibe.naver.com/track/49356016" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=1&amp;i=080003B5_000049356016&amp;u='+urlencode(this.href))" target="_vibe">바라만 본다</a>]

In [43]:
# Shorter selector anchored at div.album_info; it matches the title link of
# every chart entry instead of just the first
doc.select("div.album_info > div > span > a")

[<a class="tit" href="https://vibe.naver.com/track/49356016" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=1&amp;i=080003B5_000049356016&amp;u='+urlencode(this.href))" target="_vibe">바라만 본다</a>,
 <a class="tit" href="https://vibe.naver.com/track/48291574" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=2&amp;i=080003B5_000048291574&amp;u='+urlencode(this.href))" target="_vibe">Next Level</a>,
 <a class="tit" href="https://vibe.naver.com/track/48454459" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=3&amp;i=080003B5_000048454459&amp;u='+urlencode(this.href))" target="_vibe">Butter</a>,
 <a class="tit" href="https://vibe.naver.com/track/49356017" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=4&amp;i=080003B5_000049356017&amp;u='+urlencode(this.href))" target="_vibe">나를 아는 사람</a>,
 <a class="tit" href="https://vibe.naver.com/track/47849803" onclick="return goOtherCR(this, 'a=mus_sen*s.tit&amp;r=5&amp;i=080003B5_000047849803&amp;u='+urlencode(this.href))" t

In [44]:
# Number of title links matched by the short selector
len(doc.select("div.album_info > div > span > a"))

10

In [45]:
# Keep only the song titles (the link text)
title = [link.get_text() for link in doc.select("div.album_info > div > span > a")]
title

['바라만 본다',
 'Next Level',
 'Butter',
 '나를 아는 사람',
 'Dun Dun Dance',
 '치맛바람 (Chi Mat Ba Ram)',
 '헤픈 우연',
 'Alcohol-Free',
 "롤린 (Rollin')",
 'Peaches (Feat. Daniel Caesar, Giveon)']

## 4. 가수 이름 가져오기

In [46]:
# Full CSS path to the artist link of the first chart entry
# (second <span> inside the entry's info block)
doc.select("#main_pack > section.sc_new.sp_pmusic._au_music_collection._prs_mus_sen > div > div.group_music.type_chart > ol:nth-child(1) > li:nth-child(1) > div > div.album_info > div > div > span:nth-child(2) > a")

[<a href="https://vibe.naver.com/artist/4469800" onclick="return goOtherCR(this, 'a=mus_sen*s.artist&amp;r=1&amp;i=080003B5_000049356016&amp;u='+urlencode(this.href));" target="_vibe">MSG워너비(M.O.M)</a>]

In [47]:
# Shorter selector: the second <span> of every entry, which wraps the artist link
doc.select("div.album_info > div > div > span:nth-child(2)")

[<span class="name"><a href="https://vibe.naver.com/artist/4469800" onclick="return goOtherCR(this, 'a=mus_sen*s.artist&amp;r=1&amp;i=080003B5_000049356016&amp;u='+urlencode(this.href));" target="_vibe">MSG워너비(M.O.M)</a></span>,
 <span class="name"><a href="https://vibe.naver.com/artist/3980296" onclick="return goOtherCR(this, 'a=mus_sen*s.artist&amp;r=2&amp;i=080003B5_000048291574&amp;u='+urlencode(this.href));" target="_vibe">aespa</a></span>,
 <span class="name"><a href="https://vibe.naver.com/artist/143179" onclick="return goOtherCR(this, 'a=mus_sen*s.artist&amp;r=3&amp;i=080003B5_000048454459&amp;u='+urlencode(this.href));" target="_vibe">방탄소년단</a></span>,
 <span class="name"><a href="https://vibe.naver.com/artist/4469795" onclick="return goOtherCR(this, 'a=mus_sen*s.artist&amp;r=4&amp;i=080003B5_000049356017&amp;u='+urlencode(this.href));" target="_vibe">MSG워너비(정상동기)</a></span>,
 <span class="name"><a href="https://vibe.naver.com/artist/401694" onclick="return goOtherCR(this, 'a=

In [49]:
# Artist name of each chart entry (text of the second <span>)
singer = [span.get_text() for span in doc.select("div.album_info > div > div > span:nth-child(2)")]
singer

['MSG워너비(M.O.M)',
 'aespa',
 '방탄소년단',
 'MSG워너비(정상동기)',
 '오마이걸(OH MY GIRL)',
 '브레이브걸스(Brave Girls)',
 '헤이즈 (Heize)',
 'TWICE(트와이스)',
 '브레이브걸스(Brave Girls)',
 'Justin Bieber']

## 5. 앨범 제목 가져오기

In [48]:
# The first <span> of every entry, which wraps the album link
doc.select("div.album_info > div > div > span:nth-child(1)")

[<span class="name"><a href="https://vibe.naver.com/album/6086678" onclick="return goOtherCR(this, 'a=mus_sen*s.album&amp;r=1&amp;i=080003B5_000049356016&amp;u='+urlencode(this.href))" target="_vibe">MSG워너비 1집</a></span>,
 <span class="name"><a href="https://vibe.naver.com/album/5872148" onclick="return goOtherCR(this, 'a=mus_sen*s.album&amp;r=2&amp;i=080003B5_000048291574&amp;u='+urlencode(this.href))" target="_vibe">Next Level</a></span>,
 <span class="name"><a href="https://vibe.naver.com/album/5902422" onclick="return goOtherCR(this, 'a=mus_sen*s.album&amp;r=3&amp;i=080003B5_000048454459&amp;u='+urlencode(this.href))" target="_vibe">Butter</a></span>,
 <span class="name"><a href="https://vibe.naver.com/album/6086678" onclick="return goOtherCR(this, 'a=mus_sen*s.album&amp;r=4&amp;i=080003B5_000049356017&amp;u='+urlencode(this.href))" target="_vibe">MSG워너비 1집</a></span>,
 <span class="name"><a href="https://vibe.naver.com/album/5793100" onclick="return goOtherCR(this, 'a=mus_sen*s.al

In [50]:
# Album name of each chart entry (text of the first <span>)
album = [span.get_text() for span in doc.select("div.album_info > div > div > span:nth-child(1)")]
album

['MSG워너비 1집',
 'Next Level',
 'Butter',
 'MSG워너비 1집',
 'Dear OHMYGIRL',
 'Summer Queen',
 'HAPPEN',
 'Taste of Love',
 "Rollin'",
 'Justice']

## 6. 데이터 정리하기

In [51]:
# Combine the three scraped lists into one table, one column per list
# (column names: song title, artist, album).
chart_columns = {"노래명": title, "가수": singer, "앨범": album}
df = pd.DataFrame(chart_columns)
df

Unnamed: 0,노래명,가수,앨범
0,바라만 본다,MSG워너비(M.O.M),MSG워너비 1집
1,Next Level,aespa,Next Level
2,Butter,방탄소년단,Butter
3,나를 아는 사람,MSG워너비(정상동기),MSG워너비 1집
4,Dun Dun Dance,오마이걸(OH MY GIRL),Dear OHMYGIRL
5,치맛바람 (Chi Mat Ba Ram),브레이브걸스(Brave Girls),Summer Queen
6,헤픈 우연,헤이즈 (Heize),HAPPEN
7,Alcohol-Free,TWICE(트와이스),Taste of Love
8,롤린 (Rollin'),브레이브걸스(Brave Girls),Rollin'
9,"Peaches (Feat. Daniel Caesar, Giveon)",Justin Bieber,Justice
