# 정적 데이터 수집

---

## 0. 라이브러리 import

In [1]:
import requests
from bs4 import BeautifulSoup

## 1. 데이터 요청하기

In [2]:
# Target site for this exercise (a public sandbox built for scraping practice).
url = 'http://quotes.toscrape.com/'

# Send the GET request.
# - timeout: requests applies no timeout unless one is given, so a stalled
#   server would otherwise block this cell indefinitely.
# - TLS certificate verification is left at its default (enabled). Do not pass
#   verify=False: it does not apply to a plain-HTTP request like this one, and
#   it would silently drop certificate checks if the URL were ever changed to
#   HTTPS.
response = requests.get(url, timeout=10)

In [3]:
response.status_code # HTTP status code of the response; 200 means the request succeeded

200

In [4]:
my_html = response.text # body of the response decoded to text, i.e. the page's HTML source

## 2. 데이터 파싱

In [5]:
# Parse the downloaded HTML into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(my_html, 'html.parser')

# Render the parsed tree as indented text and display it.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small>
       <a href="/author/Albert

## 3. 원하는 데이터 추출하기

### 1) CSS Selector 사용법 알아보기

In [6]:
# Small hand-written HTML document used to practice CSS selectors.
# It contains: one h2, a p with an id, a p with a class, an input, an a,
# a plain p, and a div wrapping one more p.
test_html = """
<html>
    <head>
        <title>CSS Selectors Practice</title>
    </head>
    <body>
        <h2>Hello, World!</h2>
        <p id="uniqueId">This is a paragraph with a unique ID!</p>
        <p class="blueText">This is a paragraph with class blueText!</p>
        <input type="text" value="Hello there!" />
        <a href="#">Hover over me!</a>
        <p>Watch the first letter of this paragraph!</p>
        <div>
            <p>This is a direct child of a div element!</p>
        </div>
    </body>
</html>
"""
test_soup = BeautifulSoup(test_html, 'html.parser') # parse the practice document into a BeautifulSoup object

In [7]:
# Locating elements in a BeautifulSoup object with CSS selectors
#
#   soup.select_one(selector) -> a single matching element
#   soup.select(selector)     -> all matching elements, as a list

# 1. Select by tag name
#    Pattern: soup.select_one('tag')
first_h2 = test_soup.select_one('h2')
print(first_h2)  # the h2 element

all_paragraphs = test_soup.select('p')
print(all_paragraphs)  # list of every p element in the document


<h2>Hello, World!</h2>
[<p id="uniqueId">This is a paragraph with a unique ID!</p>, <p class="blueText">This is a paragraph with class blueText!</p>, <p>Watch the first letter of this paragraph!</p>, <p>This is a direct child of a div element!</p>]


In [8]:

# 2. Select by id
#    Pattern: soup.select_one('#id')
id_match = test_soup.select_one('#uniqueId')
print(id_match)  # the element whose id is "uniqueId"


<p id="uniqueId">This is a paragraph with a unique ID!</p>


In [9]:

# 3. Select by class
#    Pattern: soup.select_one('.class-name') / soup.select('.class-name')
blue_text_elems = test_soup.select('.blueText')
print(blue_text_elems)  # list of elements whose class is "blueText"


[<p class="blueText">This is a paragraph with class blueText!</p>]


In [10]:

# 4. Select by attribute value
#    Pattern: soup.select('tag[attribute="value"]')
hash_links = test_soup.select('a[href="#"]')
print(hash_links)  # list of a elements whose href is "#"


[<a href="#">Hover over me!</a>]


In [11]:

# 5. Select direct children
#    Pattern: soup.select('parent > child')
div_children = test_soup.select('div > p')
print(div_children)  # p elements that are direct children of a div


[<p>This is a direct child of a div element!</p>]


In [12]:

# 6. Select descendants (nested at any depth)
#    Pattern: soup.select('ancestor descendant')
body_paragraphs = test_soup.select('body p')
print(body_paragraphs)  # every p element nested anywhere inside body


[<p id="uniqueId">This is a paragraph with a unique ID!</p>, <p class="blueText">This is a paragraph with class blueText!</p>, <p>Watch the first letter of this paragraph!</p>, <p>This is a direct child of a div element!</p>]


In [13]:

# 7. Select by position among all siblings
#    Pattern: soup.select_one('tag:nth-child(n)')
third_child_p = test_soup.select_one('p:nth-child(3)')
# A p that is the 3rd child of its parent; siblings of every tag are counted,
# so here the h2 and the first p come before it.
print(third_child_p)


<p class="blueText">This is a paragraph with class blueText!</p>


In [14]:

# 8. Select by position among siblings of the same tag
#    Pattern: soup.select_one('tag:nth-of-type(n)')
third_p = test_soup.select_one('p:nth-of-type(3)')
# A p that is the 3rd p among its siblings; only p siblings are counted.
print(third_p)

<p>Watch the first letter of this paragraph!</p>


### 2) 데이터 추출하기

- HTML 요소 찾기

In [15]:
# Getting a CSS selector from the browser's developer tools:
#   1. Open the developer tools and pick the element you want.
#   2. Right-click it and choose Copy > Copy selector.
#   3. Pass the copied selector to select_one() or select().

# Selector copied for the text span of the first quote on the page.
first_quote_selector = 'body > div > div:nth-child(2) > div.col-md-8 > div:nth-child(1) > span.text'
span = soup.select_one(first_quote_selector)
print(span)

# Use select() to collect several elements at once. Without :nth-child(1) on
# the last div, the selector matches span.text under every div child instead
# of only the first one.
all_quotes_selector = 'body > div > div:nth-child(2) > div.col-md-8 > div > span.text'
spans = soup.select(all_quotes_selector)
print(spans)

<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>, <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>, <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>, <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>, <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>, <span class="text" itemprop="text">“Try not to become a man of success. Rat

- 찾은 요소의 데이터를 다루기

In [16]:
# Three ways to read the text of an element that was found.

# 1. get_text(): returns the text contained in the element
text_from_method = span.get_text()
print(text_from_method)

# 2. .string: the element's text when it holds a single text node, as this
#    span does (per the BeautifulSoup docs it is None when the element has
#    more than one child)
text_from_string = span.string
print(text_from_string)

# 3. .text: property form of the element's text
text_from_property = span.text
print(text_from_property)

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”


In [17]:
# Three ways to read the attributes of an element that was found.

# 1. .attrs: every attribute of the element, as a dict
all_attributes = span.attrs
print(all_attributes)

# 2. Index by attribute name (the class value comes back as a list)
class_by_index = span['class']
print(class_by_index)

# 3. get(): look an attribute up by name
class_by_get = span.get('class')
print(class_by_get)

{'class': ['text'], 'itemprop': 'text'}
['text']
['text']
