# 웹크롤링

### 크롤링을 통해 10개의 명언, 저자 출력해보기

In [13]:
# 필요한 라이브러리 (requests, BeautifulSoup)
# requests > 웹페이지의 HTML을 가져오는 라이브러리
# BeautifulSoup > 가져온 HTML을 분석(파싱)하는 라이브러리 

import requests
from bs4 import BeautifulSoup

# BeautifulSoup 라이브러리 설치가 필요할 시
# pip install requests beautifulsoup4

In [15]:
# Website to crawl: the front page of quotes.toscrape.com (a quotes site).
url = "https://quotes.toscrape.com/"

In [19]:
# Request the web page. requests applies no timeout by default, so an explicit
# one makes a stalled connection raise an error instead of hanging the notebook.
response = requests.get(url, timeout=10)
# The response object now holds the server's reply; the page's HTML is
# available as response.text.

print(response.status_code)  # 200 means the request succeeded

200


In [21]:
# Parse the downloaded page so it can be searched as a tree.
#   response.text -> the fetched HTML as a string
#   "html.parser" -> the parser BeautifulSoup should use to build the tree
soup = BeautifulSoup(markup=response.text, features="html.parser")

# Print the parsed document as indented HTML.
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small>
       <a href="/author/Albert

In [24]:
# 명언의 div class가 quote로 설정되어 있음을 확인: <div class="quote" ...>

# ex) 
#     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
#      <span class="text" itemprop="text">
#       “It is better to be hated for what you are than to be loved for what you are not.”
#      </span>
#      <span>
#       by
#       <small class="author" itemprop="author">
#        André Gide
#       </small>
#       <a href="/author/Andre-Gide">
#        (about)
#       </a>
#      </span>
#      <div class="tags">
#       Tags:
#       <meta class="keywords" content="life,love" itemprop="keywords"/>
#       <a class="tag" href="/tag/life/page/1/">
#        life
#       </a>
#       <a class="tag" href="/tag/love/page/1/">
#        love
#       </a>
#      </div>
#     </div>

# div 한 블록

IndentationError: unindent does not match any outer indentation level (<string>, line 9)

In [26]:
# Collect the quote blocks (each one holds a quote and its author).
quotes = soup.select(".quote")
# soup.select(".quote") takes a CSS selector and returns every element whose
# class includes "quote" -- on this page, the <div class="quote"> blocks.

In [28]:
# Print every quote on the page as "N. quote - author".
#   enumerate(quotes, start=1) -> walk the quote blocks with an index starting at 1
#   select_one(".text")        -> the <span class="text"> element holding the quote
#   select_one(".author")      -> the <small class="author"> element holding the author
#   get_text(strip=True)       -> the element's text with whitespace stripped
for idx, quote in enumerate(quotes, start=1):
    text_el = quote.select_one(".text")
    author_el = quote.select_one(".author")

    text = text_el.get_text(strip=True)
    author = author_el.get_text(strip=True)

    # Prefix the index to get the "1. quote - author" layout.
    print(f"{idx}. {text} - {author}")

1. “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” - Albert Einstein
2. “It is our choices, Harry, that show what we truly are, far more than our abilities.” - J.K. Rowling
3. “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” - Albert Einstein
4. “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” - Jane Austen
5. “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.” - Marilyn Monroe
6. “Try not to become a man of success. Rather become a man of value.” - Albert Einstein
7. “It is better to be hated for what you are than to be loved for what you are not.” - André Gide
8. “I have not failed. I've just found 10,000 ways that won't work.” - Thomas A. Edison
9. “A woman is like a tea bag; you never know how strong it is until it's in

In [3]:
# Install the packages used in this notebook (pandas is included as well).
# NOTE: this line is a notebook/shell command, not Python syntax -- it only
# works when executed as a notebook cell or typed into a terminal.
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [30]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Root of the site to crawl and the path of the first listing page.
# page_url is advanced to each "next" link and becomes None after the last page.
base_url = "https://quotes.toscrape.com"
page_url = "/page/1/"

# Write every quote on every page to quotes.csv.
#   - newline="" lets the csv module control line endings itself.
#   - "utf-8-sig" prepends a UTF-8 byte-order mark (presumably so spreadsheet
#     applications detect the encoding -- confirm against the intended consumer).
#   - A single Session reuses the connection across the page requests.
with requests.Session() as session, open(
    "quotes.csv", "w", newline="", encoding="utf-8-sig"
) as file:
    writer = csv.writer(file)
    writer.writerow(["Quote", "Author", "Tags"])

    while page_url:
        # urljoin resolves the link against the site root instead of relying
        # on plain string concatenation.
        response = session.get(urljoin(base_url, page_url), timeout=10)
        # Fail loudly on an HTTP error. Without this check an error page
        # (which has no "next" link) would silently end the loop and the
        # script would report success with an incomplete CSV.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the quote blocks on this page.
        quotes = soup.find_all("div", class_="quote")

        for quote in quotes:
            text = quote.find("span", class_="text").get_text(strip=True)
            author = quote.find("small", class_="author").get_text(strip=True)
            # All tags of a quote go into one comma-separated cell.
            tags = ", ".join(
                tag.get_text(strip=True)
                for tag in quote.find_all("a", class_="tag")
            )

            writer.writerow([text, author, tags])

        # Move on to the next page; stop when there is no "next" link.
        next_link = soup.select_one("li.next a[href]")
        page_url = next_link["href"] if next_link is not None else None

print("크롤링 완료! quotes.csv 파일이 생성되었습니다.")

크롤링 완료! quotes.csv 파일이 생성되었습니다.
