In [1]:
# Web crawling
# A crawler is a program that retrieves content and other data across the web and indexes it automatically.
# Crawling can be used to find specific URLs and domains.

In [2]:
# Web scraping
# Scraping is about the data itself: extracting specific data fields from a particular website.
# The target domain must be known in advance.

In [3]:
# Web crawling process
# The crawler indexes all the content of each page it visits, then stores the indexed data in a database.

In [4]:
# robots.txt
# Located at the top-level (root) path of the site, e.g. https://www.reuters.com/robots.txt

In [5]:
# Check whether a given User-agent is allowed to access a given URL.

import urllib.robotparser
# Passing the URL to the constructor registers it the same way set_url() does.
rp = urllib.robotparser.RobotFileParser("https://www.reuters.com/robots.txt")
rp.read()  # download and parse the robots.txt file

In [6]:
# robots.txt rules are scoped to the host they were fetched from (www.reuters.com),
# so check a URL on that same host, for any crawler ("*").
rp.can_fetch(useragent = "*", url = "https://www.reuters.com/sitemap.xml")

True

In [7]:
# Sitemap URLs listed in the robots.txt file that `rp` parsed.
sitemaps = rp.site_maps()
sitemaps

['https://www.reuters.com/arc/outboundfeeds/sitemap-index/?outputType=xml',
 'https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml',
 'https://www.reuters.com/sitemap_video_index.xml',
 'https://www.reuters.com/brandfeature/sitemap']

In [8]:
!pip install xmltodict



In [9]:
import xmltodict
import requests

# Take the first sitemap index straight from the robots.txt parser rather than from
# `sitemaps`: this cell rebinds `sitemaps` to the parsed dict below, so indexing it
# with [0] would fail when the cell is run a second time.
url = rp.site_maps()[0]  # https://www.reuters.com/arc/outboundfeeds/sitemap-index/?outputType=xml
# Fail fast on a stalled connection or an HTTP error instead of parsing an error page.
index_response = requests.get(url, timeout=10)
index_response.raise_for_status()
sitemaps = xmltodict.parse(index_response.text)

In [10]:
# Collect the sitemap URLs from the parsed sitemap index.

sitemap_entries = sitemaps['sitemapindex']['sitemap']
# xmltodict yields a single dict (not a list) when the index has only one <sitemap> child;
# wrap it so the comprehension below always iterates over entries.
if isinstance(sitemap_entries, dict):
    sitemap_entries = [sitemap_entries]
sitemap_urls = [entry['loc'] for entry in sitemap_entries]

In [11]:
# Report how many sitemap URLs were collected.
print(f"{len(sitemap_urls)}개의 사이트맵 URL이 있습니다.")

# Show only the first five sitemap URLs as a sample.
sitemap_urls[:5]

96개의 사이트맵 URL이 있습니다.


['https://www.reuters.com/arc/outboundfeeds/sitemap/?outputType=xml',
 'https://www.reuters.com/arc/outboundfeeds/sitemap/?outputType=xml&from=100',
 'https://www.reuters.com/arc/outboundfeeds/sitemap/?outputType=xml&from=200',
 'https://www.reuters.com/arc/outboundfeeds/sitemap/?outputType=xml&from=300',
 'https://www.reuters.com/arc/outboundfeeds/sitemap/?outputType=xml&from=400']

In [12]:
# Download the first sitemap page; the timeout prevents an indefinite hang and
# raise_for_status() surfaces HTTP errors before the body is parsed as XML.
response = requests.get(sitemap_urls[0], timeout=10)
response.raise_for_status()

In [13]:
# Parse the downloaded sitemap XML into nested dicts.
urls = xmltodict.parse(response.text)

In [14]:
# Inspect the parsed sitemap structure.
urls

{'urlset': {'@xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9',
  '@xmlns:image': 'http://www.google.com/schemas/sitemap-image/1.1',
  'url': [{'loc': 'https://www.reuters.com/business/autos-transportation/audi-will-not-cut-ev-prices-follow-teslas-lead-audi-europe-chief-2023-02-28/',
    'lastmod': '2023-02-28T07:13:27.750Z',
    'image:image': {'image:loc': 'https://www.reuters.com/resizer/3ck-rB2BWzRLINVI1mYRPBE4F2I=/cloudfront-us-east-2.images.arcpublishing.com/reuters/XKPE7AQGMVLO3PSI37MGYMG2OM.jpg',
     'image:caption': 'The Audi logo is seen during the first press day of the Paris auto show, in Paris, France, October 2, 2018. REUTERS/Benoit Tessier'}},
   {'loc': 'https://www.reuters.com/markets/europe/abrdn-swings-pretax-loss-markets-turmoil-weighs-2023-02-28/',
    'lastmod': '2023-02-28T07:13:48.463Z',
    'image:image': {'image:loc': 'https://www.reuters.com/resizer/RuuwsOfwropXQyGRsY1xRJCYNl4=/cloudfront-us-east-2.images.arcpublishing.com/reuters/K63GJYXFKZPB5NRGQRU3PX

In [15]:
# Build a list holding only the article URLs and bind it to news_urls.

url_entries = urls['urlset']['url']
# xmltodict yields a single dict (not a list) when the sitemap has only one <url> child;
# wrap it so the comprehension below always iterates over entries.
if isinstance(url_entries, dict):
    url_entries = [url_entries]
news_urls = [entry['loc'] for entry in url_entries]

In [16]:
# Show the collected article URLs.
display(news_urls)

['https://www.reuters.com/business/autos-transportation/audi-will-not-cut-ev-prices-follow-teslas-lead-audi-europe-chief-2023-02-28/',
 'https://www.reuters.com/markets/europe/abrdn-swings-pretax-loss-markets-turmoil-weighs-2023-02-28/',
 'https://www.reuters.com/markets/asia/china-income-spending-per-capita-grow-over-2022-national-bureau-statistics-2023-02-28/',
 'https://www.reuters.com/business/finance/spains-santander-raises-three-year-profitability-target-15-17-2023-02-28/',
 'https://www.reuters.com/world/europe/russian-offensive-eastern-ukraine-focused-bakhmut-2023-02-28/',
 'https://www.reuters.com/lifestyle/sports/wta-roundup-alycia-parks-bounced-austin-opener-2023-02-28/',
 'https://www.reuters.com/markets/deals/insurer-axa-sells-off-most-its-shares-monte-dei-paschi-233-eurosshare-2023-02-28/',
 'https://www.reuters.com/world/us/florida-gov-desantis-signs-bill-asserting-state-oversight-land-around-walt-2023-02-27/',
 'https://www.reuters.com/world/india/india-bans-oil-tankers

In [22]:
%%time
# Download the first five articles and save each one as datas/<slug>.html.
import os

# Create the output directory up front so a fresh run does not fail inside open().
os.makedirs("datas", exist_ok=True)

# A Session reuses the underlying connection across requests; the with-block
# releases it once the downloads are finished.
with requests.Session() as session:
    # Only fetch the first five links.
    for url in news_urls[:5]:
        # The article slug is the last path segment, with or without a trailing "/".
        file = url.rstrip("/").rsplit("/", 1)[-1] + ".html"
        # The timeout keeps one stalled request from hanging the whole loop.
        response = session.get(url, timeout=10)
        if response.ok:
            # Write explicit UTF-8 bytes so the saved file does not depend on the
            # platform's default text encoding.
            with open(os.path.join("datas", file), "wb") as f:
                f.write(response.text.encode("utf-8"))
        else:
            print(f"error with URL : {url}")

Wall time: 1.61 s


In [24]:
# Open filename.txt in write mode ('w'); open() returns a file object.
file = open('filename.txt', 'w')
try:
    # The file object's write() method writes a string to the file.
    file.write("파일에 작성할 문자열")
finally:
    # The file object must always be closed once work on it is done;
    # the finally clause guarantees that even if write() raises.
    file.close()

In [28]:
# Use `with` to write the file without having to call close() explicitly.
# The code below performs the same write as the explicit open()/close() version.

with open('filename.txt', 'w') as file:
    file.write("파일에 작성할 문자열")

In [29]:
# Read the file back and print its contents.

with open('filename.txt', 'r') as file:
    print(file.read())

파일에 작성할 문자열


In [30]:
# Build the paths of the files saved under ./datas/.

import os
path = "./datas/"
# os.listdir() returns names in arbitrary order; sort them so positional lookups
# such as files[2] select the same file on every run and platform.
files = [path + name for name in sorted(os.listdir(path))]
files

['./datas/abrdn-swings-pretax-loss-markets-turmoil-weighs-2023-02-28.html',
 './datas/audi-will-not-cut-ev-prices-follow-teslas-lead-audi-europe-chief-2023-02-28.html',
 './datas/china-income-spending-per-capita-grow-over-2022-national-bureau-statistics-2023-02-28.html',
 './datas/russian-offensive-eastern-ukraine-focused-bakhmut-2023-02-28.html',
 './datas/spains-santander-raises-three-year-profitability-target-15-17-2023-02-28.html']

In [31]:
# Data extraction with an HTML parser.
# Load the third entry from the list of saved HTML files.

with open(files[2], "r", encoding="utf8") as f:
    html = f.read()

In [34]:
# BeautifulSoup
# A module that helps with parsing HTML.
# Available parsers: "html.parser", "lxml", "lxml-xml", "xml", "html5lib"

from bs4 import BeautifulSoup as bs
soup = bs(html, features='html.parser')
# Locate the headline <h1> through its full CSS selector path and print its text.
title = soup.select_one("#main-content > article > div.article__main__33WV2 > div.article__content__6hMn9 > header > div > div > h1")
print(title.get_text())

China urban employment, per capita spending logged rare declines in 2022


In [36]:
# Get the object for the <h1> tag.

soup.h1

<h1 class="text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_3__1kDhc heading__base__2T28j heading__heading_3__3aL54 article-header__title__3Y2hh" data-testid="Heading">China urban employment, per capita spending logged rare declines in 2022</h1>

In [37]:
# Get the text enclosed by the <h1> tag.

soup.h1.text

'China urban employment, per capita spending logged rare declines in 2022'

In [43]:
# Text of the first <p> that is a direct child of the article-body container.
soup.select_one("div.article-body__content__17Yit.paywall-article > p").text

"BEIJING, Feb 28 (Reuters) - China's urban employment fell for the first time in six decades last year and per capita spending also marked a rare decline, as harsh COVID-19 curbs ravaged the world's second-biggest economy."

In [44]:
# Select every <p> element in the document.
soup.select("p")

[<p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__large__nEccO body__full_width__ekUdw body__large_body__FV5_X article-body__element__2p5pI" data-testid="paragraph-0" style="font-size:20px">BEIJING, Feb 28 (Reuters) - China's urban employment fell for the first time in six decades last year and per capita spending also marked a rare decline, as harsh COVID-19 curbs ravaged the world's second-biggest economy.</p>,
 <p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__large__nEccO body__full_width__ekUdw body__large_body__FV5_X article-body__element__2p5pI" data-testid="paragraph-1" style="font-size:20px">The new data from the National Bureau of Statistics also showed the smallest income growth in more than three decades.</p>,
 <p class="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__large__nEccO body__full_width__ekUdw body__large_body__FV5_X article-body__element__2p5pI" data-testid="paragraph-2" style="font-siz

In [45]:
# Standard (canonical) URL of the page, read from the href of its <link rel="canonical"> tag.

soup.find("link", {"rel": "canonical"})["href"]

'https://www.reuters.com/markets/asia/china-income-spending-per-capita-grow-over-2022-national-bureau-statistics-2023-02-28/'