## Scrapy 설치


In [1]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_

## Scrapy 프로젝트 생성

In [4]:
# Scaffold a new Scrapy project named hankyung_crawler in the current directory.
!scrapy startproject hankyung_crawler

New Scrapy project 'hankyung_crawler', using template directory '/usr/local/lib/python3.10/dist-packages/scrapy/templates/project', created in:
    /content/hankyung_crawler

You can start your first spider with:
    cd hankyung_crawler
    scrapy genspider example example.com


## 스파이더 만들기

In [15]:
!cd hankyung_crawler

In [16]:
# Generate a spider named "hankyung" for the domain search.hankyung.com.
!scrapy genspider hankyung search.hankyung.com

Created spider 'hankyung' using template 'basic' 


In [9]:
import scrapy

class HankyungSpider(scrapy.Spider):
    """Smoke-test spider: fetch the Hankyung total-search page for "금리" and dump it.

    The response body is printed as-is so the page structure can be inspected
    before writing real selectors.
    """

    name = "hankyung"
    # allowed_domains takes bare domain names; a full URL here is not a valid entry.
    allowed_domains = ['search.hankyung.com']
    # The query value is the percent-encoded search term only. A trailing "/"
    # would become part of the search term itself.
    start_urls = ['https://search.hankyung.com/search/total?query=%EA%B8%88%EB%A6%AC']

    def parse(self, response):
        """Print the raw HTML of the response followed by an END marker."""
        print(response.text)
        print("END")

In [35]:
!cd /content/

In [37]:
!apt-get update
!apt-get install -y python3-dev
!apt-get install -y libxml2-dev libxslt-dev
!apt-get install -y zlib1g-dev
!apt-get install -y libffi-dev libssl-dev

!pip install scrapy


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Waiting for headers] [1 InRelease 3,626 B/3,0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Waiting for headers] [Connected to r2u.stat.                                                                                                    Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [921 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jam

In [82]:
# Save this code as hankyung.py; it is the spider file that `scrapy runspider hankyung.py` executes below.
import scrapy
from datetime import datetime

class HankyungNewsSpider(scrapy.Spider):
    """Crawl Hankyung news search results for a term within a date range.

    Spider arguments (passed on the command line with ``-a name=value``):
        search_term: text to search for.
        start_date: first day of the range, e.g. ``2016.01.01``.
        end_date: last day of the range, same format.

    Each article page yields a dict with the keys ``date`` (``YYYY.MM.DD`` or
    ``'No date'``), ``content`` (article body text) and ``url``.
    """

    name = "hankyung_news"

    def __init__(self, search_term='', start_date='', end_date='', *args, **kwargs):
        """Store the spider arguments on the instance."""
        super().__init__(*args, **kwargs)
        self.search_term = search_term
        self.start_date = start_date
        self.end_date = end_date

    def start_requests(self):
        """Yield the request for the first page of search results."""
        # Local import keeps this class self-contained.
        from urllib.parse import quote

        page_num = 1
        # Percent-encode the user-supplied values so that characters such as
        # '&', '#' or spaces cannot break the query string.
        search_term = quote(self.search_term, safe='')
        start_date = quote(self.start_date, safe='')
        end_date = quote(self.end_date, safe='')

        url = (
            f"https://search.hankyung.com/search/news?query={search_term}"
            f"&mediaid_clust=HKPAPER,HKCOM&period=DATE&area=ALL"
            f"&sdate={start_date}&edate={end_date}&page={page_num}"
        )
        yield scrapy.Request(url, callback=self.parse_list_page, meta={'page_num': page_num})

    def parse_list_page(self, response):
        """Follow every article link on a result page, then the next result page."""
        # Article links on the current result page.
        news_links = response.css('ul.article > li > div.txt_wrap > a::attr(href)').getall()
        for link in news_links:
            yield response.follow(link, callback=self.parse_article)

        # Keep paginating for as long as a "next" link is present.
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list_page)

    def parse_article(self, response):
        """Extract the publication date and body text of a single article."""
        # "::text" is required: without it .get() returns the whole <span> element
        # markup, which is truthy (so the fallback below never runs) and can never
        # match the strptime format.
        date_str = response.css(
            '#container > div > div > article > div > div > div.article-timestamp'
            ' > div.datetime > span:nth-child(1) > span.txt-date::text'
        ).get()
        if not date_str:
            # Looser fallback for pages where the strict path does not match.
            date_str = response.xpath('//div[@class="article-timestamp"]//span[@class="txt-date"]/text()').get()

        date_only = self._format_date(date_str)

        # Concatenate every text node inside the article body.
        content = ''.join(response.css('div.article-body-wrap > div.article-body ::text').getall()).strip()

        yield {
            'date': date_only,
            'content': content,
            'url': response.url
        }

    @staticmethod
    def _format_date(date_str):
        """Reduce a ``YYYY.MM.DD HH:MM`` timestamp to ``YYYY.MM.DD``.

        Returns ``'No date'`` when the value is missing or not in that format.
        """
        if not date_str:
            return 'No date'
        try:
            return datetime.strptime(date_str.strip(), '%Y.%m.%d %H:%M').strftime('%Y.%m.%d')
        except ValueError:
            return 'No date'


In [83]:
!scrapy runspider hankyung.py -a search_term="금리" -a start_date="2016.01.01" -a end_date="2016.02.01" -o hankyung_news.json

2024-08-09 08:04:44 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-08-09 08:04:44 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.3.0, Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.2.2 4 Jun 2024), cryptography 42.0.8, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-08-09 08:04:44 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)
2024-08-09 08:04:44 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-08-09 08:04:44 [scrapy.extensions.telnet] INFO: Telnet Password: 9b9438550ddfae2d
2024-08-09 08:04:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'sc

## 날짜 데이터 가져오기

In [86]:
import pandas as pd
from datetime import datetime, timedelta

# Announcement dates (YYYY-MM-DD), listed in chronological order
announce_dates = [
    '2005-01-13', '2005-02-15', '2005-03-10', '2005-04-07', '2005-05-12', '2005-06-09', '2005-07-07',
    '2005-08-11', '2005-09-08', '2005-10-11', '2005-11-10', '2005-12-08', '2006-01-12', '2006-02-09',
    '2006-03-09', '2006-04-07', '2006-05-11', '2006-06-08', '2006-07-07', '2006-08-10', '2006-09-07',
    '2006-10-12', '2006-11-09', '2006-12-07', '2007-01-11', '2007-02-08', '2007-03-08', '2007-04-12',
    '2007-05-10', '2007-06-08', '2007-07-12', '2007-08-09', '2007-09-07', '2007-10-11', '2007-11-08',
    '2007-12-07', '2008-01-10', '2008-02-13', '2008-03-07', '2008-04-10', '2008-05-08', '2008-06-12',
    '2008-07-10', '2008-08-07', '2008-09-11', '2008-10-09', '2008-10-27', '2008-11-07', '2008-12-11',
    '2009-01-09', '2009-02-12', '2009-03-12', '2009-04-09', '2009-05-12', '2009-06-11', '2009-07-09',
    '2009-08-11', '2009-09-10', '2009-10-09', '2009-11-12', '2009-12-10', '2010-01-08', '2010-02-11',
    '2010-03-11', '2010-04-09', '2010-05-12', '2010-06-10', '2010-07-09', '2010-08-12', '2010-09-09',
    '2010-10-14', '2010-11-16', '2010-12-09', '2011-01-13', '2011-02-11', '2011-03-10', '2011-04-12',
    '2011-05-13', '2011-06-10', '2011-07-14', '2011-08-11', '2011-09-08', '2011-10-13', '2011-11-11',
    '2011-12-08', '2012-01-13', '2012-02-09', '2012-03-08', '2012-04-13', '2012-05-10', '2012-06-08',
    '2012-07-12', '2012-08-09', '2012-09-13', '2012-10-11', '2012-11-09', '2012-12-13', '2013-01-11',
    '2013-02-14', '2013-03-14', '2013-04-11', '2013-05-09', '2013-06-13', '2013-07-11', '2013-08-08',
    '2013-09-12', '2013-10-10', '2013-11-14', '2013-12-12', '2014-01-09', '2014-02-13', '2014-03-13',
    '2014-04-10', '2014-05-09', '2014-06-12', '2014-07-10', '2014-08-14', '2014-09-12', '2014-10-15',
    '2014-11-13', '2014-12-11', '2015-01-15', '2015-02-17', '2015-03-12', '2015-04-09', '2015-05-15',
    '2015-06-11', '2015-07-09', '2015-08-13', '2015-09-11', '2015-10-15', '2015-11-12', '2015-12-10',
    '2016-01-14', '2016-02-16', '2016-03-10', '2016-04-19', '2016-05-13', '2016-06-09', '2016-07-14',
    '2016-08-11', '2016-09-09', '2016-10-13', '2016-11-11', '2016-12-15', '2017-01-13', '2017-02-23',
    '2017-04-13', '2017-05-25', '2017-07-13', '2017-08-31', '2017-10-19', '2017-11-30', '2018-01-18',
    '2018-02-27', '2018-04-12', '2018-05-24', '2018-07-12', '2018-08-31', '2018-10-18', '2018-11-30',
    '2019-01-24', '2019-02-28', '2019-04-18', '2019-05-31', '2019-07-18', '2019-08-30', '2019-10-16',
    '2019-11-29', '2020-01-17', '2020-02-27', '2020-03-16', '2020-04-09', '2020-05-28', '2020-07-16',
    '2020-08-27', '2020-10-14', '2020-11-26', '2021-01-15', '2021-02-25', '2021-04-15', '2021-05-27',
    '2021-07-15', '2021-08-26', '2021-10-12', '2021-11-25', '2022-01-14', '2022-02-24', '2022-04-14',
    '2022-05-26', '2022-07-13', '2022-08-25', '2022-10-12', '2022-11-24', '2023-01-13', '2023-02-23',
    '2023-04-11', '2023-05-25', '2023-07-13', '2023-08-24', '2023-10-19', '2023-11-30', '2024-01-11',
    '2024-02-22', '2024-04-12', '2024-05-23', '2024-07-11', '2024-08-22', '2024-10-11', '2024-11-28'
]

# Parse each announcement date once
dates = [datetime.strptime(text, '%Y-%m-%d') for text in announce_dates]

# Build one window per pair of consecutive announcements: it opens the day after
# the earlier announcement and closes the day before the later one. No window is
# produced after the final announcement because it has no successor.
one_day = timedelta(days=1)
results = [
    {
        'start': (earlier + one_day).strftime('%Y.%m.%d'),
        'end': (later - one_day).strftime('%Y.%m.%d'),
    }
    for earlier, later in zip(dates, dates[1:])
]

# Tabulate the windows and show them
df = pd.DataFrame(results)
print(df)


          start         end
0    2005.01.14  2005.02.14
1    2005.02.16  2005.03.09
2    2005.03.11  2005.04.06
3    2005.04.08  2005.05.11
4    2005.05.13  2005.06.08
..          ...         ...
204  2024.04.13  2024.05.22
205  2024.05.24  2024.07.10
206  2024.07.12  2024.08.21
207  2024.08.23  2024.10.10
208  2024.10.12  2024.11.27

[209 rows x 2 columns]


In [87]:
# Write the start/end ranges to date_range.csv without the index column.
# NOTE(review): `df` must be the date-range frame here; another cell in this
# notebook also assigns `df`, so confirm the execution order.
df.to_csv('date_range.csv', index=False)

## 날짜별 데이터 크롤링

In [88]:
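# Save this code as run_spiders.py (e.g. make `%%writefile run_spiders.py` the first line of this cell) so that `python run_spiders.py` can find it.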
import csv
import subprocess

def run_spider_for_date_range(search_term, start_date, end_date):
    """Run the ``hankyung_news`` spider once for one search term and date range.

    Args:
        search_term: text passed to the spider as ``search_term``.
        start_date: first day of the range, passed as ``start_date``.
        end_date: last day of the range, passed as ``end_date``.

    Returns:
        The ``subprocess.CompletedProcess`` of the ``scrapy`` invocation, so a
        caller can inspect ``returncode``. Earlier callers that ignore the
        return value are unaffected.
    """
    output_file = f'{start_date}_{end_date}_news.csv'
    # NOTE(review): `scrapy crawl` resolves the spider by name, which presumably
    # requires running from inside the Scrapy project that contains it; the
    # notebook elsewhere uses `scrapy runspider hankyung.py`. Confirm the setup.
    cmd = [
        'scrapy', 'crawl', 'hankyung_news',
        '-a', f'search_term={search_term}',
        '-a', f'start_date={start_date}',
        '-a', f'end_date={end_date}',
        # -O overwrites the output file; -o would append duplicate rows whenever
        # the same range is crawled again.
        '-O', output_file,
    ]
    # A list argv (no shell) passes the values through without shell interpretation.
    completed = subprocess.run(cmd, check=False)
    if completed.returncode != 0:
        # Report the failure instead of dropping it, but keep going so one bad
        # range does not abort a long batch.
        print(f'scrapy exited with status {completed.returncode} for {start_date} - {end_date}')
    return completed

def main(input_csv='date_range.csv', search_term='금리'):
    """Run the spider once for every start/end row of a date-range CSV.

    Args:
        input_csv: path to a CSV file with ``start`` and ``end`` columns.
        search_term: search term used for every crawl.
    """
    # newline='' is the mode the csv module expects for its input files; the
    # encoding is explicit so reading does not depend on the platform default.
    with open(input_csv, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            run_spider_for_date_range(search_term, row['start'], row['end'])


if __name__ == "__main__":
    main()


In [89]:
# Run run_spiders.py from the current working directory.
# NOTE(review): the recorded output shows the file was not found in /content,
# so the script has to be saved there before this cell can succeed.
!python run_spiders.py

python3: can't open file '/content/run_spiders.py': [Errno 2] No such file or directory


## CSV 저장

In [85]:
import pandas as pd
import json

# Input (JSON feed written by the spider run) and output (CSV) locations
json_file = 'hankyung_news.json'
csv_file = 'hankyung_news.csv'

# Load the feed; on malformed JSON, report the error and continue with no records
try:
    with open(json_file, 'r', encoding='utf-8') as fh:
        data = json.loads(fh.read())
except json.JSONDecodeError as err:
    print(f"JSONDecodeError: {err}")
    data = []

# Only a top-level list of records can be tabulated
if not isinstance(data, list):
    print("JSON 데이터가 리스트가 아닙니다. 파일 형식을 확인하세요.")
else:
    df = pd.DataFrame(data)

    # Report how many articles were loaded
    print(f"저장된 기사의 개수: {len(df)}")

    # utf-8-sig writes a BOM at the start of the CSV file
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')
    print(f"CSV 파일이 저장되었습니다: {csv_file}")


저장된 기사의 개수: 60
CSV 파일이 저장되었습니다: hankyung_news.csv
