In [1]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta
import pandas as pd
import re
import os

def fetch_news_links_for_offices(query, start_date, end_date, news_offices, max_pages=1):
    """Collect Naver news-search result links per press office and per day.

    For every office code in ``news_offices`` and every calendar day from
    ``start_date`` to ``end_date`` (inclusive), the Naver search page is
    requested and the ``href`` of each ``a.info`` anchor containing
    ``"naver.com"`` is kept.

    Args:
        query: Search keyword.
        start_date: First day to search, formatted ``"YYYY.MM.DD"``.
        end_date: Last day to search (inclusive), formatted ``"YYYY.MM.DD"``.
        news_offices: Iterable of office codes sent as ``news_office_checked``.
        max_pages: Maximum number of result pages to request per office/day.
            Paging stops early as soon as a page contains no ``a.info`` anchors.

    Returns:
        ``{office: {"YYYY.MM.DD": [link, ...]}}``. If ``start_date`` is after
        ``end_date`` every office maps to an empty dict.

    Raises:
        ValueError: If a date string does not match ``"%Y.%m.%d"``.
        requests.HTTPError: If a search request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    base_url = "https://search.naver.com/search.naver"
    start = datetime.strptime(start_date, "%Y.%m.%d")
    end = datetime.strptime(end_date, "%Y.%m.%d")
    delta = timedelta(days=1)

    all_links_by_office = {}  # links grouped by office code

    for office in news_offices:
        print(f"Fetching links for office: {office}")
        office_links = {}  # links of this office grouped by date
        current_start = start

        while current_start <= end:
            # Date of the day currently being searched
            current_date = current_start.strftime("%Y.%m.%d")
            compact_date = current_date.replace(".", "")

            # Naver search parameters; "ds"/"de" and "nso" restrict the
            # search to the single day being processed.
            params = {
                "where": "news",
                "query": query,
                "sm": "tab_opt",
                "sort": "0",
                "photo": "0",
                "field": "0",
                "pd": "3",
                "ds": current_date,
                "de": current_date,
                "mynews": "1",
                "office_type": "1",
                "office_section_code": "101",
                "news_office_checked": office,  # office code being searched
                "nso": f"so:r,p:from{compact_date}to{compact_date}",
            }

            daily_links = []  # links collected for this day

            for page in range(1, max_pages + 1):
                params["start"] = (page - 1) * 10  # result offset of this page
                # A timeout keeps one stalled connection from hanging the whole crawl.
                response = requests.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")

                # Keep only anchors that point at naver.com; anchors without
                # an href are skipped instead of raising KeyError.
                articles = soup.select("a.info")
                links = []
                for article in articles:
                    href = article.get("href")
                    if href and "naver.com" in href:
                        links.append(href)
                daily_links.extend(links)

                # Progress report
                print(f"{current_date} - Office {office} - Page {page} collected {len(links)} links.")

                # No result anchors at all: there is nothing further to page through.
                if not articles:
                    break

                # Pause before requesting the next page of the same day.
                if page < max_pages:
                    time.sleep(2)

            # Store the links of this day
            office_links[current_date] = daily_links

            # Move on to the next day
            current_start += delta

            # Delay between days to avoid hammering the server
            time.sleep(2)

        # Store the links of this office
        all_links_by_office[office] = office_links

    return all_links_by_office

In [2]:
# Search parameters
query = "조현아"
start_date = "2014.11.05"
end_date = "2014.12.19"

# Naver press-office IDs to search, one per newspaper
news_offices = [
    "1032",  # Kyunghyang Shinmun
    "1005",  # Kukmin Ilbo
    "1020",  # Dong-A Ilbo
    "1021",  # Munhwa Ilbo
    "1081",  # Seoul Shinmun
    "1022",  # Segye Ilbo
    "1023",  # Chosun Ilbo
    "1025",  # JoongAng Ilbo
    "1028",  # Hankyoreh
    "1469",  # Hankook Ilbo
]

max_pages = 1  # collect only one result page per date

# Run the crawl for every office over the whole date range
news_links_by_office = fetch_news_links_for_offices(
    query=query,
    start_date=start_date,
    end_date=end_date,
    news_offices=news_offices,
    max_pages=max_pages,
)

# Print everything that was collected, grouped by office and then by date
print("\n뉴스 오피스별 수집된 링크:")
for office in news_links_by_office:
    links_by_date = news_links_by_office[office]
    print(f"Office {office}:")
    for date in links_by_date:
        links = links_by_date[date]
        print(f"  {date} ({len(links)}개):")
        for link in links:
            print(f"    - {link}")

Fetching links for office: 1032
2014.11.05 - Office 1032 - Page 1 collected 0 links.
2014.11.06 - Office 1032 - Page 1 collected 0 links.
2014.11.07 - Office 1032 - Page 1 collected 0 links.
2014.11.08 - Office 1032 - Page 1 collected 0 links.
2014.11.09 - Office 1032 - Page 1 collected 0 links.
2014.11.10 - Office 1032 - Page 1 collected 0 links.
2014.11.11 - Office 1032 - Page 1 collected 0 links.
2014.11.12 - Office 1032 - Page 1 collected 0 links.
2014.11.13 - Office 1032 - Page 1 collected 0 links.
2014.11.14 - Office 1032 - Page 1 collected 0 links.
2014.11.15 - Office 1032 - Page 1 collected 0 links.
2014.11.16 - Office 1032 - Page 1 collected 0 links.
2014.11.17 - Office 1032 - Page 1 collected 0 links.
2014.11.18 - Office 1032 - Page 1 collected 0 links.
2014.11.19 - Office 1032 - Page 1 collected 0 links.
2014.11.20 - Office 1032 - Page 1 collected 0 links.
2014.11.21 - Office 1032 - Page 1 collected 0 links.
2014.11.22 - Office 1032 - Page 1 collected 0 links.
2014.11.23 - O

In [3]:
# Flatten the nested {office: {date: [links]}} mapping into one row per link.
data = []
for office, dates in news_links_by_office.items():
    for date, links in dates.items():
        for link in links:
            data.append({"Office": office, "Date": date, "Link": link})

# Declaring the columns explicitly keeps the frame usable (and the column
# lookups below valid) even when the crawl collected no links at all.
df = pd.DataFrame(data, columns=["Office", "Date", "Link"])

# Three-digit office code -> newspaper name
news_offices_dict = {'032': "경향신문", '005': "국민일보", '020': "동아일보", '021': "문화일보", '081': "서울신문", '022': "세계일보", '023': "조선일보",
    '025': "중앙일보", '028': "한겨레", '469': "한국일보"}

# Read the office code from the ".../article/<code>/..." part of the link
# instead of a fixed character offset, which only works for one exact URL
# prefix. When a link has no such segment, fall back to the last three digits
# of the office ID that was searched for (e.g. "1032" -> "032").
link_office_codes = df["Link"].str.extract(r"/article/(\d{3})/", expand=False)
requested_office_codes = df["Office"].astype(str).str[-3:]
df["Office"] = link_office_codes.fillna(requested_office_codes).map(news_offices_dict)

def extract_identity(link):
    """Build a short identity key for a Naver news link.

    The key is the first character of the host name followed by a
    10-character article identifier:

    * host starting with ``"m"``: ``"m"`` + the last 10 characters of the link.
    * host starting with ``"n"``: ``"n"`` + the last 10 characters of the URL
      path, so the result is the same whether or not a query string such as
      ``?sid=101`` is appended.

    Args:
        link: Article URL.

    Returns:
        The identity string, or ``"Unknown"`` when ``link`` is not a string,
        cannot be parsed, or its host starts with any other character.
    """
    from urllib.parse import urlsplit

    if not isinstance(link, str):
        return "Unknown"

    try:
        parts = urlsplit(link)
    except ValueError:
        return "Unknown"

    # First character of the host; empty when the link has no host part.
    host_initial = parts.netloc[:1]

    if host_initial == "m":
        return 'm' + link[-10:]
    if host_initial == "n":
        # Use the path only, so a missing or differently sized query string
        # cannot shift the slice onto the wrong characters.
        return 'n' + parts.path.rstrip("/")[-10:]
    return "Unknown"

# Parse the search start date and re-format it as YYYY-MM-DD for use in file names.
start_date_datetime = datetime.strptime(start_date, "%Y.%m.%d")
csv_friendly_date = start_date_datetime.strftime("%Y-%m-%d")

# Tag every link with its identity key.
identity_column = df["Link"].apply(extract_identity)
df["Identity"] = identity_column

# Write the link table next to the notebook.
csv_file_name = f"{query}_links_{csv_friendly_date}.csv"
df.to_csv(
    csv_file_name,
    encoding="utf-8",
    index=False,
)

In [4]:
# Load the link table written by the previous step.
input_csv = f"{query}_links_{csv_friendly_date}.csv"
links_df = pd.read_csv(input_csv)

# Seconds to wait for a single article request before giving up on it.
request_timeout = 30

# Parallel lists for successfully extracted articles
successful_offices = []
successful_dates = []
successful_urls = []
successful_contents = []

# Parallel lists for links that could not be processed
exception_offices = []
exception_dates = []
exception_urls = []
exception_errors = []

for _, row in links_df.iterrows():  # one row per collected link
    office = row["Office"]
    date = row["Date"]
    link = row["Link"]
    identity = row["Identity"]

    try:
        front = identity[0]

        # Only "n"-type links have a known article selector. Reject the others
        # before spending a request on them, and let the handler below record
        # them so they appear in the exceptions file instead of vanishing.
        if front != "n":
            raise ValueError(f"Unsupported link type (identity {identity!r})")

        response = requests.get(link, timeout=request_timeout)
        response.raise_for_status()  # Check for HTTP request errors
        soup = BeautifulSoup(response.text, 'lxml')

        article = soup.select_one("article#dic_area")

        if article:
            article_text = article.get_text(strip=True)
        else:
            # The page loaded but has no article body element; keep the row
            # with a placeholder so the URL is still listed.
            article_text = "Main content not found"

        # Append successful extraction data
        successful_offices.append(office)
        successful_dates.append(date)
        successful_urls.append(link)
        successful_contents.append(article_text)

    except Exception as e:
        # Best-effort crawl: record the failure and continue with the next link.
        exception_offices.append(office)
        exception_dates.append(date)
        exception_urls.append(link)
        exception_errors.append(str(e))
        print(f"Error processing link {link}: {e}")

# Create DataFrame for successful links
success_df = pd.DataFrame({
    "Office": successful_offices,
    "Date": successful_dates,
    "URL": successful_urls,
    "Content": successful_contents,
})

# Create DataFrame for exceptions
exceptions_df = pd.DataFrame({
    "Office": exception_offices,
    "Date": exception_dates,
    "URL": exception_urls,
    "Error": exception_errors
})

# Save both DataFrames to CSV files
success_csv = f"{query}_success_{csv_friendly_date}.csv"
exceptions_csv = f"{query}_exception_{csv_friendly_date}.csv"

success_df.to_csv(success_csv, index=False)
exceptions_df.to_csv(exceptions_csv, index=False)


print(f"Successfully extracted text to {success_csv}")
print(f"Exception links to {exceptions_csv}")

Error processing link https://n.news.naver.com/mnews/article/020/0002705146: 500 Server Error: Internal Server Error for url: https://n.news.naver.com/mnews/article/020/0002705146
Successfully extracted text to 조현아_success_2014-11-05.csv
Exception links to 조현아_exception_2014-11-05.csv


In [5]:
# Rebuild the two output paths so this cell can run without re-scraping.
success_csv, exceptions_csv = (
    f"{query}_success_{csv_friendly_date}.csv",
    f"{query}_exception_{csv_friendly_date}.csv",
)

# Load the extracted articles and the failed links back from disk.
success_df = pd.read_csv(filepath_or_buffer=success_csv)
exceptions_df = pd.read_csv(filepath_or_buffer=exceptions_csv)

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# Command-line switches passed to Chrome when the WebDriver starts it.
options = webdriver.ChromeOptions()
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

# Define a function to scrape comments from a single URL
def scrape_comments(url, browser=None, wait_seconds=3):
    """Collect the text of every ``.u_cbox_text_wrap`` element on a page.

    Args:
        url: Page to open.
        browser: WebDriver to use. When omitted, the module-level ``driver``
            is used, which keeps existing ``scrape_comments(url)`` calls working.
        wait_seconds: Seconds to sleep after loading the page, before the
            comment elements are looked up.

    Returns:
        The stripped comment texts joined with ``" | "``. An empty string is
        returned both when no matching element exists and when any error
        occurs; errors are printed rather than raised.
    """
    try:
        # Resolved inside the try block so a missing global driver is reported
        # like any other scraping error.
        active_browser = driver if browser is None else browser

        active_browser.get(url)
        time.sleep(wait_seconds)  # Allow page to load

        # Locate the comment elements
        comment_elements = active_browser.find_elements(By.CSS_SELECTOR, ".u_cbox_text_wrap")
        comments = [element.text.strip() for element in comment_elements]

        # Report only the count; printing every comment floods the cell output.
        print(f"Collected {len(comments)} comments")

        # Join comments with "|"
        return " | ".join(comments)

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""
    

In [8]:
# Initialize the Chrome WebDriver using WebDriver Manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Scrape comments for each URL and update the DataFrame
    for index, row in success_df.iterrows():
        print(f"Scraping comments for {row['URL']}...")
        comments = scrape_comments(row["URL"])
        success_df.at[index, "comments"] = comments

    # Save the updated DataFrame to a new CSV
    output_path = f"{query}_with_comments_{csv_friendly_date}.csv"
    success_df.to_csv(output_path, index=False)
finally:
    # Always shut the browser down, even when the loop fails or is interrupted
    # from the keyboard; otherwise the Chrome process is left running.
    driver.quit()

print(f"Scraping completed. Updated file saved to {output_path}")

Scraping comments for https://n.news.naver.com/mnews/article/032/0002551244?sid=101...


KeyboardInterrupt: 