---
# [NLP 활용] 언론사별 상위 10개 뉴스기사 요약본 안내

In [16]:
import requests
from bs4 import BeautifulSoup
# Server에 부하를 주지 않기 위한 Crawling 속도 제한용 Library Package
import time
import random
# Crawling 진행 상황을 체크하기 위한 Module
from tqdm import tqdm
import pandas as pd
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
from collections import Counter

# 뉴스 Crawling
def get_news_links_by_press (url) :
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
  response = requests.get(url, headers = headers)
  soup = BeautifulSoup(response.content, 'html.parser')
  
  press_data = {}
  press_sections = soup.select('.rankingnews_box')
  
  for press_section in tqdm(press_sections, desc = "언론사별 뉴스 크롤링") :
    press_name = press_section.select_one('.rankingnews_name').get_text(strip = True)
    news_links = set()  # 중복 제거를 위한 set 사용
    for item in press_section.select('li a') :
      title = item.get_text(strip = True)
      link = item['href']
      if title and link and "동영상" not in title :  # 타이틀이 존재하고 "동영상"이 포함되지 않은 경우에만 추가
        news_links.add((title, link))
    press_data[press_name] = list(news_links)[:5]  # 다시 list로 변환 후 상위 5개만 저장
    
    # 각 언론사별 뉴스 Crawling 후 대기 시간 추가
    time.sleep(random.uniform(0.5, 2.0))
  
  return press_data

# Data 전처리
def preprocess_text (text) :
  okt = Okt()
  # tokens = okt.morphs(text, stem = True)
  tokens = okt.nouns(text)  # 명사만 추출
  return ' '.join(tokens)

# 감성 분석 모델과 토크나이저를 미리 다운로드
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# 감성 분석 파이프라인 초기화
classifier = pipeline('sentiment-analysis', model = model, tokenizer = tokenizer)

# 감성 분석 함수
def sentiment_analysis (text) :
  return classifier(text)

# Topic 모델링
def topic_modeling(docs, num_topics = 5) :
  vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
  dtm = vectorizer.fit_transform(docs)
  lda = LatentDirichletAllocation(n_components = num_topics, random_state = 0)
  lda.fit(dtm)
  return lda, vectorizer

# 주요 Keyword 추출
# def extract_keywords (docs, num_keywords = 10) :
#   words = ' '.join(docs).split()
#   counter = Counter(words)
#   return counter.most_common(num_keywords)
def extract_keywords(docs, num_keywords = 10) :
  words = ' '.join(docs).split()
  counter = Counter(words)
  # 불용어 제거
  stop_words = set(stopwords.words('english'))
  keywords = [(word, freq) for word, freq in counter.most_common(num_keywords) if word not in stop_words and len(word) > 1]
  return keywords

# 메인 함수
def main () :
  base_url = 'https://news.naver.com/main/ranking/popularDay.naver'
  press_news_data = get_news_links_by_press(base_url)
  
  # 뉴스 DataFrame 생성
  news_list = []
  for press_name, news_data in press_news_data.items() :
    for title, link in news_data :
      news_list.append([press_name, title, link])
  df = pd.DataFrame(news_list, columns=['Press', 'Title', 'Link'])
  
  # Data 전처리
  df['Processed_Title'] = df['Title'].apply(preprocess_text)
  
  # 감성 분석
  df['Sentiment'] = df['Title'].apply(lambda x: sentiment_analysis(x)[0]['label'])
  
  # Topic 모델링
  lda, vectorizer = topic_modeling(df['Processed_Title'].tolist())
  topics = lda.components_
  feature_names = vectorizer.get_feature_names_out()
  topic_keywords = []
  for topic_weights in topics :
    top_keywords = [feature_names[i] for i in topic_weights.argsort()[:-11:-1]]
    topic_keywords.append(top_keywords)
  
  # 주요 Keyword 추출
  keywords = extract_keywords(df['Processed_Title'].tolist())
  
  # 결과 출력
  print("오늘의 주요 키워드:")
  for keyword, freq in keywords :
    print(f"{keyword}: {freq}")
  
  print("\n추천 뉴스:")
  for idx, row in df.head(10).iterrows() :  # 상위 10개의 뉴스만 출력
    print(f"{idx + 1}. {row['Title']} ({row['Sentiment']})")
    print(f"   Link: {row['Link']}")

if __name__ == '__main__' :
  main()

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
언론사별 뉴스 크롤링: 100%|██████████| 81/81 [01:46<00:00,  1.31s/it]


오늘의 주요 키워드:
사망: 21
중대장: 16
입건: 16
조사: 14
정부: 14
사건: 14
훈련병: 13
경찰: 12

추천 뉴스:
1. “학교 X 같은 사람”…대학축제 무대서 욕한 비비 사과, 무슨일이 (POSITIVE)
   Link: https://n.news.naver.com/article/009/0005316389?ntype=RANKING
2. “브레이크 고장났다”…경주서 페라리 몰다 앞차와 추돌한 日괴짜부호 (POSITIVE)
   Link: https://n.news.naver.com/article/009/0005316276?ntype=RANKING
3. “우리 아빠가 윤석열 나쁜 사람이래요”…조국이 전한 강원도 민심? (POSITIVE)
   Link: https://n.news.naver.com/article/009/0005316302?ntype=RANKING
4. “몰래 낳았는데 울면 들킬까봐”...신생아 밟아 죽인 비정한 20대 미혼모 (POSITIVE)
   Link: https://n.news.naver.com/article/009/0005316372?ntype=RANKING
5. “처벌 안받아도 평생 반성”...임창정, 주가조작 연루 무혐의 심경 (POSITIVE)
   Link: https://n.news.naver.com/article/009/0005316338?ntype=RANKING
6. 비트코인 박스권 탈출, 마지막 고비?… 이번주 美 FOMC·5월 CPI 발표 [DD주간브리핑] (NEGATIVE)
   Link: https://n.news.naver.com/article/138/0002175012?ntype=RANKING
7. [DD's톡] 연일 지붕 뚫고 하이킥…삼양식품·카페24 질주 이유 들여다보니 (POSITIVE)
   Link: https://n.news.naver.com/article/138/0002174982?ntype=RANKING
8. 디도스 때문에 못 살겠네… T1의 간절한 SOS [e모션] (NEG