---
# [NLP 활용] 언론사별 상위 10개 뉴스기사 요약본 안내

In [5]:
# !pip install transformers
!pip install tf_keras

Collecting tf_keras
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: tf_keras
Successfully installed tf_keras-2.16.0


In [6]:
import requests
from bs4 import BeautifulSoup
# Server에 부하를 주지 않기 위한 Crawling 속도 제한용 Library Package
import time
import random
# Crawling 진행 상황을 체크하기 위한 Module
from tqdm import tqdm
import pandas as pd
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
from collections import Counter

# 뉴스 Crawling
def get_news_links_by_press (url) :
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
  response = requests.get(url, headers = headers)
  soup = BeautifulSoup(response.content, 'html.parser')
  
  press_data = {}
  press_sections = soup.select('.rankingnews_box')
  
  for press_section in tqdm(press_sections, desc = "언론사별 뉴스 크롤링") :
    press_name = press_section.select_one('.rankingnews_name').get_text(strip = True)
    news_links = set()  # 중복 제거를 위한 set 사용
    for item in press_section.select('li a') :
      title = item.get_text(strip = True)
      link = item['href']
      if title and link and "동영상" not in title :  # 타이틀이 존재하고 "동영상"이 포함되지 않은 경우에만 추가
        news_links.add((title, link))
    press_data[press_name] = list(news_links)[:5]  # 다시 list로 변환 후 상위 5개만 저장
    
    # 각 언론사별 뉴스 Crawling 후 대기 시간 추가
    time.sleep(random.uniform(0.5, 2.0))
  
  return press_data

# Data 전처리
def preprocess_text (text) :
  okt = Okt()
  # tokens = okt.morphs(text, stem = True)
  tokens = okt.nouns(text)  # 명사만 추출
  return ' '.join(tokens)

# 감성 분석 모델과 토크나이저를 미리 다운로드
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# 감성 분석 파이프라인 초기화
classifier = pipeline('sentiment-analysis', model = model, tokenizer = tokenizer)

# 감성 분석 함수
def sentiment_analysis (text) :
  return classifier(text)

# Topic 모델링
def topic_modeling(docs, num_topics = 5) :
  vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
  dtm = vectorizer.fit_transform(docs)
  lda = LatentDirichletAllocation(n_components = num_topics, random_state = 0)
  lda.fit(dtm)
  return lda, vectorizer

# 주요 Keyword 추출
# def extract_keywords (docs, num_keywords = 10) :
#   words = ' '.join(docs).split()
#   counter = Counter(words)
#   return counter.most_common(num_keywords)
def extract_keywords(docs, num_keywords = 10) :
  words = ' '.join(docs).split()
  counter = Counter(words)
  # 불용어 제거
  stop_words = set(stopwords.words('english'))
  keywords = [(word, freq) for word, freq in counter.most_common(num_keywords) if word not in stop_words and len(word) > 1]
  return keywords

# 메인 함수
def main () :
  base_url = 'https://news.naver.com/main/ranking/popularDay.naver'
  press_news_data = get_news_links_by_press(base_url)
  
  # 뉴스 DataFrame 생성
  news_list = []
  for press_name, news_data in press_news_data.items() :
    for title, link in news_data :
      news_list.append([press_name, title, link])
  df = pd.DataFrame(news_list, columns=['Press', 'Title', 'Link'])
  
  # Data 전처리
  df['Processed_Title'] = df['Title'].apply(preprocess_text)
  
  # 감성 분석
  df['Sentiment'] = df['Title'].apply(lambda x: sentiment_analysis(x)[0]['label'])
  
  # Topic 모델링
  lda, vectorizer = topic_modeling(df['Processed_Title'].tolist())
  topics = lda.components_
  feature_names = vectorizer.get_feature_names_out()
  topic_keywords = []
  for topic_weights in topics :
    top_keywords = [feature_names[i] for i in topic_weights.argsort()[:-11:-1]]
    topic_keywords.append(top_keywords)
  
  # 주요 Keyword 추출
  keywords = extract_keywords(df['Processed_Title'].tolist())
  
  # 결과 출력
  print("오늘의 주요 키워드:")
  for keyword, freq in keywords :
    print(f"{keyword}: {freq}")
  
  print("\n추천 뉴스:")
  for idx, row in df.head(10).iterrows() :  # 상위 10개의 뉴스만 출력
    print(f"{idx + 1}. {row['Title']} ({row['Sentiment']})")
    print(f"   Link: {row['Link']}")

if __name__ == '__main__' :
  main()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

2024-06-12 11:17:33.994085: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-12 11:17:33.994112: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-12 11:17:33.994122: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-12 11:17:33.994343: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-12 11:17:33.994355: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch m

오늘의 주요 키워드:
지진: 62
부안: 45
전북: 41
규모: 39
속보: 17
최대: 16
올해: 15
물풍선: 14

추천 뉴스:
1. "싼값에 한 봉지 가득 담아간다" 하루 2000개씩 팔리는 '1000원 빵'[르포] (POSITIVE)
   Link: https://n.news.naver.com/article/277/0005430436?ntype=RANKING
2. "침대가 흔들려 깼어"…부안 지진에 누리꾼들도 화들짝 (POSITIVE)
   Link: https://n.news.naver.com/article/277/0005430575?ntype=RANKING
3. "제발 살아달라" 필사적으로 매달린 여고생…소중한 생명 구했다 (POSITIVE)
   Link: https://n.news.naver.com/article/277/0005430037?ntype=RANKING
4. "운동하면서 용돈도 벌어요"…'갓생' 사는 80세 노신사[배달의청춘]② (POSITIVE)
   Link: https://n.news.naver.com/article/277/0005430443?ntype=RANKING
5. 밀양 사건 폭로 유튜버, 생사람 잡았다…6번째 지목男 "난 가해자 아냐" (POSITIVE)
   Link: https://n.news.naver.com/article/277/0005430389?ntype=RANKING
6. [단독]"月300만원 이상도 번다" 서울 구직 시니어 1만명 돌파 (POSITIVE)
   Link: https://n.news.naver.com/article/025/0003366286?ntype=RANKING
7. 고준희, 버닝썬 루머에 "그 쌍X의 새X들…소속사도 방치하더라" (POSITIVE)
   Link: https://n.news.naver.com/article/025/0003366274?ntype=RANKING
8. "창문 깨졌다" "벽 갈라져"…부안 지진, 경기까지 흔들렸다 (POSITIVE)
   Link: https