In [1]:
import konlpy
from konlpy.tag import Okt
import warnings
import pandas as pd
import numpy as np
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
import urllib
import requests
import warnings
from bs4 import BeautifulSoup
from time import sleep
warnings.filterwarnings('ignore')

# [3월 18일]
---

## # 네이버 뉴스 4개 카테고리 분석
---
- 뉴스기사 카테고리별 저장(파일 도는 DB)
- 텍스트 전처리 및 피처 벡터화
- 모델링 및 평가(성능개선 포함)
- 참조 : https://wikidocs.net/74715

In [146]:
url = 'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1=100#&date=%2000:00:00&page=1'
html = urllib.request.urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

soup.find_all('div', class_ = ['section_body'])

[<div class="section_body" id="section_body"></div>]

In [132]:
path = 'D:\workspace\python\selenium_tool\chromedriver.exe'
driver = webdriver.Chrome(path)

article_politics = []
article_economy = []
article_society = []
article_science = []
for cat in [100, 101, 102, 105]:
    for page in range(1, 26):
        url = f'https://news.naver.com/main/main.naver?mode=LSD&mid=shm&sid1={cat}#&date=%2000:00:00&page={page}'
        driver.get(url)
        sleep(2)
        for i in range(1, 5):
            try:
                for j in range(1, 6):
                    driver.find_element_by_xpath(f'//*[@id="section_body"]/ul[{i}]/li[{j}]/dl/dt[2]/a').send_keys(Keys.ENTER)
    #                 sleep(0.7)
                    article = driver.find_element_by_xpath('//*[@id="articleBodyContents"]')
                    a = re.sub('[^a-zA-Z가-힣\s]+', '', article.text)
                    b = re.sub('[\n]+', ' ', a)
                    if cat == 100: article_politics.append(b)
                    elif cat == 101: article_economy.append(b)
                    elif cat == 102: article_society.append(b)
                    elif cat == 105: article_science.append(b)    
                    driver.back()
    #                 sleep(0.7)
            except:
                pass
            
driver.close()

In [138]:
len(article_politics), len(article_economy), len(article_society), len(article_science)

(500, 500, 500, 500)

In [133]:
politics_df = pd.DataFrame(article_politics, columns = ['article'])
politics_df['target'] = 0
economy_df = pd.DataFrame(article_economy, columns = ['article'])
economy_df['target'] = 1
society_df = pd.DataFrame(article_society, columns = ['article'])
society_df['target'] = 2
science_df = pd.DataFrame(article_science, columns = ['article'])
science_df['target'] = 3

politics_df.to_csv('politics_df.csv')
economy_df.to_csv('economy_df.csv')
society_df.to_csv('society_df.csv')
science_df.to_csv('science_df.csv')

In [2]:
politics_df = pd.read_csv('politics_df.csv', index_col = 0)
economy_df = pd.read_csv('economy_df.csv', index_col = 0)
society_df = pd.read_csv('society_df.csv', index_col = 0)
science_df = pd.read_csv('science_df.csv', index_col = 0)

In [3]:
article_df = pd.concat((politics_df, economy_df, society_df, science_df)).reset_index(drop = True)
display(article_df.head())

X_train, X_test, y_train, y_test = train_test_split(article_df['article'], article_df['target'], test_size = 0.25, stratify = article_df['target'])

Unnamed: 0,article,target
0,이번 대선에서 국민의힘 윤석열 대통령 당선인이 이상을 득표한 해운대구와 수영구 금...,0
1,동영상 뉴스 앵커 청와대가 아닌 곳에서 업무를 시작한다는 데 방점을 뒀지만 광화문 ...,0
2,윤 당선인 집무실 용산 이전남은 문제는 경향신문 청와대 바라보는 나들이 나온 시민들...,0
3,앵커 정치부 기자와 몇 가지 더 짚어보겠습니다 정성호 기자 나와 있습니다 취임 첫날...,0
4,조감도 가져다 달라기자 질문에 직접 답한 이데일리 권혜미 기자 윤석열 대통령 당선...,0


In [4]:
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

okt = Okt()

def tw_tokenizer(text):
    tokens_ko = okt.morphs(text)
    return tokens_ko

tfidf_vect = TfidfVectorizer(tokenizer = tw_tokenizer, ngram_range = (1, 2), min_df = 3, max_df = 0.9)
tfidf_vect.fit(X_train)
tfidf_matrix_train = tfidf_vect.transform(X_train)
tfidf_matrix_test = tfidf_vect.transform(X_test)

In [5]:
# 로지스틱레그레션의 최적 파라미터 찾기

lg_clf = LogisticRegression(random_state = 0, n_jobs = -1)

params = {'C': [1, 3.5, 4.5, 5.5, 10]}

grid_cv = GridSearchCV(lg_clf, param_grid = params, cv = 3, scoring = 'accuracy', verbose = 0)
grid_cv.fit(tfidf_matrix_train, y_train)
print('최적 파라미터 :', grid_cv.best_params_, '/ 최적 파라미터의 정확도 :', round(grid_cv.best_score_, 4))

최적 파라미터 : {'C': 4.5} / 최적 파라미터의 정확도 : 0.8393


In [6]:
best_estimator = grid_cv.best_estimator_
pred = best_estimator.predict(tfidf_matrix_test)
pred_proba = best_estimator.predict_proba(tfidf_matrix_test)

def get_clf_eval(y_test, pred, pred_proba):
    accuracy = accuracy_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba, multi_class = 'ovo')
    print(f'정확도 : {round(accuracy, 3)}, \nAUC : {round(auc, 3)}')

get_clf_eval(y_test, pred, pred_proba)

정확도 : 0.83, 
AUC : 0.961


In [8]:
# 파이프라인 사용

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(tokenizer = tw_tokenizer, ngram_range = (1, 2), min_df = 3, max_df = 0.9)),
    ('lr_clf', LogisticRegression(random_state = 0, C = 4.5, n_jobs = -1))
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
pred_proba = pipeline.predict_proba(X_test)

get_clf_eval(y_test, pred, pred_proba)

정확도 : 0.83, 
AUC : 0.961
