In [9]:
# module import
import numpy as np
import re
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# 경로 설정
root_path = "/content/drive/My Drive/멀티캠퍼스/[혁신성장] 인공지능 자연어처리 기반/[강의]/조성현 강사님"
data_path = f"{root_path}/dataset"

# _1_. 데이터

In [3]:
# 데이터 로드
with open(f"{data_path}/news.data", 'rb') as f:
    news_data = pickle.load(f)

In [4]:
dir(news_data)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [6]:
# 데이터 확인
news = news_data.data
print(len(news))
print(news[0])
print('')

# 토픽 확인
print(len(news_data.target_names))
print(news_data.target_names)

11314
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.


20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.

In [10]:
# 전처리
news_1 = []
news_2 = []


# 1) 영문자 아닌 것 제외
for doc in news:
    news_1_temp = re.sub("[^a-zA-Z]", " ", doc)
    news_1.append(news_1_temp)

# 2) 불용어 제거, 소문자 변환, 길이 3이하 단어 제거
stopwords_list = stopwords.words('english')
for doc in news:
    news_2_temp = [] # 임시로 보관할 리스트
    for w in doc.split():
        w = w.lower()
        if len(w) > 3 and w not in stopwords_list:
            news_2_temp.append(w)
    news_2.append(" ".join(news_2_temp))

# 데이터 확인
print(news_2[0])

well sure story seem biased. disagree statement u.s. media ruin israels reputation. rediculous. u.s. media pro-israeli media world. lived europe realize incidences described letter occured. u.s. media whole seem ignore them. u.s. subsidizing israels existance europeans least degree). think might reason report clearly atrocities. shame austria, daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away. all, look jews treating races power. unfortunate.


# _2_. Tfidf matrix

In [11]:
# tfidf 행렬
tf_vector = TfidfVectorizer(max_features=500) # 500개 사용
tfidf_matrix = tf_vector.fit_transform(news_2)
print(tfidf_matrix.shape) # sparse matrix

(11314, 500)


In [12]:
# 어휘 목록 확인
vocab_list = tf_vector.get_feature_names()
print(len(vocab_list)) # 500개
print(vocab_list[:30]) # 30개 확인

500
['00', '000', '0t', '10', '11', '12', '14', '145', '15', '16', '17', '1992', '1993', '1d9', '1t', '20', '24', '25', '2di', '30', '34u', '3t', '50', '75u', '93', 'a86', 'able', 'access', 'actually', 'address']


# _3_. LDA
* document-topic 분포 반환
* iteration 횟수 조정: 수렴하지 않은 경우 `max_iter` 증가.
* 평가 지표: perplexity

In [13]:
# LDA 모델 생성
lda_model = LDA(n_components=len(news_data.target_names),
                learning_method='online',
                evaluate_every=5,
                max_iter=1000,
                verbose=1)

In [15]:
# tf-idf 모델 학습 후 토픽 반환
doc_topics = lda_model.fit_transform(tfidf_matrix)

iteration: 1 of max_iter: 1000
iteration: 2 of max_iter: 1000
iteration: 3 of max_iter: 1000
iteration: 4 of max_iter: 1000
iteration: 5 of max_iter: 1000, perplexity: 1520.1812
iteration: 6 of max_iter: 1000
iteration: 7 of max_iter: 1000
iteration: 8 of max_iter: 1000
iteration: 9 of max_iter: 1000
iteration: 10 of max_iter: 1000, perplexity: 1519.6199
iteration: 11 of max_iter: 1000
iteration: 12 of max_iter: 1000
iteration: 13 of max_iter: 1000
iteration: 14 of max_iter: 1000
iteration: 15 of max_iter: 1000, perplexity: 1519.5822


# _4_. 결과 확인

In [18]:
# 문서별 토픽 번호 확인: 10개까지만 확인
for i in range(10):
    print(f"문서 번호={i}, 할당된 주제={np.argmax(doc_topics[i:(i+1), :][0])}")

문서 번호=0, 할당된 주제=18
문서 번호=1, 할당된 주제=18
문서 번호=2, 할당된 주제=18
문서 번호=3, 할당된 주제=13
문서 번호=4, 할당된 주제=13
문서 번호=5, 할당된 주제=18
문서 번호=6, 할당된 주제=1
문서 번호=7, 할당된 주제=18
문서 번호=8, 할당된 주제=15
문서 번호=9, 할당된 주제=18


In [22]:
# 토픽-단어 행렬에서 토픽 별로 중요 단어 10개 표시
topic_term_matrix = lda_model.components_
print(topic_term_matrix)
print(topic_term_matrix.shape) # VT에 해당

for i in range(len(topic_term_matrix)):
    indices = np.flipud(topic_term_matrix[i].argsort())[:10] # 큰 순서대로 정렬 후 10개 인덱스 반환
    print(f"토픽={i+1}")
    for idx in indices:
        print(f"   {vocab_list[idx]}")
    print("")

[[1.15081489e+00 5.00000061e-02 5.00000001e-02 ... 1.77168418e+00
  6.72926881e+00 8.43133681e+00]
 [5.00000003e-02 5.00000002e-02 5.00000001e-02 ... 5.00000152e-02
  5.00000087e-02 5.00000019e-02]
 [5.00000391e-02 5.00000010e-02 5.00000002e-02 ... 5.00000010e-02
  5.00000009e-02 5.00000003e-02]
 ...
 [5.00000012e-02 5.00000015e-02 5.00000002e-02 ... 2.85823527e+01
  2.17984571e+01 1.84411263e+01]
 [5.00000922e-02 5.52691217e+00 5.00000001e-02 ... 3.83518716e+01
  2.87581056e+01 6.26235471e+01]
 [6.53082624e+01 5.00000016e-02 5.00000001e-02 ... 5.00000004e-02
  5.00000460e-02 5.00000103e-02]]
(20, 500)
토픽=1
   file
   program
   files
   windows
   window
   com
   edu
   data
   using
   image

토픽=2
   would
   thought
   much
   says
   thing
   look
   sure
   out
   might
   like

토픽=3
   driver
   speed
   drive
   heard
   side
   see
   sound
   output
   code
   rate

토픽=4
   sound
   games
   graphics
   video
   for
   already
   drivers
   day
   window
   four

토픽=5
   driv

In [26]:
# 문서별 분류 코드 예시로 확인
def check_topic(x, y):
    print(f"문서 {x}의 topic = {news_data.target_names[news_data.target[x]]}")
    print(f"문서 {y}의 topic = {news_data.target_names[news_data.target[y]]}")

In [27]:
check_topic(1, 5)
check_topic(0, 2)
check_topic(131, 411)

문서 1의 topic = alt.atheism
문서 5의 topic = soc.religion.christian
문서 0의 topic = talk.politics.mideast
문서 2의 topic = talk.politics.mideast
문서 131의 topic = comp.sys.ibm.pc.hardware
문서 411의 topic = misc.forsale
