In [1]:
!pip install gensim



In [2]:
# module import
import numpy as np
import re
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel as LDA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# 경로 설정
root_path = "/content/drive/My Drive/멀티캠퍼스/[혁신성장] 인공지능 자연어처리 기반/[강의]/조성현 강사님"
data_path = f"{root_path}/dataset"

In [4]:
# 데이터 로드
with open(f"{data_path}/news.data", 'rb') as f:
    news_data = pickle.load(f)

In [6]:
# 데이터 조회
news = news_data.data
print(len(news))
print(news[0])

# 토픽 확인
print(len(news_data.target_names))
print(news_data.target_names)

11314
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.

20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.f

## 전처리: 달라지는 부분 주의
* `"".join` 없음.

In [8]:
# 전처리
news_1 = []
news_2 = []

# 1) 영문자 제외 모두 제거
for doc in news:
    temp = re.sub('[^a-zA-Z]', ' ', doc)
    news_1.append(temp)

# 2) 불용어 제거, 소문자 변환, 길이 3 이하인 단어 제거
stopwords_list = stopwords.words('english')
for doc in news_1:
    temp = []
    for w in doc.split():
        w = w.lower()
        if len(w) > 3 and w not in stopwords_list:
            temp.append(w)
    news_2.append(temp)

# 전처리 결과 확인
print(news_2[0])

['well', 'sure', 'story', 'seem', 'biased', 'disagree', 'statement', 'media', 'ruin', 'israels', 'reputation', 'rediculous', 'media', 'israeli', 'media', 'world', 'lived', 'europe', 'realize', 'incidences', 'described', 'letter', 'occured', 'media', 'whole', 'seem', 'ignore', 'subsidizing', 'israels', 'existance', 'europeans', 'least', 'degree', 'think', 'might', 'reason', 'report', 'clearly', 'atrocities', 'shame', 'austria', 'daily', 'reports', 'inhuman', 'acts', 'commited', 'israeli', 'soldiers', 'blessing', 'received', 'government', 'makes', 'holocaust', 'guilt', 'away', 'look', 'jews', 'treating', 'races', 'power', 'unfortunate']


# GENSIM 사용


# `bow` 생성
- `gensim.corpora.Dictionary`로 어휘 집합 생성.
- 어휘 집합 생성 후 `items()`를 리스트로 바꾸어 단어 확인.
- 어휘 집합에 있는 단어 인덱스 순서에 따라 `news_2`를 수치로 바꿈. 

In [15]:
# doc2bow 생성
vocab = corpora.Dictionary(news_2) # doc2bow 생성: <gensim.corpora.dictionary.Dictionary at 0x7f115af63ac8>
print(dict(list(vocab.items())[:10])) # 확인: ItemsView(<gensim.corpora.dictionary.Dictionary object at 0x7f115af63ac8>) -> 리스트
news_bow = [vocab.doc2bow(s) for s in news_2] # 어휘 집합에 있는 순서에 따라 수치화.
print(news_bow[0]) # '왜 다 1로 나온담?

{0: 'acts', 1: 'atrocities', 2: 'austria', 3: 'away', 4: 'biased', 5: 'blessing', 6: 'clearly', 7: 'commited', 8: 'daily', 9: 'degree'}
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)]


## LDA 모델 생성
- `id2word` :
- `.get_document_topics`: 

In [18]:
# LDA
model = LDA(news_bow,
            num_topics = len(news_data.target_names),
            id2word=vocab) # id를 어휘집합에 따라 바꿔 줌.

In [23]:
# 문서별 토픽 번호 확인
doc_topic = model.get_document_topics(news_bow) # <gensim.interfaces.TransformedCorpus at 0x7f115e9fcb70>
for i in range(10):
    topics = np.array(doc_topic[i]) # 확률로 반환
    print(topics)
    most_likely_topic = int(topics[np.argmax(topics[:, 1]), 0]) # 1열에서 가장 큰 애들 찾아서 0열의 번호 반환하야 함.
    print("문서-{:d} : topic-{:d}".format(i+1, most_likely_topic))
    print("")

[[ 3.          0.01873743]
 [ 8.          0.29867086]
 [ 9.          0.27710629]
 [11.          0.2655732 ]
 [17.          0.12781543]]
문서-1 : topic-8

[[2.         0.2011117 ]
 [4.         0.42206389]
 [5.         0.03355008]
 [8.         0.32376215]]
문서-2 : topic-4

[[ 6.          0.03001015]
 [ 8.          0.5039615 ]
 [11.          0.03646209]
 [16.          0.0422205 ]
 [17.          0.2336518 ]
 [18.          0.14240363]]
문서-3 : topic-8

[[ 1.          0.11946963]
 [ 6.          0.22843014]
 [ 7.          0.02014613]
 [ 8.          0.17014597]
 [ 9.          0.19978833]
 [10.          0.0559863 ]
 [11.          0.05347284]
 [12.          0.14360546]]
문서-4 : topic-6

[[ 4.          0.37526175]
 [10.          0.44490704]
 [13.          0.1483497 ]]
문서-5 : topic-10

[[ 2.          0.48178819]
 [17.          0.48071182]]
문서-6 : topic-2

[[2.         0.47088975]
 [4.         0.52444702]]
문서-7 : topic-4

[[ 7.          0.01886694]
 [ 8.          0.33684409]
 [11.          0.57803857]
 

In [28]:
# topic-term 행렬
topic_term = model.get_topic_terms(0, topn=10)
print(topic_term)

for i in range(len(news_data.target_names)):
    topic_term = model.get_topic_terms(i, topn=10)
    indices = [idx for idx, score in topic_term]
    print("topic-{:2d} : ".format(i+1))
    for idx in indices:
        print("       ", "{:s}".format(vocab[idx]))
    print("")

[(1565, 0.010290465), (2561, 0.0070173787), (32, 0.00603439), (16639, 0.0045817285), (1019, 0.004498456), (3514, 0.004433424), (130, 0.0042476123), (6590, 0.0042437133), (2569, 0.0037505464), (631, 0.0035884439)]
topic- 1 : 
        period
        play
        power
        scorer
        first
        goal
        would
        puck
        second
        back

topic- 2 : 
        would
        sabbath
        easter
        doctor
        cancer
        know
        tapes
        also
        please
        receiver

topic- 3 : 
        christian
        also
        truth
        christianity
        christians
        think
        science
        jesus
        objective
        like

topic- 4 : 
        nrhj
        bxom
        wwiz
        gizw
        pitt
        tbxn
        bxlt
        gordon
        wmbxn
        banks

topic- 5 : 
        team
        game
        year
        games
        season
        players
        last
        hockey
        league
        play

to

In [29]:
# 문서별 분류 코드 확인
def check_topic(x, y):
    print("문서 %d의 topic: %s" %(x, news_data.target_names[news_data.target[x]]))
    print("문서 %d의 topic: %s" %(y, news_data.target_names[news_data.target[y]]))

check_topic(0, 314)
check_topic(11300, 2)
check_topic(66, 1)

문서 0의 topic: talk.politics.mideast
문서 314의 topic: sci.electronics
문서 11300의 topic: alt.atheism
문서 2의 topic: talk.politics.mideast
문서 66의 topic: comp.windows.x
문서 1의 topic: alt.atheism


In [17]:
LDA?

In [12]:
list(vocab.items())

[(0, 'acts'),
 (1, 'atrocities'),
 (2, 'austria'),
 (3, 'away'),
 (4, 'biased'),
 (5, 'blessing'),
 (6, 'clearly'),
 (7, 'commited'),
 (8, 'daily'),
 (9, 'degree'),
 (10, 'described'),
 (11, 'disagree'),
 (12, 'europe'),
 (13, 'europeans'),
 (14, 'existance'),
 (15, 'government'),
 (16, 'guilt'),
 (17, 'holocaust'),
 (18, 'ignore'),
 (19, 'incidences'),
 (20, 'inhuman'),
 (21, 'israeli'),
 (22, 'israels'),
 (23, 'jews'),
 (24, 'least'),
 (25, 'letter'),
 (26, 'lived'),
 (27, 'look'),
 (28, 'makes'),
 (29, 'media'),
 (30, 'might'),
 (31, 'occured'),
 (32, 'power'),
 (33, 'races'),
 (34, 'realize'),
 (35, 'reason'),
 (36, 'received'),
 (37, 'rediculous'),
 (38, 'report'),
 (39, 'reports'),
 (40, 'reputation'),
 (41, 'ruin'),
 (42, 'seem'),
 (43, 'shame'),
 (44, 'soldiers'),
 (45, 'statement'),
 (46, 'story'),
 (47, 'subsidizing'),
 (48, 'sure'),
 (49, 'think'),
 (50, 'treating'),
 (51, 'unfortunate'),
 (52, 'well'),
 (53, 'whole'),
 (54, 'world'),
 (55, 'accept'),
 (56, 'actually'),
 (57