In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#맥, 윈도우즈, 야구, 하키, 중동, 기독교, 전자공학, 의학 8개 주제 추출
cats = ['comp.sys.mac.hardware','comp.windows.x','rec.sport.baseball','rec.sport.hockey','talk.politics.mideast','soc.religion.christian','sci.electronics','sci.med']

#cats 변수로 기재된 카테고리만 추출
news_df = fetch_20newsgroups(subset='all',remove=('headers','footers','quotos'),categories = cats,random_state=0)

#Count기반의 벡터화만 적용
count_vect = CountVectorizer(max_df=0.95,max_features=1000,min_df=2,stop_words='english',ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape : ',feat_vect.shape)

CountVectorizer Shape :  (7855, 1000)


In [9]:
lda = LatentDirichletAllocation(n_components=8,random_state=0)
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

In [10]:
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[2.32217733e-01, 4.11438557e+02, 1.25035456e-01, ...,
        8.81201292e+01, 9.68459933e+01, 9.40383506e+01],
       [1.30826913e+01, 5.87463680e+01, 6.37588808e-01, ...,
        1.16637516e+02, 4.77781900e+00, 3.86140225e+01],
       [1.25252491e-01, 3.38089657e+01, 1.25075672e-01, ...,
        1.73230457e+02, 3.59909294e-01, 1.17096028e+02],
       ...,
       [3.03998314e+00, 1.25245079e-01, 1.29382411e-01, ...,
        4.33503407e+01, 1.25140780e-01, 1.13061729e+01],
       [4.72579443e+02, 1.27778622e+02, 1.84858156e+02, ...,
        1.25064941e-01, 1.68217368e+02, 5.30354897e+01],
       [1.16127319e+02, 4.40309406e+00, 2.86989052e+01, ...,
        3.69411693e+01, 1.27954709e-01, 1.11094297e+01]])

In [11]:
def display_topics(model,feature_names,no_top_words):
    for topic_index,topic in enumerate(model.components_):
        print('Topic #',topic_index)
        
        #components_array에서 가장 값이 큰 순으로 정렬했을 때, 그 값의 array 인덱스 반환
        topic_word_indexes = topic.argsort()[::-1]
        top_indexes = topic_word_indexes[:no_top_words]
        
        #top_indexes 대상인 인덱스 별로 feature_names에 해당하는 word feature 추출 후 join으로 concat
        feature_concat = ' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
        

#CountVectorizer 객체 내의 전체 word의 명칭을 get_features_names()를 통해 추출
feature_names = count_vect.get_feature_names()

#토픽별로 가장 연관도 높은 word 15개씩 추출
display_topics(lda,feature_names,15)

Topic # 0
people israel armenian jews said armenians turkish israeli jewish arab war government did killed turkey
Topic # 1
use know like does just don good used problem time ve work power need medical
Topic # 2
game team year don think like just good games time players play writes better season
Topic # 3
god people jesus believe church think christ does say christian know don christians just bible
Topic # 4
edu writes article don ca know cs just writes article think like edu writes com ve does
Topic # 5
edu com writes article mit ac mit edu uk uiuc subject uiuc edu hp mail sun available
Topic # 6
10 25 12 11 15 16 14 20 13 17 18 30 19 00 55
Topic # 7
window use file program server mac windows software using dos display motif set version application
