## 토픽모델링 : 문서집합의 주제 찾기
* 사람은 문서를 읽고 문장으로 요약하는 반면, ML 기반 토픽 모델링은 주제를 효과적으로 표현하는 중심 단어를 추출함
* LSA(Latent Semantic Analysis), LDA(Latent Dirichlet Allocation)
 cf) 차원축소의 LDA(Linear Discriminant Analysis)와는 다름
* 20뉴스그룹 데이터 사용 -> count기반 벡터화(fit_transform) -> lda.fit =>
  - components_속성 (개별 토픽별 각 word 피처가 얼마나 많이 그 토픽에 할당됐는지에 대한 수치)
* cf) 8-3 CountVectorizer() 피처벡터화에서는  
    - 테스트데이터의 피처벡터화는 학습데이터를 이용해 fit()이 수행된 CountVectorizer객체를 이용해 테스트데이터를 변환(transform)해야 함
      ==> 학습시 설정된 CountVectorizer의 피처개수와 테스트데이터를 CountVectorizer로 변환할 피처개수가 같아짐
    - 테스트데이터에는 fit_transform()을 사용하면 안 됨. 학습데이터로 fit()된 cnt_vect.transform()만 이용해 변환
      ==> fit_transform()을 쓰면 테스트 데이터기반으로 다시 CountVectorizer가 fit()수행하고 transform() 하기 때문에 
          학습시 사용된 피처개수와 예측시 사용할 피처개수가 달라짐

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Restrict the corpus to eight newsgroups: motorcycles, baseball, graphics,
# X Window, Middle East politics, Christianity, electronics and medicine.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Load only the newsgroups named in cats by passing it as the categories
# argument of fetch_20newsgroups(); remove= asks the loader to strip the
# headers, footers and quotes from each post.
news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=0,
)

In [2]:
# news_df is a Bunch: it exposes keys() like a dictionary and has no .shape
# attribute (print(news_df.shape) raises an error).
print(type(news_df))
print(news_df.keys(), end='\n\n')

# The value stored under the 'data' key is a plain list of post texts.
print(len(news_df.data))  # 7862
print(news_df.data[0])

# Source file paths of the posts. The printed array is elided in the middle,
# so it holds far more than 8 entries -- presumably one path per document.
print(news_df.filenames)

# Names of the selected newsgroups, then how many there are.
print(news_df.target_names, len(news_df.target_names), sep='\n')  # 8 names

# Integer label of every document, then the number of labels.
print(news_df.target, len(news_df.target), sep='\n')  # 7862 labels


<class 'sklearn.utils.Bunch'>
dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

7862
I appreciate if anyone can point out some good books about the dead sea
scrolls of Qumran. Thanks in advance.
['/home/jovyan/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20630'
 '/home/jovyan/scikit_learn_data/20news_home/20news-bydate-test/sci.med/59422'
 '/home/jovyan/scikit_learn_data/20news_home/20news-bydate-test/comp.graphics/38765'
 ...
 '/home/jovyan/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.baseball/102656'
 '/home/jovyan/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53606'
 '/home/jovyan/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.mideast/76505']
['comp.graphics', 'comp.windows.x', 'rec.motorcycles', 'rec.sport.baseball', 'sci.electronics', 'sci.med', 'soc.religion.christian', 'talk.politics.mideast']
8
[6 5 0 ... 3 4 7]
7862


In [None]:
# import pandas as pd
# df = pd.DataFrame(news_df), index=news_df.data. columns=news_df.target_names) # ===> error keys들이 column으로 가기때문에
# df.head()

In [3]:
# LDA is applied to count-based features only, so vectorize the posts with
# CountVectorizer.
count_vect = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
)
feat_vect = count_vect.fit_transform(news_df.data)

In [4]:
# news_df.data is a plain list, so news_df.data.shape fails with
# "'list' object has no attribute 'shape'"; the vectorizer object itself has
# no .shape either.
print('type(count_vect):', type(count_vect), sep='')
# The vectorized corpus is a sparse matrix of shape (7862, 1000):
# 1000 extracted features for each of the 7862 documents.
print(f'CountVectorizer Shape(feat_vect.shape): {feat_vect.shape}')


type(count_vect):<class 'sklearn.feature_extraction.text.CountVectorizer'>
CountVectorizer Shape(feat_vect.shape): (7862, 1000)


In [9]:
# Slicing the vectorizer (count_vect[:2, :5]) raises an error; slice the
# matrix it produced instead.
print('type(feat_vect):', type(feat_vect), sep='')
# Show the stored (row, column) -> count entries of the first two documents.
print(feat_vect[0:2, :])

type(feat_vect):<class 'scipy.sparse.csr.csr_matrix'>
  (0, 93)	1
  (0, 669)	1
  (0, 390)	1
  (0, 148)	1
  (0, 251)	1
  (0, 876)	1
  (0, 70)	1
  (0, 877)	1
  (1, 390)	1
  (1, 428)	1
  (1, 391)	1
  (1, 237)	1
  (1, 607)	1
  (1, 403)	1
  (1, 955)	2
  (1, 512)	2
  (1, 678)	2
  (1, 655)	2
  (1, 881)	2
  (1, 733)	1
  (1, 688)	1
  (1, 23)	1
  (1, 894)	1
  (1, 15)	1
  (1, 12)	1
  :	:
  (1, 431)	1
  (1, 563)	1
  (1, 748)	1
  (1, 200)	1
  (1, 975)	1
  (1, 995)	1
  (1, 915)	1
  (1, 650)	1
  (1, 222)	1
  (1, 332)	1
  (1, 510)	1
  (1, 656)	1
  (1, 930)	1
  (1, 162)	1
  (1, 120)	1
  (1, 713)	1
  (1, 407)	1
  (1, 853)	1
  (1, 712)	1
  (1, 210)	1
  (1, 350)	1
  (1, 368)	1
  (1, 787)	1
  (1, 364)	1
  (1, 84)	1


In [10]:
# Fit LDA (Latent Dirichlet Allocation) on the count matrix to extract the
# key words of 8 topics -- the same number as the newsgroups selected in cats.
lda = LatentDirichletAllocation(
    n_components=8,
    random_state=0,
)
lda.fit(feat_vect)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [11]:
# components_ is a numpy array of shape (8, 1000): for each of the 8 topics it
# holds one association value per word feature (1000 features).
print(type(lda.components_), lda.components_.shape, sep='\n')
lda.components_

<class 'numpy.ndarray'>
(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [15]:
# List the words of each topic, ordered from most to least strongly associated.
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: Object exposing ``components_``; iterating it must yield one
            1-D numpy array of per-word weights for each topic.
        feature_names: Sequence mapping a feature (column) index to its word.
        no_top_words: Number of top-weighted words to show for each topic.

    Returns:
        None. For each topic three things are printed: a ``Topic # <n>``
        header, the feature indexes of the top words, and the words
        themselves joined by single spaces and followed by a blank line.
    """
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)

        # argsort() yields positional indexes in ascending order of weight, so
        # reverse it to walk from the largest weight to the smallest.
        topic_word_indexes = topic.argsort()[::-1]

        # Keep only the requested number of words. The index line below uses
        # this same slice (instead of a hard-coded 15) so that it always
        # lines up with the words printed after it.
        top_indexes = topic_word_indexes[:no_top_words]
        print(f'topic_word_indexes:{top_indexes}')

        # Look up the word of every selected index and join them with spaces
        # into a single line.
        feature_concat = ' '.join(feature_names[i] for i in top_indexes)
        print(f'{feature_concat}\n')

# Extract the names of all word features held by the fitted CountVectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists; .tolist() turns the returned array
# into a plain list of str so the prints below look the same as before.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    # Older scikit-learn releases only provide get_feature_names().
    feature_names = count_vect.get_feature_names()
print(f'type(feature_names):{type(feature_names)}\n') # list
print(f'len(feature_names):{len(feature_names)}\n') # 1000
print(f'feature_names[:15]:{feature_names[:15]}\n') # ['00', '000', '01', '02', '03', '04', '05', '10', '100', '11', '12', '128', '13', '14', '15']
# The instructor said that words are returned; the first 15 entries here
# happen to be numeric tokens.

# Show only the 15 most strongly associated words of each topic.
display_topics(lda, feature_names, 15)


type(feature_names):<class 'list'>

len(feature_names):1000

feature_names[:15]:['00', '000', '01', '02', '03', '04', '05', '10', '100', '11', '12', '128', '13', '14', '15']

Topic # 0
topic_word_indexes:[994   7 374 563 420 868  10  23 280 164  22 375 995 651 390]
year 10 game medical health team 12 20 disease cancer 1993 games years patients good

Topic # 1
topic_word_indexes:[291 485 517 492 655 769 881 885 934 269 758 388 773 528 951]
don just like know people said think time ve didn right going say ll way

Topic # 2
topic_word_indexes:[451 353 484 699 382 452 634 365 354 203 319   0 921 140   4]
image file jpeg program gif images output format files color entry 00 use bit 03

Topic # 3
topic_word_indexes:[517 492 291 881 921 286 485 390 885 147 719 462 655 922 678]
like know don think use does just good time book read information people used post

Topic # 4
topic_word_indexes:[107 474 108 481 905 655 475 480 393 947 296 904  98 106   1]
armenian israel armenians jews turkish peopl