<a href="https://colab.research.google.com/github/seonae0223/machine_learning/blob/main/06_Text_Mining_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation #LDA 사용

# 컴퓨터 그래픽스, 기독교, 야구
cats = ['comp.graphics', 'soc.religion.christian','rec.sport.baseball']

news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=42
)

# 코퍼스
news_df

{'data': ["Hi all,\n\n\tI got tired of waiting for a mailing list so I am going to make one....\njust send me your complete e-mail address on your message and I'll send\nyou a confirmation and the Yankee mailing list address\n\n-john",
  "\nDominik,\n\n\tHave you tried xgrasp?  It's out there on several ftp sites.(not sure which, but archie can find it, I'm sure.)  It works ok but it lacks an interface.",
  'There was an article on Jewish major leaguers in a recent issue of "Elysian \nFields", what used to be the "Minnesota Review of Baseball".  As I recall, \nit had an amazing amount of research, with a long list of players and a \nlarge bibliography.',
  '  \n\n',
  "\n\n\n\tWarren, I agree with your premise... but... The Babe won on\ntwo teams. That's right, he was part of the Red Sox Dynasty of the\n1910s. And everyone knows that the Yankee Dynsaty wouldn't have\nhappened without thier famous bullpen catcher whose name escapes me at\nthe moment.",
  '\n\n\n\nI opened the file with 

In [None]:
# 테스트 전처리
# - lemma 추출, stopwords, 특수기호 제거 등등 ...

토픽 모델링의 경우에는 DTM을 사용한다.
- 확률을 구해야 하기 때문에 빈도수(Count)가 중요하다.
- TFIDF를 사용하면 제대로 된 토픽 모델링이 안될 가능성이 커진다.

In [7]:
count_vectorizer = CountVectorizer(
    max_df = 0.95,
    min_df = 2,
    max_features=1000, # 단어를 최대 몇 개 사용할 건지 설정
    stop_words = 'english',
    ngram_range=(1,2)
)

dtm_vector = count_vectorizer.fit_transform(news_df['data'])
dtm_vector.shape

(2964, 1000)

In [10]:
count_vectorizer.get_feature_names_out()[:10]

array(['00', '00 00', '000', '01', '02', '03', '03 03', '04', '05', '06'],
      dtype=object)

#LDA 적용

In [11]:
lda = LatentDirichletAllocation(
    n_components=len(cats), # 사용할 토픽의 개수
    random_state=42
)

lda.fit(dtm_vector)

## 주제(토픽) 별 단어 연관도 확인
토픽에 포함되는 단어의 연관도 확인

In [13]:
lda.components_ #각 순서대로 첫 번째, 두 번째, 세 번째 토픽에 대한 점수가 출력

array([[ 63.63332482,   0.34963495,  17.44704079, ...,  16.61505824,
          2.5384632 , 122.4210569 ],
       [208.02174752,  68.31556307,  90.89958785, ...,  80.29277085,
         54.27731685,   3.2448167 ],
       [  0.34492766,   0.33480198,   0.65337136, ...,   2.09217091,
         49.18421995,   0.3341264 ]])

In [16]:
def get_filename_list(newsdata):
    filename_list=[]

    for file in newsdata.filenames:
            #print(file)
            filename_temp = file.split('\\')[-2:]
            filename = '.'.join(filename_temp)
            filename_list.append('/'.join(filename.split('/')[-2:]))

    return filename_list

filename_list = get_filename_list(news_df)
print("filename 개수:",len(filename_list), "filename list 10개만:",filename_list[:10])

Topic : 0
['image', 'graphics', 'jpeg', 'edu', 'file', 'images', 'data', 'available', 'software', 'use']

Topic : 1
['year', 'game', 'don', 'think', 'good', 'team', 'time', 'games', 'just', 'like']

Topic : 2
['god', 'people', 'jesus', 'church', 'think', 'know', 'does', 'just', 'christ', 'don']



각 문서 별 토픽 분포 확인

In [17]:
doc_topic = lda.transform(dtm_vector)
doc_topic[:3]

array([[0.85839148, 0.12070316, 0.02090536],
       [0.90436513, 0.05643273, 0.03920214],
       [0.22961015, 0.5236614 , 0.24672845]])

In [22]:
def get_filename_list(newsdata):
    filename_list=[]

    for file in newsdata.filenames:
            #print(file)
            filename_temp = file.split('\\')[-2:]
            filename = '.'.join(filename_temp)
            filename_list.append('/'.join(filename.split('/')[-2:]))

    return filename_list

filename_list = get_filename_list(news_df)
print("filename 개수:",len(filename_list), "filename list 10개만:",filename_list[:10])

filename 개수: 2964 filename list 10개만: ['rec.sport.baseball/105154', 'comp.graphics/38805', 'rec.sport.baseball/104616', 'comp.graphics/37928', 'rec.sport.baseball/104823', 'comp.graphics/38908', 'comp.graphics/38659', 'comp.graphics/38691', 'comp.graphics/38876', 'comp.graphics/38700']


In [23]:
import pandas as pd

topic_names = ["topic # {}".format(i) for i in range(0, len(cats))]

topic_df = pd.DataFrame(doc_topic, columns=topic_names, index=filename_list)
topic_df

Unnamed: 0,topic # 0,topic # 1,topic # 2
rec.sport.baseball/105154,0.858391,0.120703,0.020905
comp.graphics/38805,0.904365,0.056433,0.039202
rec.sport.baseball/104616,0.229610,0.523661,0.246728
comp.graphics/37928,0.333333,0.333333,0.333333
rec.sport.baseball/104823,0.028380,0.940660,0.030959
...,...,...,...
comp.graphics/38311,0.424100,0.416729,0.159172
comp.graphics/38435,0.333333,0.333333,0.333333
soc.religion.christian/20894,0.003928,0.004123,0.991949
comp.graphics/38275,0.922023,0.036687,0.041290


In [25]:
test = "I prayed to God to win this game. God answered, and we could win this game."

test_vector = count_vectorizer.transform([test])
lda.transform(test_vector)

array([[0.04767431, 0.61900264, 0.33332305]])

# 토픽 모델링 시각화

In [26]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [29]:
import pyLDAvis.lda_model

prepare = pyLDAvis.lda_model.prepare(lda, dtm_vector, count_vectorizer)
prepare

  and should_run_async(code)


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
2      0.104368  0.149782       1        1  42.598546
0     -0.229945 -0.008935       2        1  36.607569
1      0.125577 -0.140847       3        1  20.793885, topic_info=         Term         Freq        Total Category  logprob  loglift
384       god  1902.000000  1902.000000  Default  30.0000  30.0000
445     image  1167.000000  1167.000000  Default  29.0000  29.0000
394  graphics   866.000000   866.000000  Default  28.0000  28.0000
994      year   594.000000   594.000000  Default  27.0000  27.0000
482      jpeg   791.000000   791.000000  Default  26.0000  26.0000
..        ...          ...          ...      ...      ...      ...
510      like   281.310026  1067.459669   Topic3  -4.7112   0.2369
995     years   191.994209   368.794266   Topic3  -5.0932   0.9177
768       run   157.474646   274.735600   Topic3  -5.2914   1.0140
490      know   1

In [30]:
pyLDAvis.display(prepare)

  and should_run_async(code)
