# 토픽 모델 - LDA

## 20 news group 데이터 사례

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state=2021,
                          remove=('headers', 'footers', 'quotes'))

In [3]:
df = pd.DataFrame({'article' : news.data})
df.shape

(18846, 1)

In [4]:
#특수문자제거
df['article'] = df.article.str.replace('[^A-Za-z]',' ')

In [5]:
# 소문자로 변환 및 3글자이하 단어 삭제
df['article']=df.article.apply(lambda x : ' '.join(w.lower() for w in x.split() if len(w) > 3))

In [6]:
df.article[0][:1000]

'just case original poster looking serious answer supply even when steering hands something quite similar countersteering basically turn left quick wiggle bike right first causing counteracting lean occur left more difficult motorcycle than bicycle though because extra weight motorcycle heavy maybe yous'

### NLTK 를 통해서 토큰화

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = df.article.apply(lambda x : [w for w in x.split() if w not in stop_words])
#안되면 리스트 형태로

In [8]:
tokenized_doc[:5]

0    [case, original, poster, looking, serious, ans...
1    [thinking, sending, magazine, idea, parody, bo...
2    [dreamed, great, judgment, morning, dawned, tr...
3    [file, bignums, ripem, last, updated, april, r...
4    [peanut, butter, definitely, favorite, think, ...
Name: article, dtype: object

## 정수 인코딩과 단어 집합만들기 - gensim

In [9]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)

In [10]:
len(dictionary)

83145

In [11]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [12]:
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]


In [13]:
dictionary[0], dictionary[1], dictionary[2], dictionary[3]

('answer', 'basically', 'bicycle', 'bike')

### LDA 모델로 훈련시키기

In [14]:
from gensim.models.ldamodel import LdaModel
NUM_TOPICS = 20

In [15]:
ldamodel = LdaModel(
    corpus, num_topics=NUM_TOPICS, random_state=2021,
    id2word=dictionary, passes=20
)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.021*"hockey" + 0.018*"team" + 0.012*"games" + 0.011*"game"')
(1, '0.020*"bike" + 0.016*"engine" + 0.015*"cars" + 0.009*"miles"')
(2, '0.011*"deleted" + 0.010*"colorado" + 0.008*"exhaust" + 0.006*"flaming"')
(3, '0.016*"appears" + 0.014*"vitamin" + 0.013*"green" + 0.012*"candida"')
(4, '0.017*"israel" + 0.016*"armenian" + 0.014*"turkish" + 0.013*"jews"')
(5, '0.015*"government" + 0.010*"president" + 0.008*"public" + 0.007*"encryption"')
(6, '0.027*"jesus" + 0.019*"church" + 0.017*"bible" + 0.015*"christ"')
(7, '0.014*"drive" + 0.010*"system" + 0.009*"card" + 0.009*"would"')
(8, '0.018*"space" + 0.008*"research" + 0.007*"university" + 0.006*"nasa"')
(9, '0.023*"file" + 0.015*"window" + 0.015*"windows" + 0.012*"program"')
(10, '0.031*"please" + 0.028*"mail" + 0.022*"thanks" + 0.017*"send"')
(11, '0.019*"health" + 0.019*"medical" + 0.012*"disease" + 0.012*"cancer"')
(12, '0.012*"people" + 0.011*"would" + 0.008*"think" + 0.006*"believe"')
(13, '0.030*"battery" + 0.014*"dont" + 0.012*

## 훈련결과 시각화

In [16]:
## !pip install pyLDAvis==2.1.2

In [21]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  and should_run_async(code)


In [None]:
pyLDAvis.save_html(vis, 'news_group_20.html')

## 문서별 토픽 분포

In [None]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(5, 0.63293415), (11, 0.07180142), (12, 0.13647082), (16, 0.13653353)]
1 번째 문서의 topic 비율은 [(2, 0.12067504), (4, 0.030950991), (5, 0.34957498), (7, 0.056273896), (10, 0.116060704), (12, 0.026531072), (16, 0.18088074), (19, 0.11180363)]
2 번째 문서의 topic 비율은 [(2, 0.014474366), (5, 0.019560626), (10, 0.012859776), (15, 0.4569791), (16, 0.42143556), (19, 0.05292789)]
3 번째 문서의 topic 비율은 [(0, 0.012948004), (2, 0.14172329), (3, 0.034144606), (4, 0.5036073), (7, 0.074763566), (8, 0.011893726), (12, 0.027960783), (13, 0.13929835), (14, 0.023164311)]
4 번째 문서의 topic 비율은 [(5, 0.7010949), (10, 0.05164012), (11, 0.044979796), (16, 0.18173891)]


  and should_run_async(code)


In [None]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

  and should_run_async(code)


In [None]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

  and should_run_async(code)


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,5.0,0.6328,"[(5, 0.63284755), (11, 0.07179656), (12, 0.136..."
1,1,5.0,0.3496,"[(2, 0.12066079), (4, 0.030925583), (5, 0.3495..."
2,2,15.0,0.457,"[(2, 0.014474882), (5, 0.01956061), (10, 0.012..."
3,3,4.0,0.5036,"[(0, 0.0129488725), (2, 0.14172307), (3, 0.034..."
4,4,5.0,0.7018,"[(5, 0.70175576), (10, 0.051676225), (11, 0.04..."
5,5,16.0,0.52,"[(2, 0.15251753), (9, 0.10250779), (13, 0.0109..."
6,6,5.0,0.3628,"[(1, 0.1338806), (5, 0.3627732), (7, 0.1852659..."
7,7,2.0,0.5237,"[(2, 0.52365583), (15, 0.45325464)]"
8,8,9.0,0.5134,"[(9, 0.51341605), (11, 0.034861695), (14, 0.05..."
9,9,5.0,0.2241,"[(1, 0.21433285), (2, 0.07331828), (5, 0.22412..."


## NUM-TOPICS=24

In [None]:
ldamodel2 = LdaModel(
    corpus, num_topics = 24, random_state=2021,
    id2word=dictionary, passes = 20
)

  and should_run_async(code)


In [None]:
topics = ldamodel2.print_topics(num_words=4)
for topic in topics:
    print(topic)

(4, '0.019*"mask" + 0.018*"chinese" + 0.016*"music" + 0.014*"fitted"')
(2, '0.016*"people" + 0.011*"israel" + 0.010*"said" + 0.008*"would"')
(17, '0.015*"year" + 0.014*"runs" + 0.013*"myers" + 0.012*"ball"')
(14, '0.018*"would" + 0.012*"like" + 0.010*"think" + 0.009*"time"')
(0, '0.019*"espn" + 0.016*"leafs" + 0.013*"frank" + 0.013*"playoff"')
(1, '0.025*"period" + 0.013*"kent" + 0.012*"gordon" + 0.012*"banks"')
(23, '0.027*"food" + 0.018*"xfree" + 0.014*"colorado" + 0.011*"indiana"')
(10, '0.029*"please" + 0.026*"mail" + 0.024*"thanks" + 0.020*"anyone"')
(16, '0.061*"file" + 0.034*"jpeg" + 0.032*"image" + 0.025*"format"')
(9, '0.025*"windows" + 0.017*"file" + 0.015*"window" + 0.013*"program"')
(5, '0.018*"president" + 0.017*"medical" + 0.014*"health" + 0.011*"disease"')
(20, '0.007*"state" + 0.006*"states" + 0.006*"national" + 0.005*"government"')
(13, '0.047*"entry" + 0.043*"master" + 0.037*"slave" + 0.026*"jumper"')
(6, '0.013*"dreams" + 0.011*"canon" + 0.010*"runner" + 0.008*"peopl

  and should_run_async(code)
