### Latent Dirichlet Allocation

In [1]:
import random
import pandas as pd
from collections import Counter

- 토픽의 개수를 미리 결정(T=4) 하고
- 문서(documents) 에 대한 데이터 부여, 원래는 

In [120]:
T = 4

In [121]:
documents = [["은행", "플랫폼", "이자", "비이자", "Valuation", "성장성", "주가", "대출"], 
    ["자동차", "반도체", "성장성", "자율주행","당기순이익","칩"],
    ["CMO", "바이오", "코로나", "백신", "공정", "매출액", "성장성", "모더나", "바이러스"],
    ["실적", "이자", "비은행", "은행", "대출", "증권","비이자"],
    ["바이오시밀러", "코로나", "바이러스", "백신", "수익성", "바이오"],
    ["5G", "이동통신", "매출", "이익", "커머스", "지주"],
    ["스마트팩토리", "수익률", "통신", "5G", "IPTV"],
    ["IPTV", "턴어라운드", "5G", "이익", "인터넷", "통신"],
    ["반도체", "인공지능", "IP", "플랫폼", "칩", "자율주행"],
    ["원자현미경", "공정", "반도체", "성장", "수익성", "디스플레이"]]

In [None]:
- 데이터 개수를 count하기 위한 Counter() 함수 사용
- 인덱스를 무시하고 각 문서(document) 별로 Counter() 생성
- 각 문서(Document) 별로 

In [122]:
# a list of Counters, one for each document
document_topic_counts = [Counter() for _ in documents]

# a list of Counters, one for each topic
topic_word_counts = [Counter() for _ in range(T)]

# a list of numbers, one for each topic
topic_counts = [0 for _ in range(T)]

In [138]:
document_topic_counts

[Counter({3: 3, 1: 0, 2: 5, 0: 0}),
 Counter({3: 6, 2: 0, 0: 0, 1: 0}),
 Counter({3: 0, 1: 9, 0: 0, 2: 0}),
 Counter({2: 7, 3: 0, 0: 0, 1: 0}),
 Counter({1: 5, 2: 0, 0: 0, 3: 1}),
 Counter({3: 0, 0: 6, 2: 0, 1: 0}),
 Counter({0: 5, 1: 0, 2: 0, 3: 0}),
 Counter({1: 0, 3: 0, 0: 6, 2: 0}),
 Counter({3: 6, 0: 0, 2: 0, 1: 0}),
 Counter({0: 0, 3: 0, 2: 0, 1: 6})]

- 각 문서(d)에 존재하는 단어의 개수 $N_{d}$ 집계

In [140]:
document_lengths = list(map(len, documents))

- 전체 문서의 개수 $(D)$ 집계

In [147]:
D = len(documents)

- 데이터에 존재하는 전체 단어의 Unique 개수 $(M)$ 집계

In [165]:
word_vec = list()
for d in range(D):
    N_d = list(map(len, documents))[d]
    for n in range(N_d):
        word_vec.append(documents[d][n])

unique_word_vec = set(word_vec)
M = len(unique_word_vec)

In [164]:
unique_word_vec = set(word_vec)
M = len(unique_word_vec)

#### Gibbs Sampling Step

- Gibbs Sampling 실행 과정에서 사용하게 될 conditional distribution 에 대한 코드 구현

$ p(z_{n}^{(d)}=t \vert z_{-n}^{(d)},\mathbf{w},\alpha,\beta) \propto \frac{ \beta_{m}+\sum_{d=1}^{D}\sum_{n=1}^{N_{d}}I(w_{n}^{(d)}=m)I(z_{n}^{(d)}=t)}{ \sum_{m=1}^{M}\left(\beta_{m}+\sum_{d=1}^{D}\sum_{n=1}^{N_{d}}I(w_{n}^{(d)}=m)I(z_{n}^{(d)}=t) \right) } \times \left( \alpha_{t}+ \sum_{n=1}^{N_{d}} I(z_{n}^{(d)}=t) \right) $

In [175]:
def gibbs_sampler_code(d, word, topic) :
    
    ((topic_word_counts[topic][word] + beta) / (topic_counts[topic] + M*beta)) * ((document_topic_counts[d][topic] + alpha))

In [167]:
def topic_weight(d, word, topic):
    """given a document and a word in that document,
    return the weight for the kth topic"""
    
    def p_topic_given_document(topic, d, alpha=0.1):
        """the fraction of words in document _d_
        that are assigned to _topic_ (plus some smoothing)"""
        return ((document_topic_counts[d][topic] + alpha))

    def p_word_given_topic(word, topic, beta=0.1):
        """the fraction of words assigned to _topic_
        that equal _word_ (plus some smoothing)"""
        return ((topic_word_counts[topic][word] + beta) / (topic_counts[topic] + M*beta))
    
    
    return p_word_given_topic(word, topic) * p_topic_given_document(topic, d)

In [168]:
def choose_new_topic(d, word):
    
    def sample_from(weights):
        """returns i with probability weights[i] / sum(weights)"""
        total = sum(weights)
        rnd = total * random.random() # uniform between 0 and total
        for i, p in enumerate(weights):
            rnd -= p # return the smallest i such that
            if rnd <= 0: 
                return i # weights[0] + ... + weights[i] >= rnd
        
    return sample_from([topic_weight(d, word, topic) for topic in range(T)])

In [169]:
document_topics = [[random.randrange(T) for word in document] for document in documents]
document_topics

[[3, 2, 1, 3, 3, 2, 0, 0],
 [0, 1, 3, 0, 2, 1],
 [1, 0, 2, 3, 1, 3, 0, 1, 0],
 [3, 0, 2, 3, 3, 2, 1],
 [3, 0, 2, 0, 1, 2],
 [2, 3, 1, 1, 3, 3],
 [2, 2, 0, 1, 2],
 [0, 1, 1, 2, 2, 3],
 [3, 2, 2, 2, 1, 3],
 [1, 3, 1, 3, 3, 3]]

In [170]:
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1 
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

In [171]:
document_topic_counts

[Counter({3: 6, 1: 1, 2: 7, 0: 2}),
 Counter({3: 7, 2: 1, 0: 2, 1: 2}),
 Counter({3: 2, 1: 12, 0: 3, 2: 1}),
 Counter({2: 9, 3: 3, 0: 1, 1: 1}),
 Counter({1: 6, 2: 2, 0: 2, 3: 2}),
 Counter({3: 3, 0: 6, 2: 1, 1: 2}),
 Counter({0: 6, 1: 1, 2: 3, 3: 0}),
 Counter({1: 2, 3: 1, 0: 7, 2: 2}),
 Counter({3: 8, 0: 0, 2: 3, 1: 1}),
 Counter({0: 0, 3: 4, 2: 0, 1: 8})]

In [172]:
for epoch in range(3000): # repetition
    for d in range(D): # each documnet
        for i, (word, topic) in enumerate(zip(documents[d],document_topics[d])):
            
            # gibbs sampling: 특정 하나의 topic assignment z를 제거하고 나머지들(-z)의 조건부 확률  
            
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1 # 문서별 토픽 갯수
            topic_word_counts[topic][word] -= 1 # 토픽별 단어 갯수
            topic_counts[topic] -= 1 # 토픽별 카운트
            document_lengths[d] -= 1 # 문서별 단어갯수
            
            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            
            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1 # 문서별 토픽 갯수
            topic_word_counts[new_topic][word] += 1 # 토픽별 단어 갯수
            topic_counts[new_topic] += 1 # 토픽별 카운트
            document_lengths[d] += 1 # 문서별 단어갯수

In [173]:
df = pd.DataFrame(columns=['Topic1','Topic2','Topic3','Topic4'], index=['Top'+str(i) for i in range(1,6)])

for k, word_counts in enumerate(topic_word_counts):
    for ix, (word, count) in enumerate(word_counts.most_common(6)): # 각 토픽별로 top 10 단어
            df.loc['Top'+str(ix+1),'Topic'+str(k+1)] = word+'({})'.format(count)

In [174]:
df

Unnamed: 0,Topic1,Topic2,Topic3,Topic4
Top1,5G(6),바이오(4),비이자(4),반도체(4)
Top2,통신(4),수익성(4),대출(4),플랫폼(4)
Top3,IPTV(4),공정(4),은행(4),칩(4)
Top4,이익(4),코로나(4),이자(3),성장성(4)
Top5,이동통신(2),백신(4),실적(2),자율주행(4)
Top6,스마트팩토리(2),바이러스(4),증권(2),자동차(2)
