<a href="https://colab.research.google.com/github/sabumjung/Machine-Learning-Algorithm/blob/master/ch13_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#확률적 잠재 의미 분석

In [0]:
import numpy as np

from nltk.corpus import brown

from sklearn.feature_extraction.text import CountVectorizer

In [0]:
 # For reproducibility
np.random.seed(1000)

rank = 2
alpha_1 = 1000.0
alpha_2 = 10.0

In [0]:
# Compose a corpus
sentences_1 = brown.sents(categories=['editorial'])[0:20]
sentences_2 = brown.sents(categories=['fiction'])[0:20]
corpus = []

for s in sentences_1 + sentences_2:
    corpus.append(' '.join(s))

In [0]:
# Vectorize the corpus
cv = CountVectorizer(strip_accents='unicode', stop_words='english')
Xc = np.array(cv.fit_transform(corpus).todense())

In [0]:
# Define the probability matrices
Ptd = np.random.uniform(0.0, 1.0, size=(len(corpus), rank))
Pwt = np.random.uniform(0.0, 1.0, size=(rank, len(cv.vocabulary_)))
Ptdw = np.zeros(shape=(len(cv.vocabulary_), len(corpus), rank))

In [0]:
# Normalize the probability matrices
for d in range(len(corpus)):
    nf = np.sum(Ptd[d, :])
    for t in range(rank):
        Ptd[d, t] /= nf

for t in range(rank):
    nf = np.sum(Pwt[t, :])
    for w in range(len(cv.vocabulary_)):
        Pwt[t, w] /= nf

In [0]:
def log_likelihood():
    value = 0.0

    for d in range(len(corpus)):
        for w in range(len(cv.vocabulary_)):
            real_topic_value = 0.0

            for t in range(rank):
                real_topic_value += Ptd[d, t] * Pwt[t, w]

            if real_topic_value > 0.0:
                value += Xc[d, w] * np.log(real_topic_value)

    return value


def expectation():
    global Ptd, Pwt, Ptdw

    for d in range(len(corpus)):
        for w in range(len(cv.vocabulary_)):
            nf = 0.0

            for t in range(rank):
                Ptdw[w, d, t] = Ptd[d, t] * Pwt[t, w]
                nf += Ptdw[w, d, t]

            Ptdw[w, d, :] = (Ptdw[w, d, :] / nf) if nf != 0.0 else 0.0


def maximization():
    global Ptd, Pwt, Ptdw

    for t in range(rank):
        nf = 0.0

        for d in range(len(corpus)):
            ps = 0.0

            for w in range(len(cv.vocabulary_)):
                ps += Xc[d, w] * Ptdw[w, d, t]

            Pwt[t, w] = ps
            nf += Pwt[t, w]

        Pwt[:, w] /= nf if nf != 0.0 else alpha_1

    for d in range(len(corpus)):
        for t in range(rank):
            ps = 0.0
            nf = 0.0

            for w in range(len(cv.vocabulary_)):
                ps += Xc[d, w] * Ptdw[w, d, t]
                nf += Xc[d, w]

            Ptd[d, t] = ps / (nf if nf != 0.0 else alpha_2)

In [27]:
print('Initial Log-Likelihood: %f' % log_likelihood())

for i in range(30):
    expectation()
    maximization()
    print('Step %d - Log-Likelihood: %f' % (i, log_likelihood()))

Initial Log-Likelihood: -2380.516058
Step 0 - Log-Likelihood: -2375.232302
Step 1 - Log-Likelihood: -2369.644941
Step 2 - Log-Likelihood: -2366.693462
Step 3 - Log-Likelihood: -2365.264721
Step 4 - Log-Likelihood: -2364.678040
Step 5 - Log-Likelihood: -2364.480283
Step 6 - Log-Likelihood: -2364.372574
Step 7 - Log-Likelihood: -2364.217400
Step 8 - Log-Likelihood: -2364.015236
Step 9 - Log-Likelihood: -2363.816327
Step 10 - Log-Likelihood: -2363.652013
Step 11 - Log-Likelihood: -2363.525275
Step 12 - Log-Likelihood: -2363.427930
Step 13 - Log-Likelihood: -2363.351643
Step 14 - Log-Likelihood: -2363.290491
Step 15 - Log-Likelihood: -2363.240594
Step 16 - Log-Likelihood: -2363.199369
Step 17 - Log-Likelihood: -2363.165010
Step 18 - Log-Likelihood: -2363.136188
Step 19 - Log-Likelihood: -2363.111882
Step 20 - Log-Likelihood: -2363.091288
Step 21 - Log-Likelihood: -2363.073763
Step 22 - Log-Likelihood: -2363.058786
Step 23 - Log-Likelihood: -2363.045932
Step 24 - Log-Likelihood: -2363.03485

In [28]:
# Show the top 5 words per topic
Pwts = np.argsort(Pwt, axis=1)[::-1]

for t in range(rank):
    print('\nTopic ' + str(t))
    for i in range(5):
        print(cv.get_feature_names()[Pwts[t, i]])


Topic 0
years
questions
south
reform
social

Topic 1
convened
maintenance
penal
year
legislators
