In [1]:
from collections import Counter
import random

def p_topic_given_document(topic, d, alpha=0.1):
    """Smoothed share of document ``d``'s words currently assigned to ``topic``.

    Reads the module-level ``document_topic_counts``, ``document_lengths``
    and ``K``.  ``alpha`` is added to the topic count and ``K * alpha`` to
    the document length (additive smoothing).
    """
    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_length = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_length

def p_word_given_topic(word, topic, beta=0.1):
    """Smoothed share of ``topic``'s assignments that belong to ``word``.

    Reads the module-level ``topic_word_counts``, ``topic_counts`` and ``V``.
    ``beta`` is added to the word count and ``V * beta`` to the topic total
    (additive smoothing).
    """
    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + V * beta
    return smoothed_count / smoothed_total

def topic_weight(d, word, k):
    """Unnormalised weight of assigning ``word`` in document ``d`` to topic ``k``.

    It is the product of how likely the word is under the topic and how
    likely the topic is within the document.
    """
    word_likelihood = p_word_given_topic(word, k)
    topic_likelihood = p_topic_given_document(k, d)
    return word_likelihood * topic_likelihood

def choose_new_topic(d, word):
    """Draw a topic index for ``word`` in document ``d``.

    Computes ``topic_weight`` for each of the ``K`` topics and samples an
    index in proportion to those weights.
    """
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)

def sample_from(weights):
    """Return a random index ``i`` with probability ``weights[i] / sum(weights)``.

    Args:
        weights: iterable of non-negative numbers; they need not sum to 1.

    Returns:
        An integer index into ``weights``.

    Raises:
        ValueError: if ``weights`` is empty (there is nothing to sample).
    """
    weights = list(weights)  # allow generators; we need sum() plus a second pass
    if not weights:
        raise ValueError("sample_from() requires at least one weight")

    total = sum(weights)
    rnd = total * random.random()  # uniform draw between 0 and total
    for i, w in enumerate(weights):
        rnd -= w
        if rnd <= 0:
            return i

    # Floating-point rounding can leave a tiny positive remainder after the
    # last weight has been subtracted.  Falling off the end would return None
    # and corrupt the caller's counters, so pick the last index that carries
    # positive weight instead.
    for i in range(len(weights) - 1, -1, -1):
        if weights[i] > 0:
            return i
    return len(weights) - 1

# Toy corpus: each inner list is one document, given as a list of terms.
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

# Seed the RNG so the random initial topic assignment is reproducible.
random.seed(0)

K = 4  # number of topics
D = len(documents)  # number of documents

# One randomly drawn topic per word, mirroring the shape of `documents`.
document_topics = [[random.randrange(K) for _ in doc] for doc in documents]

# Per-document counts of how many words are assigned to each topic.
document_topic_counts = [Counter() for _ in range(D)]
# Per-topic counts of how many times each word is assigned to it.
topic_word_counts = [Counter() for _ in range(K)]
# Total number of words assigned to each topic.
topic_counts = [0] * K
# Number of words in each document.
document_lengths = list(map(len, documents))

# Vocabulary and its size.
distinct_words = {word for doc in documents for word in doc}
V = len(distinct_words)

# Tally the random initial assignments into the three count structures.
for d, (words, topics) in enumerate(zip(documents, document_topics)):
    for word, topic in zip(words, topics):
        topic_counts[topic] += 1
        topic_word_counts[topic][word] += 1
        document_topic_counts[d][topic] += 1

NUM_ITERATIONS = 1000  # number of full sweeps over every word in the corpus

# The sweep index is named `sweep` rather than `iter` so the built-in `iter()`
# is not shadowed for every later cell that runs in this kernel.
for sweep in range(NUM_ITERATIONS):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # Take this word's current assignment out of all counts so it
            # does not influence the weights used to resample it.
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # Draw a new topic from the weights computed without this word.
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # Put the word back into the counts under its new topic.
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [12]:
# Sanity check: one counter per document, then one counter per topic.
print(len(document_topic_counts), len(topic_word_counts), sep="\n")

15
4


In [13]:
# Display the per-document topic counts (one Counter per document).
document_topic_counts

[Counter({0: 7, 1: 0, 2: 0, 3: 0}),
 Counter({0: 0, 1: 5, 2: 0, 3: 0}),
 Counter({0: 0, 1: 2, 2: 2, 3: 2}),
 Counter({0: 0, 1: 0, 2: 2, 3: 3}),
 Counter({0: 0, 1: 2, 2: 2, 3: 0}),
 Counter({0: 3, 1: 0, 2: 3, 3: 0}),
 Counter({0: 0, 1: 0, 2: 1, 3: 3}),
 Counter({0: 0, 1: 2, 2: 2, 3: 0}),
 Counter({0: 1, 1: 3, 2: 0, 3: 0}),
 Counter({0: 4, 1: 0, 2: 0, 3: 0}),
 Counter({0: 0, 1: 0, 2: 0, 3: 3}),
 Counter({0: 1, 1: 0, 2: 0, 3: 3}),
 Counter({0: 0, 1: 0, 2: 0, 3: 3}),
 Counter({0: 0, 1: 5, 2: 0, 3: 0}),
 Counter({0: 0, 1: 0, 2: 3, 3: 0})]

In [3]:
# Display how the words of document 0 are split across topics.
document_topic_counts[0]

Counter({0: 7, 1: 0, 2: 0, 3: 0})

In [4]:
# Display how many times each word is assigned to topic 0.
topic_word_counts[0]

Counter({'Big Data': 3,
         'C++': 1,
         'Cassandra': 1,
         'HBase': 1,
         'Hadoop': 2,
         'Haskell': 0,
         'Java': 3,
         'Mahout': 0,
         'MapReduce': 1,
         'MongoDB': 0,
         'MySQL': 0,
         'NoSQL': 0,
         'Postgres': 0,
         'Python': 0,
         'R': 0,
         'Spark': 1,
         'Storm': 1,
         'artificial intelligence': 0,
         'databases': 0,
         'decision trees': 0,
         'deep learning': 1,
         'libsvm': 0,
         'machine learning': 0,
         'mathematics': 0,
         'neural networks': 0,
         'numpy': 0,
         'pandas': 0,
         'probability': 0,
         'programming languages': 1,
         'regression': 0,
         'scikit-learn': 0,
         'scipy': 0,
         'statistics': 0,
         'statsmodels': 0,
         'support vector machines': 0,
         'theory': 0})

In [5]:
# Display how the words of document 1 are split across topics.
document_topic_counts[1]

Counter({0: 0, 1: 5, 2: 0, 3: 0})

In [7]:
# Display how many times each word is assigned to topic 1.
topic_word_counts[1]

Counter({'Big Data': 0,
         'C++': 0,
         'Cassandra': 1,
         'HBase': 2,
         'Hadoop': 0,
         'Haskell': 0,
         'Java': 0,
         'Mahout': 0,
         'MapReduce': 0,
         'MongoDB': 2,
         'MySQL': 1,
         'NoSQL': 1,
         'Postgres': 2,
         'Python': 0,
         'R': 0,
         'Spark': 0,
         'Storm': 0,
         'artificial intelligence': 1,
         'databases': 1,
         'decision trees': 1,
         'deep learning': 1,
         'libsvm': 0,
         'machine learning': 2,
         'mathematics': 0,
         'neural networks': 2,
         'numpy': 1,
         'pandas': 0,
         'probability': 0,
         'programming languages': 0,
         'regression': 0,
         'scikit-learn': 0,
         'scipy': 1,
         'statistics': 0,
         'statsmodels': 0,
         'support vector machines': 0,
         'theory': 0})

In [14]:
# Display how many times each word is assigned to topic 2.
topic_word_counts[2]

Counter({'Big Data': 0,
         'C++': 0,
         'Cassandra': 0,
         'HBase': 0,
         'Hadoop': 0,
         'Haskell': 1,
         'Java': 0,
         'Mahout': 1,
         'MapReduce': 0,
         'MongoDB': 0,
         'MySQL': 0,
         'NoSQL': 0,
         'Postgres': 0,
         'Python': 2,
         'R': 2,
         'Spark': 0,
         'Storm': 0,
         'artificial intelligence': 0,
         'databases': 0,
         'decision trees': 0,
         'deep learning': 0,
         'libsvm': 2,
         'machine learning': 0,
         'mathematics': 1,
         'neural networks': 0,
         'numpy': 0,
         'pandas': 0,
         'probability': 0,
         'programming languages': 0,
         'regression': 3,
         'scikit-learn': 2,
         'scipy': 0,
         'statistics': 0,
         'statsmodels': 0,
         'support vector machines': 1,
         'theory': 0})

In [15]:
# Display how many times each word is assigned to topic 3.
topic_word_counts[3]

Counter({'Big Data': 0,
         'C++': 1,
         'Cassandra': 0,
         'HBase': 0,
         'Hadoop': 0,
         'Haskell': 0,
         'Java': 0,
         'Mahout': 0,
         'MapReduce': 0,
         'MongoDB': 0,
         'MySQL': 0,
         'NoSQL': 0,
         'Postgres': 0,
         'Python': 2,
         'R': 2,
         'Spark': 0,
         'Storm': 0,
         'artificial intelligence': 1,
         'databases': 0,
         'decision trees': 0,
         'deep learning': 0,
         'libsvm': 0,
         'machine learning': 0,
         'mathematics': 0,
         'neural networks': 0,
         'numpy': 0,
         'pandas': 2,
         'probability': 3,
         'programming languages': 0,
         'regression': 0,
         'scikit-learn': 0,
         'scipy': 0,
         'statistics': 3,
         'statsmodels': 2,
         'support vector machines': 0,
         'theory': 1})