In [1]:
import random
from collections import Counter

random.seed(0)

Import data

In [2]:
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

K = 4

Create data structures for various counts

In [3]:
# Number of times each topic is assigned to each document
document_topic_counts = [Counter() for _ in documents]

# Number of times each word is assigned to each topic
topic_word_counts = [Counter() for _ in range(K)]

# Total number of words assigned to each topic
topic_counts = [0 for _ in range(K)]

# Total number of words in each document
document_lengths = [len(d) for d in documents]

# Number of distinct words
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

# Number of documents
D = len(documents)

Define the conditional probability functions

In [4]:
def probability_topic_given_document(topic, document, alpha=0.1):
    return ((document_topic_counts[document][topic] + alpha) / 
            (document_lengths[document] + K * alpha))

def probability_word_given_topic(word, topic, beta=0.1):
    return ((topic_word_counts[topic][word] + beta) /
           (topic_counts[topic] + W * beta))

Functions for calculating weights

In [5]:
def sample_from(weights):
    # returns i with probability weights[i] / sum(weights)
    
    total = sum(weights)
    rnd = total * random.random()
    for i, weight in enumerate(weights):
        rnd -= weight
        if rnd <=0:
            return i

def topic_weight(document, word, topic):
    return probability_word_given_topic(word, topic) * probability_topic_given_document(topic, document)

def choose_new_topic(document, word):
    return sample_from([topic_weight(document, word, topic) for topic in range(K)])

Initialize data structures

In [6]:
document_topics = [[random.randrange(K) for word in document] for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

Learn the topics

In [7]:
for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])):
            # Remove current word/topic from the counts so they don't influence weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1
            
            # Choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            
            # Add word/topic back to the counts
            document_topic_counts[d][topic] += 1
            topic_word_counts[topic][word] += 1
            topic_counts[topic] += 1
            document_lengths[d] += 1

Print top five words for each topic

In [8]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common(5):
        if count > 0:
            print(k, word, count)

0 scikit-learn 2
0 pandas 2
0 HBase 1
0 R 1
0 regression 1
1 neural networks 2
1 deep learning 2
1 Cassandra 1
1 HBase 1
1 Python 1
2 Java 2
2 Python 2
2 regression 2
2 R 2
2 Cassandra 1
3 Big Data 2
3 probability 2
3 Hadoop 1
3 Spark 1
3 Storm 1


In [9]:
topic_names = ['Big Data and Programming Languages', 'Python and Statistics', 'Databases', 'Machine Learning']

for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in topic_counts.most_common(5):
        if count > 0:
            print('\t', topic_names[topic], count)
    print()

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
	 Machine Learning 4
	 Databases 2
	 Big Data and Programming Languages 1

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
	 Databases 2
	 Python and Statistics 2
	 Machine Learning 1

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
	 Python and Statistics 2
	 Big Data and Programming Languages 2
	 Databases 2

['R', 'Python', 'statistics', 'regression', 'probability']
	 Big Data and Programming Languages 2
	 Databases 2
	 Machine Learning 1

['machine learning', 'regression', 'decision trees', 'libsvm']
	 Machine Learning 2
	 Databases 1
	 Python and Statistics 1

['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
	 Big Data and Programming Languages 3
	 Machine Learning 2
	 Databases 1

['statistics', 'probability', 'mathematics', 'theory']
	 Big Data and Programming Languages 1
	 Machine Learning 1
	 Databases 1
	 Python and Statistics 1

['machine learning', 'scikit-l