In [3]:
import math, random, re
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import requests

#
# TOPIC MODELING
#

def sample_from(weights):
    """Randomly pick an index, index i being chosen with probability
    proportional to weights[i].

    weights -- iterable of numbers; they are assumed to be non-negative
               (this is not checked) and need not sum to 1.

    Returns the chosen index as an int.
    Raises ValueError if no weights are supplied.
    """
    # materialize once: the weights are traversed twice (sum, then scan),
    # which would silently exhaust a generator
    weights = list(weights)
    if not weights:
        raise ValueError("sample_from() needs at least one weight")

    total = sum(weights)
    rnd = total * random.random()       # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                        # return the smallest i such that
        if rnd <= 0: return i           # sum(weights[:(i+1)]) >= rnd

    # Floating-point round-off can leave a tiny positive remainder after the
    # last weight has been subtracted; fall back to the last index instead of
    # falling off the end and returning None.
    return len(weights) - 1

# the corpus: each document is a list of "words" (interest strings)
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

# number of topics
K = 4

# number of documents
D = len(documents)

# one Counter per document: topic -> how many of its words carry that topic
document_topic_counts = [Counter() for _ in range(D)]

# one Counter per topic: word -> how many times it is assigned to that topic
topic_word_counts = [Counter() for _ in range(K)]

# total number of words assigned to each topic
topic_counts = [0] * K

# number of words in each document
document_lengths = list(map(len, documents))

# vocabulary and its size
distinct_words = {word for document in documents for word in document}
W = len(distinct_words)

def p_topic_given_document(topic, d, alpha=0.1):
    """Smoothed share of document _d_'s words that currently
    carry _topic_.

    _alpha_ is added to the topic's count (and K * alpha to the
    document length) so the result is never exactly zero."""

    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_length = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_length

def p_word_given_topic(word, topic, beta=0.1):
    """Smoothed share of the words assigned to _topic_
    that are equal to _word_.

    _beta_ is added to the word's count (and W * beta to the
    topic total) so the result is never exactly zero."""

    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + W * beta
    return smoothed_count / smoothed_total

def topic_weight(d, word, k):
    """Weight of topic _k_ for _word_ in document _d_: the product of
    p_word_given_topic(word, k) and p_topic_given_document(k, d),
    both evaluated with their default smoothing."""

    word_term = p_word_given_topic(word, k)
    document_term = p_topic_given_document(k, d)
    return word_term * document_term

def choose_new_topic(d, word):
    """Draw a topic index for _word_ in document _d_, each topic k
    being weighted by topic_weight(d, word, k)."""
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)


# fixed seed so the random starting assignment is reproducible
random.seed(0)

# one randomly drawn topic (0 .. K-1) for every word of every document
document_topics = [
    [random.randrange(K) for _ in range(len(document))]
    for document in documents
]
print(document_topics)

[[3, 3, 0, 2, 3, 3, 2], [3, 2, 1, 1, 2], [1, 0, 2, 1, 2, 0], [0, 2, 3, 0, 2], [3, 2, 1, 3], [3, 2, 0, 0, 0, 3], [0, 3, 2, 1], [2, 0, 1, 1], [1, 1, 3, 0], [0, 2, 3, 0], [2, 2, 0], [2, 1, 2, 3], [0, 3, 2], [1, 2, 1, 1, 1], [0, 2, 3]]


In [4]:
# Rebuild all three count tables from the current assignments in
# document_topics.  The tables are emptied first (in place) so that running
# this cell more than once does not count every word again.
for doc_counts in document_topic_counts:
    doc_counts.clear()
for word_counts in topic_word_counts:
    word_counts.clear()
topic_counts[:] = [0] * K

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1   # topic tally inside document d
        topic_word_counts[topic][word] += 1    # word tally inside this topic
        topic_counts[topic] += 1               # total words in this topic
print(document_topic_counts,"\n",topic_word_counts,"\n",topic_counts)

[Counter({3: 4, 2: 2, 0: 1}), Counter({2: 2, 1: 2, 3: 1}), Counter({1: 2, 0: 2, 2: 2}), Counter({0: 2, 2: 2, 3: 1}), Counter({3: 2, 2: 1, 1: 1}), Counter({0: 3, 3: 2, 2: 1}), Counter({0: 1, 3: 1, 2: 1, 1: 1}), Counter({1: 2, 2: 1, 0: 1}), Counter({1: 2, 3: 1, 0: 1}), Counter({0: 2, 2: 1, 3: 1}), Counter({2: 2, 0: 1}), Counter({2: 2, 1: 1, 3: 1}), Counter({0: 1, 3: 1, 2: 1}), Counter({1: 4, 2: 1}), Counter({0: 1, 2: 1, 3: 1})] 
 [Counter({'scikit-learn': 2, 'pandas': 2, 'HBase': 1, 'R': 1, 'regression': 1, 'Java': 1, 'C++': 1, 'Haskell': 1, 'statistics': 1, 'artificial intelligence': 1, 'Hadoop': 1, 'Big Data': 1, 'statsmodels': 1, 'libsvm': 1}), Counter({'neural networks': 2, 'deep learning': 2, 'Cassandra': 1, 'HBase': 1, 'Python': 1, 'numpy': 1, 'decision trees': 1, 'theory': 1, 'Mahout': 1, 'databases': 1, 'Postgres': 1, 'MySQL': 1, 'MongoDB': 1}), Counter({'Java': 2, 'Python': 2, 'regression': 2, 'R': 2, 'Cassandra': 1, 'MongoDB': 1, 'Postgres': 1, 'scipy': 1, 'statsmodels': 1, 'pr

In [5]:

# Repeatedly re-draw the topic of every word in every document (1000 full
# passes over the corpus).  The loop variable is deliberately not called
# `iter`: at notebook scope that would replace the built-in iter() for every
# cell executed afterwards.
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1    # denominator of p_topic_given_document

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [None]:

if __name__ == "__main__":

    # TOPIC MODELING

    # for each topic, print the words assigned to it, most frequent first
    for k, word_counts in enumerate(topic_word_counts):
        for word, count in word_counts.most_common():
            if count > 0: print(k, word, count)

    # human-readable labels, indexed by topic id (one entry per topic)
    topic_names = ["Big Data and programming languages",
                   "databases",
                   "machine learning",
                   "statistics"]

    # for each document, print its topics, most frequent first.
    # The per-document Counter must not be bound to the name `topic_counts`:
    # that is the global list of per-topic totals read by p_word_given_topic,
    # and overwriting it would corrupt any later resampling.
    for document, doc_topic_counts in zip(documents, document_topic_counts):
        print(document)
        for topic, count in doc_topic_counts.most_common():
            if count > 0:
                print(topic_names[topic], count)
        print()
