In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD,NMF,LatentDirichletAllocation
from sklearn.cluster import KMeans

In [4]:
def generate_clearcut_topics():
    """Return a balanced toy corpus made of two sentences that share only "we".

    The result is a 1-D numpy array of 2000 strings: the first 1000 entries
    are "we love bergers" and the last 1000 are "we hate sandwiches".
    """
    # for demonstration purposes only, don't take it personally : )
    sentences = ["we love bergers", "we hate sandwiches"]
    copies = [1000, 1000]
    return np.repeat(sentences, copies)

def generate_unbalanced_topics():
    """Return a toy corpus in which one sentence is 100x rarer than the other.

    The result is a 1-D numpy array of 1010 strings: 10 copies of
    "we love bergers" followed by 1000 copies of "we hate sandwiches".
    """
    sentences = ["we love bergers", "we hate sandwiches"]
    copies = [10, 1000]
    return np.repeat(sentences, copies)

def generate_semantic_context_topics():
    """Return a toy corpus covering every verb/food combination equally.

    Each of the four sentences (love/hate x bergers/sandwiches) is repeated
    1000 times, in the order listed below, giving a 1-D numpy array of 4000
    strings.
    """
    sentences = [
        "we love bergers",
        "we hate bergers",
        "we love sandwiches",
        "we hate sandwiches",
    ]
    copies_each = 1000
    return np.repeat(sentences, copies_each)

def generate_noisy_topics(n_per_topic=15, random_state=None):
    """Return a small toy corpus whose topic words carry random typos.

    Each sentence is "we love <word>" or "we hate <word>", where <word> is
    "bergers" / "sandwiches" with exactly one character, at a random
    position, replaced by "X".

    Parameters
    ----------
    n_per_topic : int, default 15
        Number of noisy sentences generated for each of the two topics.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``.  When None the global
        numpy random state is used (so ``np.random.seed`` still controls the
        result, exactly as before this parameter existed).

    Returns
    -------
    numpy.ndarray
        1-D array of ``2 * n_per_topic`` strings: the "love" sentences first,
        then the "hate" sentences.
    """
    # Both np.random and RandomState expose the same randint(low, high, size).
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _random_typos(word, n):
        # One typo position per generated variant, drawn from [0, len(word)).
        typo_index = rng.randint(0, len(word), n)
        return [word[:i] + "X" + word[i + 1:] for i in typo_index]

    t1 = ["we love %s" % w for w in _random_typos("bergers", n_per_topic)]
    t2 = ["we hate %s" % w for w in _random_typos("sandwiches", n_per_topic)]
    return np.r_[t1, t2]

# Toy corpora used throughout the notebook, keyed by a short description.
sample_texts = {
    "clearcut topics": generate_clearcut_topics(),
    "unbalanced topics": generate_unbalanced_topics(),
    "semantic topics": generate_semantic_context_topics(),
    "noisy topics": generate_noisy_topics(),
}

In [6]:
from collections import Counter

# Show, for every toy corpus, its distinct sentences with their counts
# (most frequent first).  print(...) with a single argument behaves the same
# on Python 2 and Python 3, unlike the print statement.
for desc, texts in sample_texts.items():
    print(desc)
    print(Counter(texts).most_common())
    print("")

noisy topics
[('we hate saXdwiches', 5), ('we love beXgers', 5), ('we love bergerX', 3), ('we love berXers', 3), ('we hate Xandwiches', 3), ('we love bergXrs', 2), ('we hate sXndwiches', 2), ('we love Xergers', 1), ('we hate sandwiXhes', 1), ('we hate sandwicheX', 1), ('we hate sandwichXs', 1), ('we hate sandwicXes', 1), ('we hate sandXiches', 1), ('we love bergeXs', 1)]

clearcut topics
[('we love bergers', 1000), ('we hate sandwiches', 1000)]

unbalanced topics
[('we hate sandwiches', 1000), ('we love bergers', 10)]

semantic topics
[('we love bergers', 1000), ('we love sandwiches', 1000), ('we hate sandwiches', 1000), ('we hate bergers', 1000)]



In [18]:
def find_topic(texts,topic_model,n_topics,vec_model='tf',thr=1e-2,**kwargs):
    """Fit a topic model on ``texts`` and describe each topic by its keywords.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize and model.
    topic_model : str
        One of 'nmf', 'svd', 'lda' or 'kmeans'.
    n_topics : int
        Number of topics / components / clusters, passed as the first
        positional argument of the chosen estimator.
    vec_model : str, default 'tf'
        'tf' uses ``CountVectorizer``; any other value uses
        ``TfidfVectorizer``.
    thr : float, default 1e-2
        A word is reported for a topic when the absolute value of its
        normalized weight is at least ``thr``.
    **kwargs
        Extra keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    str
        One line per topic, "Topic <i>: <w1> | <w2> | ...".  Words whose
        weight is not positive are prefixed with "^".

    Raises
    ------
    KeyError
        If ``topic_model`` is not one of the supported names.
    """
    vectorizer = CountVectorizer() if vec_model == 'tf' else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)

    # Newer scikit-learn releases expose get_feature_names_out() and no longer
    # provide get_feature_names(); use whichever this installation has.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = np.asarray(get_names())

    topic_models = {'nmf': NMF, 'svd': TruncatedSVD, 'lda': LatentDirichletAllocation, 'kmeans': KMeans}
    topicfinder = topic_models[topic_model](n_topics, **kwargs).fit(text_vec)

    # Compare the string by value: `is not` tests object identity and only
    # worked by accident of string interning.
    if topic_model != "kmeans":
        raw_dists = topicfinder.components_
    else:
        raw_dists = topicfinder.cluster_centers_

    # Work on a copy so the fitted estimator's own array is not modified.
    topic_dists = np.array(raw_dists, dtype=float)

    # Scale every row by its maximum, as before.  A row whose maximum is not
    # positive would have its signs flipped (or be divided by zero), so such
    # rows are scaled by their largest magnitude instead; all-zero rows are
    # left untouched.
    row_max = topic_dists.max(axis=1)
    row_absmax = np.abs(topic_dists).max(axis=1)
    scale = np.where(row_max > 0, row_max, np.where(row_absmax > 0, row_absmax, 1.0))
    topic_dists = topic_dists / scale.reshape((-1, 1))

    def _topic_keywords(topic_dist):
        # Keep the words that are strong enough, marking non-positive weights with "^".
        selected = np.abs(topic_dist) >= thr
        return ' | '.join(
            ("" if weight > 0 else "^") + word
            for weight, word in zip(topic_dist[selected], words[selected])
        )

    return "\n".join(
        "Topic %i: %s" % (i, _topic_keywords(dist)) for i, dist in enumerate(topic_dists)
    )

In [19]:
# 4-component truncated SVD on raw term counts of the clear-cut corpus.
print(find_topic(
    texts=sample_texts['clearcut topics'],
    topic_model='svd',
    n_topics=4,
    vec_model='tf'
))

Topic 0: bergers | hate | love | sandwiches | we
Topic 1: bergers | ^hate | love | ^sandwiches
Topic 2: bergers | hate | love | sandwiches | ^we
Topic 3: ^bergers | ^hate | love | sandwiches


In [20]:
# Same 4-component truncated SVD, this time on tf-idf weighted features.
print(find_topic(
    texts=sample_texts['clearcut topics'],
    topic_model='svd',
    n_topics=4,
    vec_model='tfidf'
))

Topic 0: bergers | hate | love | sandwiches | we
Topic 1: bergers | ^hate | love | ^sandwiches
Topic 2: ^bergers | ^hate | ^love | ^sandwiches | we
Topic 3: bergers | ^hate | ^love | sandwiches


In [22]:
# 3-component truncated SVD on raw term counts of the unbalanced corpus.
print(find_topic(
    texts=sample_texts['unbalanced topics'],
    topic_model='svd',
    n_topics=3,
    vec_model='tf'
))

Topic 0: hate | sandwiches | we
Topic 1: bergers | ^hate | love | ^sandwiches | we
Topic 2: bergers | hate | love | sandwiches | ^we


In [23]:
# 2-component truncated SVD on raw term counts of the typo-ridden corpus.
print(find_topic(
    texts=sample_texts['noisy topics'],
    topic_model='svd',
    n_topics=2,
    vec_model='tf'
))

Topic 0: bergerx | bergexs | bergxrs | berxers | bexgers | hate | love | sandwichex | sandwichxs | sandwicxes | sandwixhes | sandxiches | saxdwiches | sxndwiches | we | xandwiches | xergers
Topic 1: ^bergerx | ^bergexs | ^bergxrs | ^berxers | ^bexgers | hate | ^love | sandwichex | sandwichxs | sandwicxes | sandwixhes | sandxiches | saxdwiches | sxndwiches | we | xandwiches | ^xergers


In [25]:
# K-means with 10 clusters on raw term counts of the clear-cut corpus.
print(find_topic(
    texts=sample_texts['clearcut topics'],
    topic_model='kmeans',
    n_topics=10,
    vec_model='tf'
))

Topic 0: hate | sandwiches | we
Topic 1: bergers | love | we
Topic 2: hate | sandwiches | we
Topic 3: bergers | love | we
Topic 4: bergers | love | we
Topic 5: bergers | love | we
Topic 6: bergers | love | we
Topic 7: bergers | love | we
Topic 8: bergers | love | we
Topic 9: bergers | love | we
