In [1]:
import itertools
import gensim
from gifter.modeling.data import lemmatized_frame
from gifter.modeling.tokenizer import lemmatize
from gensim import corpora, models
from nltk.corpus import brown

In [2]:
def LDA(texts="../data/1/data.json",documents_list=[],num=2, passes=100,save_model_as='lda.model', save_dic_as='dictionary.dic'):
    if not documents_list:
        df = lemmatized_frame(open(texts, "r"), with_tags=False)
        texts = [df['lemmas'].irow(i) for i in range(df.shape[0])]
    else:
        texts2 = []
        for d in documents_list:
            texts2 = texts2 + [lemmatize(d, with_tags=False)]
            texts = texts2
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    #print "corpus: " + str(corpus)
    corpora.MmCorpus.serialize('./corpus.mm', corpus)
    mm_corpus = corpora.MmCorpus('./corpus.mm')
    #print "mm_corpus: " + str(mm_corpus)
    id2word = {}
    for word in dictionary.token2id:
        id2word[dictionary.token2id[word]] = word
    lda = gensim.models.ldamulticore.LdaMulticore(
        corpus=mm_corpus,
        num_topics=num,
        id2word=id2word,
        eval_every=1,
        passes=passes,
        workers=1
    )
    lda.save(save_model_as)
    dictionary.save(save_dic_as)
    for i in range(0, lda.num_topics):
        print "Topic number " + str(i) + " consists of words : " + lda.print_topic(i)
    return lda, dictionary

In [3]:
l, dic = LDA()

  pattern = re.compile("(#|RT |{})".format(unidecode(to_remove)), re.I)


Topic number 0 consists of words : 0.015*state + 0.012*lead + 0.010*secretary + 0.009*curry + 0.008*minister + 0.007*conservative + 0.007*tonight + 0.007*amp + 0.006*new + 0.006*vote
Topic number 1 consists of words : 0.029*fcb + 0.012*win + 0.011*title + 0.010*atleti + 0.009*game + 0.009*atm + 0.009*campion + 0.008*climate + 0.008*president + 0.007*league


In [4]:
l2 = models.LdaModel.load('lda.model')

In [5]:
dic2 = corpora.Dictionary.load('dictionary.dic')

In [6]:
D = [
    "I like to eat broccoli and bananas.",
    "I ate a banana and spinach smoothie for breakfast.",
    "Chinchillas and kittens are cute.",
    "My sister adopted a kitten yesterday.",
    "Look at this cute hamster munching on a piece of broccoli."
 ]

In [8]:
l, dic = LDA(documents_list=D)

Topic number 0 consists of words : 0.119*cute + 0.119*kitten + 0.072*broccoli + 0.071*piece + 0.071*munch + 0.071*look + 0.071*hamster + 0.071*yesterday + 0.071*sister + 0.071*adopt
Topic number 1 consists of words : 0.155*eat + 0.155*banana + 0.093*spinach + 0.093*smoothie + 0.093*breakfast + 0.093*broccoli + 0.032*chinchilla + 0.032*kitten + 0.032*cute + 0.032*yesterday


In [7]:
l, dic = LDA(texts=brown.words(), num=40, passes=1)

Topic number 0 consist of words : 0.130*tell + 0.116*thing + 0.084*turn + 0.069*woman + 0.029*job + 0.029*field + 0.025*line + 0.024*bit + 0.022*air + 0.021*clear
Topic number 1 consist of words : 0.099*mr + 0.074*night + 0.062*live + 0.032*rest + 0.025*early + 0.017*coffee + 0.017*force + 0.016*sick + 0.014*suit + 0.013*studio
Topic number 2 consist of words : 0.032*change + 0.022*hate + 0.016*lose + 0.014*eat + 0.013*freddy + 0.012*layer + 0.012*blanche + 0.011*subject + 0.010*pet + 0.010*doll
Topic number 3 consist of words : 0.209*man + 0.055*miss + 0.043*smile + 0.035*book + 0.033*minute + 0.026*cousin + 0.023*finger + 0.022*pretty + 0.020*ready + 0.016*paint
Topic number 4 consist of words : 0.218*look + 0.126*way + 0.078*sit + 0.035*half + 0.032*catch + 0.031*meet + 0.020*heavy + 0.019*church + 0.017*write + 0.016*clearly
Topic number 5 consist of words : 0.223*know + 0.067*place + 0.062*open + 0.054*stand + 0.042*set + 0.041*grow + 0.032*speak + 0.026*realize + 0.024*touch + 0.

In [9]:
def find_topic(new_doc="../data/data.json",documents_list=[], dictionary=dic, lda=l):
    if not documents_list:
        df = lemmatized_frame(open(new_doc, "r"), with_tags=False)
        new_doc = list(
            itertools.chain(
                *[df['lemmas'].irow(i) for i in range(df.shape[0])]
            )
        )
    else:
        new_doc = new_doc.lower().split()
    new_vec = dictionary.doc2bow(new_doc)
    vec_lda = sorted(lda[new_vec], key=lambda vec: vec[1], reverse=True)
    print str(vec_lda)
    print "Top topic is topic number " + str(
        vec_lda[0][0]
    ) + " consists of words : " + lda.print_topic(vec_lda[0][0])
    return vec_lda

In [10]:
%time find_topic(dictionary=dic2,lda=l2)

[(0, 0.52099177690025189), (1, 0.47900822309974816)]
Top topic is topic number 0 consists of words : 0.015*state + 0.012*lead + 0.010*secretary + 0.009*curry + 0.008*minister + 0.007*conservative + 0.007*tonight + 0.007*amp + 0.006*new + 0.006*vote
CPU times: user 1.53 s, sys: 64.2 ms, total: 1.59 s
Wall time: 1.59 s


[(0, 0.52099177690025189), (1, 0.47900822309974816)]

In [12]:
find_topic(documents_list = "I love watching films about animals with my family and friends. I will write review of the film we watched yesterday")

[(0, 0.5), (1, 0.5)]
Top topic is topic number 0 consists of words : 0.119*cute + 0.119*kitten + 0.072*broccoli + 0.071*piece + 0.071*munch + 0.071*look + 0.071*hamster + 0.071*yesterday + 0.071*sister + 0.071*adopt


[(0, 0.5), (1, 0.5)]