# Non supervised text topic detection

## LDA with Gensim

In [1]:
import os
import re
import string

#import pandas as pd

data = '../data'

In [2]:
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
print(len(twenty_train['data']))

11314


In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

def tokenize_doc(text):
    sents = sent_tokenize(text)
    tokens = []
    for s in sents:
        for t in TreebankWordTokenizer().tokenize(s):
            if len(t)<2 or '\n' in t or '\t' in t or t.isnumeric():
                continue
            else:
                tokens += [t.lower()]
    return tokens

In [4]:
%%time

docs_tokenized = []
for text in twenty_train['data']:
    docs_tokenized += [tokenize_doc(text)]
print(len(docs_tokenized))

11314
CPU times: user 24 s, sys: 265 ms, total: 24.3 s
Wall time: 24.5 s


In [6]:
# Get the NLTK stopwords list
import nltk

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

In [11]:
# create dictionary and bag of words corpus

from gensim.corpora import Dictionary, MmCorpus
    
print('Building dictionary...')
dictionary = Dictionary(docs_tokenized)

# Stopwords

additional_stopwords=set(['...', '\'s','\'\'',  "``", "n\'t", "\'re", "\'m", "\'ve",
                          '--', '"', "\'ax", 'max', 'q,3'])

#additional_stopwords=set()

stopwords = nltk_stopwords().union(additional_stopwords)
stopword_ids = map(dictionary.token2id.get, stopwords)
dictionary.filter_tokens(stopword_ids)
dictionary.compactify()

# Filter extremes
dictionary.filter_extremes(no_below=5, no_above=0.95, keep_n=None)
dictionary.compactify()

print('Building corpus...')
corpus = [dictionary.doc2bow(doc) for doc in docs_tokenized]

Building dictionary...
Building corpus...


In [12]:
%%time

# Train the LDA model

from gensim import models

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=25, passes=10)
                                      
lda.save(os.path.join(data,'newsgroups_50_lda.model'))

CPU times: user 1min 6s, sys: 1.73 s, total: 1min 7s
Wall time: 1min 7s


In [13]:
lda.show_topics()

[(4,
  '0.021*"mr." + 0.019*"president" + 0.013*"stephanopoulos" + 0.007*"tax" + 0.007*"going" + 0.006*"jobs" + 0.005*"year" + 0.005*"states" + 0.005*"american" + 0.005*"new"'),
 (2,
  '0.020*"god" + 0.016*"people" + 0.010*"would" + 0.009*"one" + 0.008*"jesus" + 0.007*"us" + 0.006*"say" + 0.005*"life" + 0.005*"even" + 0.005*"believe"'),
 (5,
  '0.022*"year" + 0.017*"gm" + 0.015*"player" + 0.014*"season" + 0.012*"players" + 0.012*"good" + 0.011*"last" + 0.010*"team" + 0.010*"john" + 0.008*"mike"'),
 (8,
  '0.016*"government" + 0.013*"gun" + 0.012*"law" + 0.011*"state" + 0.009*"right" + 0.009*"rights" + 0.008*"public" + 0.007*"guns" + 0.007*"control" + 0.007*"would"'),
 (14,
  '0.010*"software" + 0.010*"data" + 0.010*"use" + 0.009*"image" + 0.007*"computer" + 0.007*"color" + 0.007*"internet" + 0.006*"system" + 0.006*"images" + 0.006*"available"'),
 (24,
  '0.015*"israel" + 0.014*"war" + 0.012*"turkish" + 0.012*"jews" + 0.012*"armenian" + 0.010*"israeli" + 0.009*"armenians" + 0.008*"peopl

In [20]:
lda.get_document_topics(corpus[:10])

<gensim.interfaces.TransformedCorpus at 0x133cad240>

In [19]:
for i in range(lda.num_topics):
    print(i, lda.show_topic(i))
    print('\n')

0 [('one', 0.014967264), ('evidence', 0.007125512), ('true', 0.006811183), ('believe', 0.006538611), ('would', 0.0064531323), ('point', 0.005327043), ('book', 0.0052255443), ('many', 0.005205821), ('exist', 0.0048122657), ('claim', 0.0047982084)]


1 [('article', 0.010416013), ('master', 0.009780715), ('read', 0.0069780317), ('wiring', 0.006480035), ('posting', 0.0058471775), ('may', 0.005492524), ('context', 0.0054858625), ('given', 0.0050711967), ('faq', 0.0049202973), ('part', 0.0048426054)]


2 [('god', 0.020054193), ('people', 0.015518317), ('would', 0.00987888), ('one', 0.008836058), ('jesus', 0.008279402), ('us', 0.0074752066), ('say', 0.0056957416), ('life', 0.005287182), ('even', 0.0048442464), ('believe', 0.004740599)]


3 [('thanks', 0.020954615), ('please', 0.02049899), ('anyone', 0.016487125), ('would', 0.013559956), ('know', 0.012694959), ('email', 0.009846773), ('help', 0.0094499495), ('send', 0.00923283), ('e-mail', 0.00881854), ('information', 0.0074023246)]


4 [('mr.

# Visualization

In [14]:
import pyLDAvis

pyLDAvis.enable_notebook()

In [15]:
import pyLDAvis.gensim

pyLDAvis.gensim.prepare(lda, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
