# Unsupervised text topic detection

## LDA with Gensim

In [13]:
import os
from glob import glob
import re
import string
import funcy as fp

import pandas as pd

# Relative path of the directory the trained LDA model is saved into.
data = '../data'

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Training split of the 20 newsgroups corpus with headers, footers and quoted
# replies removed; shuffled with a fixed seed so document order is stable.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)
print(len(twenty_train['data']))

11314


In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

def tokenize_doc(text):
    """Split a raw document into lower-cased word tokens.

    The text is split into sentences, each sentence is tokenized with the
    Treebank word tokenizer, and a token is dropped when it is shorter than
    two characters, contains a newline or tab, or is purely numeric.

    Args:
        text: Document text as a single string.

    Returns:
        A flat list of lower-cased tokens in document order.
    """
    # One tokenizer instance serves the whole document; there is no need to
    # build a new one for every sentence.
    word_tokenizer = TreebankWordTokenizer()
    tokens = []
    for sentence in sent_tokenize(text):
        for token in word_tokenizer.tokenize(sentence):
            if len(token) < 2 or '\n' in token or '\t' in token or token.isnumeric():
                continue
            tokens.append(token.lower())
    return tokens

In [5]:
%%time

# Tokenize every training document, keeping one token list per document.
docs_tokenized = [tokenize_doc(raw_text) for raw_text in twenty_train['data']]
print(len(docs_tokenized))

11314
CPU times: user 22.6 s, sys: 113 ms, total: 22.7 s
Wall time: 22.7 s


In [6]:
# Get the NLTK stopwords list
import nltk

def nltk_stopwords():
    """Return the NLTK English stopword list as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

In [7]:
# create dictionary and bag of words corpus

from gensim.corpora import Dictionary, MmCorpus
    
print('Building dictionary...')
dictionary = Dictionary(docs_tokenized)

# Stopwords: the NLTK English list plus extra tokens to drop (contraction
# fragments, quote marks, ellipses and a few corpus-specific strings).
additional_stopwords=set(['...', '\'s','\'\'',  "``", "n\'t", "\'re", "\'m", "\'ve",
                          '--', '"', "\'ax", 'max', 'q,3'])

stopwords = nltk_stopwords().union(additional_stopwords)
# Only stopwords that actually occur in the vocabulary have an ID; skip the
# others instead of handing None placeholders to the dictionary.
stopword_ids = [dictionary.token2id[word] for word in stopwords
                if word in dictionary.token2id]
dictionary.filter_tokens(bad_ids=stopword_ids)

# Filter extremes: drop tokens that appear in fewer than 5 documents or in
# more than half of them; keep_n=None puts no cap on the vocabulary size.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
# Make sure token IDs are contiguous before the corpus is encoded.
dictionary.compactify()

print('Building corpus...')
corpus = [dictionary.doc2bow(doc) for doc in docs_tokenized]

Building dictionary...
Building corpus...


In [14]:
%%time

# Train the LDA model

from gensim import models

# Create the output directory up front so a missing folder cannot make the
# save fail after the slow training step has already finished.
os.makedirs(data, exist_ok=True)

# LDA training is stochastic; a fixed seed (the same one used for the dataset
# shuffle) makes the learned topics reproducible across runs.
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10,
                               random_state=42)

lda.save(os.path.join(data,'newsgroups_50_lda.model'))

CPU times: user 1min 25s, sys: 1.43 s, total: 1min 26s
Wall time: 1min 25s


In [15]:
# Inspect a sample of the learned topics: each entry is a
# (topic_id, weighted-terms string) pair.
lda.show_topics()

[(2,
  '0.019*"new" + 0.019*"apple" + 0.015*"cd" + 0.015*"shipping" + 0.013*"cover" + 0.013*"dos" + 0.012*"copy" + 0.011*"appears" + 0.011*"copies" + 0.011*"price"'),
 (35,
  '0.021*"book" + 0.021*"jumper" + 0.020*"myers" + 0.019*".." + 0.017*"manual" + 0.015*"reference" + 0.012*"turbo" + 0.012*"de" + 0.011*"ms" + 0.010*"page"'),
 (28,
  '0.018*"trial" + 0.013*"dog" + 0.013*"news" + 0.012*"new" + 0.012*"media" + 0.010*"joseph" + 0.009*"washington" + 0.009*"authorities" + 0.008*"court" + 0.008*"attorney"'),
 (26,
  '0.037*"gun" + 0.020*"guns" + 0.019*"crime" + 0.018*"control" + 0.016*"police" + 0.013*"weapon" + 0.013*"firearms" + 0.012*"weapons" + 0.011*"firearm" + 0.010*"law"'),
 (8,
  '0.031*"team" + 0.031*"game" + 0.023*"games" + 0.021*"hockey" + 0.020*"play" + 0.017*"season" + 0.016*"league" + 0.013*"san" + 0.013*"nhl" + 0.012*"period"'),
 (41,
  '0.020*"car" + 0.013*"price" + 0.013*"new" + 0.011*"get" + 0.009*"like" + 0.009*"bike" + 0.008*"used" + 0.008*"buy" + 0.008*"good" + 0.007

# Visualization

In [16]:
import pyLDAvis

# Switch pyLDAvis to notebook display mode so prepared visualizations render
# inline in cell output.
pyLDAvis.enable_notebook()

In [17]:
# Newer pyLDAvis releases renamed the gensim adapter module to
# `gensim_models`; fall back to the old name so the cell runs on both.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Last expression of the cell, so the prepared visualization is displayed.
gensimvis.prepare(lda, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
