# Non supervised text topic detection

## LDA with Gensim

In [1]:
import os
import re
import string

#import pandas as pd

data_path = '.'

In [2]:
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
print(len(twenty_train['data']))

11314


In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

def tokenize_doc(text):
    sents = sent_tokenize(text)
    tokens = []
    for s in sents:
        for t in TreebankWordTokenizer().tokenize(s):
            if len(t)<2 or '\n' in t or '\t' in t or t.isnumeric():
                continue
            else:
                tokens += [t.lower()]
    return tokens

In [4]:
%%time
import nltk
nltk.download('punkt')

docs_tokenized = []
for text in twenty_train['data']:
    docs_tokenized += [tokenize_doc(text)]
print(len(docs_tokenized))

[nltk_data] Downloading package punkt to /Users/jorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


11314
CPU times: user 23.9 s, sys: 122 ms, total: 24 s
Wall time: 24.2 s


In [5]:
# Get the NLTK stopwords list
import nltk

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

In [7]:
# create dictionary and bag of words corpus

from gensim.corpora import Dictionary, MmCorpus
nltk.download('stopwords')
print('Building dictionary...')
dictionary = Dictionary(docs_tokenized)

# Stopwords

additional_stopwords=set(['...', '\'s','\'\'',  "``", "n\'t", "\'re", "\'m", "\'ve",
                          '--', '"', "\'ax", 'max', 'q,3'])

#additional_stopwords=set()

stopwords = nltk_stopwords().union(additional_stopwords)
stopword_ids = map(dictionary.token2id.get, stopwords)
dictionary.filter_tokens(stopword_ids)
dictionary.compactify()

# Filter extremes
dictionary.filter_extremes(no_below=5, no_above=0.95, keep_n=None)
dictionary.compactify()

print('Building corpus...')
corpus = [dictionary.doc2bow(doc) for doc in docs_tokenized]

[nltk_data] Downloading package stopwords to /Users/jorge/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Building dictionary...
Building corpus...


In [8]:
%%time

# Train the LDA model
from gensim import models

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=25, passes=10)
                                      
lda.save(os.path.join(data_path, 'newsgroups_50_lda.model'))


CPU times: user 3min 40s, sys: 24.8 s, total: 4min 5s
Wall time: 1min 3s


In [9]:
# Print summary of topics
lda.show_topics()


[(18,
  '0.017*"would" + 0.015*"one" + 0.012*"people" + 0.009*"think" + 0.007*"like" + 0.007*"even" + 0.006*"say" + 0.006*"many" + 0.006*"see" + 0.005*"know"'),
 (24,
  '0.034*"db" + 0.017*"\\/" + 0.016*"m/" + 0.016*"q\\" + 0.015*"mv" + 0.010*"sq" + 0.009*"p\\" + 0.009*"m6" + 0.008*"/-" + 0.008*"m+"'),
 (16,
  '0.020*"thanks" + 0.020*"would" + 0.017*"anyone" + 0.016*"know" + 0.014*"please" + 0.013*"like" + 0.011*"one" + 0.011*"get" + 0.011*"problem" + 0.009*"need"'),
 (2,
  '0.022*"list" + 0.010*"faq" + 0.009*"send" + 0.009*"_/" + 0.008*"min" + 0.006*"posted" + 0.006*"group" + 0.006*"random" + 0.006*"posting" + 0.006*"mail"'),
 (10,
  '0.026*"soon" + 0.021*"gordon" + 0.020*"quality" + 0.019*"motorola" + 0.019*"banks" + 0.018*"surrender" + 0.018*"marriage" + 0.016*"skepticism" + 0.015*"shameful" + 0.015*"intellect"'),
 (6,
  '0.033*"file" + 0.029*"entry" + 0.027*"*/" + 0.025*"/*" + 0.015*"output" + 0.014*"entries" + 0.013*"program" + 0.013*"section" + 0.012*"rules" + 0.010*"int"'),
 (5,

In [10]:
# get_document_topics
lda.get_document_topics(corpus[:10])


<gensim.interfaces.TransformedCorpus at 0x117716748>

In [11]:
# Print topics details
for i in range(lda.num_topics):
    print(i, lda.show_topic(i))
    print('\n')
    

0 [('b8f', 0.034056496), ('a86', 0.028083092), ('1d9', 0.021552382), ('pl+', 0.016177818), ('0t-', 0.010828815), ('7ey', 0.01075693), ('giz', 0.010474111), ('/3t', 0.009324538), ("'as", 0.008282298), (',3', 0.007772687)]


1 [('mac', 0.016375238), ('system', 0.014542106), ('memory', 0.013801628), ('bus', 0.011037328), ('apple', 0.010419637), ('speed', 0.010084982), ('ram', 0.009396631), ('data', 0.008245424), ('keyboard', 0.0077940994), ('machine', 0.00779364)]


2 [('list', 0.022212967), ('faq', 0.009787526), ('send', 0.009478541), ('_/', 0.00890162), ('min', 0.007608283), ('posted', 0.0062425067), ('group', 0.0061428696), ('random', 0.00594738), ('posting', 0.0059285676), ('mail', 0.005835992)]


3 [('drive', 0.045417838), ('disk', 0.017504357), ('hard', 0.016568689), ('drives', 0.0156257), ('sale', 0.014944168), ('price', 0.014706716), ('controller', 0.012591627), ('scsi', 0.012544425), ('new', 0.012431246), ('offer', 0.011125698)]


4 [('1st', 0.011929442), ('..', 0.011810667), ('c

# Visualization

In [12]:
# Install pyLDAvis in colab

! pip install pyldavis


In [13]:
import pyLDAvis

pyLDAvis.enable_notebook()


In [14]:
import pyLDAvis.gensim

pyLDAvis.gensim.prepare(lda, corpus, dictionary)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
