In [1]:
# Import required Python packages
import nltk
import gensim
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Download news groups dataset
news = fetch_20newsgroups().data

In [3]:
# Original topic names
original_topics = fetch_20newsgroups().target_names

In [4]:
# Dataset size
len(news)

11314

In [5]:
# First record in dataset
news[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [6]:
# Download stop words for English
stop_words = nltk.corpus.stopwords.words('english')

In [7]:
# Initialize lemmatization object for English
lemmatizer = nltk.stem.WordNetLemmatizer()

In [8]:
# Covert raw text into word tokens, select only those where length >= 3 and it's not in stop words list
# Convert all words to lowercase and remore extra whitespaces
# Perform POS tagging and convert each word to it's lemma
documents = []
for item in news:
    words = []
    _words = nltk.word_tokenize(item)
    _words = nltk.pos_tag(_words)
    for _word, _tag in _words:
        if isinstance(_word, str) and len(_word) >= 3 and not _word.isdigit() and _word not in stop_words:
            _word = _word.lower().strip()
            _word = lemmatizer.lemmatize(_word)
            words.append(_word)
    documents.append(words)    

In [9]:
# Convert tokenized documents into a dictionary (id -> term)
dictionary = gensim.corpora.Dictionary(documents)

In [10]:
# Convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [11]:
# Train LDA models for 20 topics
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=30)

In [12]:
# Print all 20 topics, and top-5 words related to each
lda.print_topics(num_topics=20, num_words=5)

[(0,
  '0.006*"dyer" + 0.006*"chi" + 0.004*"jaeger" + 0.004*"edm" + 0.004*"steve"'),
 (1,
  '0.008*"target" + 0.008*"article" + 0.008*"from" + 0.007*"organization" + 0.007*"men"'),
 (2, '0.004*"55.0" + 0.004*"a86" + 0.003*"6um" + 0.002*"/3t" + 0.002*"0el"'),
 (3,
  '0.026*"drive" + 0.012*"disk" + 0.009*"system" + 0.007*"hard" + 0.007*"controller"'),
 (4,
  '0.021*"n\'t" + 0.009*"one" + 0.008*"the" + 0.007*"people" + 0.007*"would"'),
 (5,
  '0.016*"entry" + 0.010*"file" + 0.007*"program" + 0.006*"line" + 0.006*"output"'),
 (6, '0.007*"det" + 0.007*"bos" + 0.006*"tor" + 0.006*"min" + 0.006*"que"'),
 (7,
  '0.013*"the" + 0.009*"space" + 0.003*"system" + 0.003*"year" + 0.003*"nasa"'),
 (8,
  '0.007*"the" + 0.006*"health" + 0.005*"medical" + 0.004*"center" + 0.004*"research"'),
 (9,
  '0.023*"..." + 0.019*"line" + 0.018*"from" + 0.018*"subject" + 0.017*"organization"'),
 (10,
  '0.016*"line" + 0.015*"from" + 0.015*"subject" + 0.014*"organization" + 0.011*"n\'t"'),
 (11, '0.564*"\'ax" + 0.04

In [13]:
# Save the model
lda.save('news_groups.lda')

In [14]:
# Print original topic names
original_topics

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']