# Topic Modelling
## Using LDA to extract the topics discussed in the '20 Newsgroups' dataset

In [1]:
import sys
import re, numpy as np, pandas as pd

In [2]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
#from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Only emit log records at ERROR level or above, prefixed with timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

### We will use a portion of the 20 Newsgroups dataset
#### Let us select these topics from the dataset: Christianity, Hockey, MidEast and Motorcycles.

In [15]:
# Import Dataset
# Read the newsgroups dump and, in the same chain, keep only the rows whose
# `target_names` is one of the four newsgroups analysed in this notebook.
df = (
    pd.read_json('newsgroups.json')
      .loc[lambda posts: posts.target_names.isin(['soc.religion.christian',
                                                  'rec.sport.hockey',
                                                  'talk.politics.mideast',
                                                  'rec.motorcycles']), :]
)
print(df.shape)

(2361, 3)


In [16]:
# Raise the displayed column width to 125 so more of each post's text is
# visible in the preview below.
pd.set_option('max_colwidth', 125)
# Show the first rows of the filtered frame.
df.head()

Unnamed: 0,content,target,target_names
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstein)\nSubject: Re: Recommendation on Duc\nSummary: What's it worth?\nDistribu...,8,rec.motorcycles
10007,From: jet@netcom.Netcom.COM (J. Eric Townsend)\nSubject: Re: Insurance and lotsa points...\nIn-Reply-To: cjackson@adobe.c...,8,rec.motorcycles
10008,From: gld@cunixb.cc.columbia.edu (Gary L Dare)\nSubject: Re: ABC coverage\nNntp-Posting-Host: cunixb.cc.columbia.edu\nRep...,10,rec.sport.hockey
10017,From: nstramer@supergas.dazixco.ingr.com (Naftaly Stramer)\nSubject: Peace talks ...\nNntp-Posting-Host: supergas\nReply-...,17,talk.politics.mideast
10019,"From: mussack@austin.ibm.com (Christopher Mussack)\nSubject: Re: Questioning Authority\nLines: 60\n\nDespite my trendy, l...",15,soc.religion.christian


In [17]:
# NLTK Stop words
from nltk.corpus import stopwords

# NLTK's English stop words followed by extra words to filter out (header
# words such as 'from'/'subject' plus common filler verbs and adverbs).
# NOTE: 'even' is listed twice; this is harmless for membership tests.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
    'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
    'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather',
    'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',
    'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come',
]

### Tokenize and Clean
#### Remove e-mail addresses, newline characters and single quotes, then split each document into a list of words

In [18]:
def sent_to_words(sentences):
    """Lazily clean and tokenize each document.

    For every item of ``sentences`` the text is cleaned in three steps
    (tokens containing ``@`` are removed, whitespace runs are collapsed to a
    single space, single quotes are dropped) and the result is tokenized with
    ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Args:
        sentences: Iterable of documents. Each item is coerced with ``str``
            before cleaning, so non-string items no longer break the regex
            substitutions.

    Yields:
        The token list returned by ``simple_preprocess`` for each document,
        in input order.
    """
    # Raw strings: '\S' and '\s' inside a plain string literal are invalid
    # escape sequences and trigger a warning on recent Python versions.
    # Compiling once also hoists the pattern lookup out of the loop.
    email_pattern = re.compile(r'\S*@\S*\s?')
    whitespace_pattern = re.compile(r'\s+')

    for sent in sentences:
        sent = str(sent)  # coerce first; re.sub would raise on non-str input
        sent = email_pattern.sub('', sent)  # remove emails
        sent = whitespace_pattern.sub(' ', sent)  # remove newline chars
        sent = sent.replace("'", "")  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

In [19]:
# Convert to list
# Pull the raw post texts out of the frame, then drain the sent_to_words
# generator so every post becomes a list of tokens.
data = df.content.values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])

[['from', 'irwin', 'arnstein', 'subject', 're', 'recommendation', 'on', 'duc', 'summary', 'whats', 'it', 'worth', 'distribution', 'usa', 'expires', 'sat', 'may', 'gmt', 'organization', 'computrac', 'inc', 'richardson', 'tx', 'keywords', 'ducati', 'gts', 'how', 'much', 'lines', 'have', 'line', 'on', 'ducati', 'gts', 'model', 'with', 'on', 'the', 'clock', 'runs', 'very', 'well', 'paint', 'is', 'the', 'bronze', 'brown', 'orange', 'faded', 'out', 'leaks', 'bit', 'of', 'oil', 'and', 'pops', 'out', 'of', 'st', 'with', 'hard', 'accel', 'the', 'shop', 'will', 'fix', 'trans', 'and', 'oil', 'leak', 'they', 'sold', 'the', 'bike', 'to', 'the', 'and', 'only', 'owner', 'they', 'want', 'and', 'am', 'thinking', 'more', 'like', 'any', 'opinions', 'out', 'there', 'please', 'email', 'me', 'thanks', 'it', 'would', 'be', 'nice', 'stable', 'mate', 'to', 'the', 'beemer', 'then', 'ill', 'get', 'jap', 'bike', 'and', 'call', 'myself', 'axis', 'motors', 'tuba', 'irwin', 'honk', 'therefore', 'am', 'computrac', 'r

### Build the Bigram, Trigram Models and Lemmatize

In [24]:
# Build the bigram and trigram models
# higher threshold fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
# The trigram model is trained on the bigram-transformed corpus, so it is
# meant to be applied on top of the bigram model's output.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Wrap each trained model in a Phraser; these wrappers are the objects that
# process_words indexes with a token list to merge detected phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

### Lemmatize each word to its root form, keeping only nouns, adjectives, verbs and adverbs

In [25]:
def process_words(texts, stop_words=stop_words, 
                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stop words, merge bigrams/trigrams and lemmatize.

    Pipeline, applied to every document in ``texts``:

    1. Re-tokenize with ``simple_preprocess`` and drop stop words.
    2. Merge phrases with the notebook-level ``bigram_mod`` / ``trigram_mod``.
    3. Lemmatize with spaCy, keeping only tokens whose coarse part-of-speech
       tag is in ``allowed_postags``.
    4. Re-tokenize the lemmas and drop stop words once more, because
       lemmatization can turn a word into a stop word.

    Args:
        texts: Iterable of tokenized documents (lists of words).
        stop_words: Collection of words to discard.
        allowed_postags: spaCy ``token.pos_`` values to keep.

    Returns:
        A list with one list of processed tokens per input document.
    """
    # Set membership is O(1) per token; a list would be scanned for every word.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    texts = [[word for word in simple_preprocess(str(doc))
              if word not in stop_set] for doc in texts]
    # NOTE(review): the bigram model ends up being applied twice (here and
    # inside the trigram step). Kept as in the original to preserve output;
    # the first pass looks redundant - confirm before removing.
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # The 'en' shortcut name is not available in spaCy 3, so load the model by
    # its package name and only fall back to the shortcut on older installs.
    try:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    except OSError:
        nlp = spacy.load('en', disable=['parser', 'ner'])

    # nlp.pipe processes the documents as a stream instead of one call each.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = [[token.lemma_ for token in doc if token.pos_ in allowed]
                 for doc in nlp.pipe(joined_docs)]

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc))
                  if word not in stop_set] for doc in texts_out]
    return texts_out

data_ready = process_words(data_words)

### Build the Topic Model

In [26]:
# Create Dictionary
# Build the token <-> integer id mapping from the processed documents.
id2word = corpora.Dictionary(data_ready)
print(id2word)

Dictionary(24628 unique tokens: ['accel', 'arnstein', 'axis', 'beemer', 'bike']...)


In [27]:
# Create Corpus: Term Document Frequency
# Each document becomes a list of (word id, count) pairs, as the printed
# sample for the first document shows.
corpus = [id2word.doc2bow(text) for text in data_ready]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]]


#### In the corpus above, (0, 1) means that word id 0 occurs once in the first document. Likewise, (4, 2) means that word id 4 occurs twice, and so on.
#### If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

In [28]:
# Look up the token stored under id 0 in the dictionary.
id2word[0]

'accel'

#### chunksize is the number of documents to be used in each training chunk. 
#### update_every determines how often the model parameters should be updated 
#### passes is the total number of training passes.

In [29]:
# Build LDA model
# num_topics=4 matches the four newsgroups selected when loading the data.
# random_state is fixed, presumably so repeated runs give the same topics.
# chunksize / update_every / passes control the training schedule: documents
# per training chunk, how often parameters are updated, and the number of
# full passes over the corpus.
# per_word_topics=True makes the model return per-word topic information in
# addition to the document topics; format_topics_sentences accounts for this
# by taking the first element of each result.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [30]:
# Print each topic id together with its highest-weighted words.
print(lda_model.print_topics())

[(0, '0.021*"team" + 0.021*"game" + 0.015*"hockey" + 0.012*"nhl" + 0.011*"player" + 0.010*"play" + 0.010*"win" + 0.010*"hawk" + 0.010*"season" + 0.008*"year"'), (1, '0.020*"armenian" + 0.011*"israel" + 0.009*"israeli" + 0.008*"people" + 0.008*"state" + 0.007*"kill" + 0.006*"article" + 0.006*"government" + 0.005*"turk" + 0.005*"center"'), (2, '0.010*"people" + 0.009*"god" + 0.007*"christian" + 0.007*"believe" + 0.007*"write" + 0.007*"time" + 0.006*"question" + 0.006*"law" + 0.005*"organization" + 0.005*"book"'), (3, '0.012*"organization" + 0.012*"write" + 0.011*"article" + 0.009*"time" + 0.009*"bike" + 0.008*"new" + 0.007*"well" + 0.007*"look" + 0.004*"university" + 0.004*"bad"')]


### What is the dominant topic, and its percentage contribution, in each document?

In [31]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Tabulate the dominant topic of every document.

    Args:
        ldamodel: Trained LDA model. It must support ``ldamodel[corpus]``,
            ``show_topic(topic_id)`` and expose ``per_word_topics``.
        corpus: Bag-of-words corpus to score, one entry per document.
        texts: Per-document payload (raw text or token lists) appended as the
            last column; expected to be aligned with ``corpus``.

    Returns:
        A DataFrame with one row per document and the columns
        ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``,
        followed by an unnamed column holding ``texts``. ``Dominant_Topic``
        is an integer column unless a document has no topic at all, in which
        case that row holds NaN/None placeholders.

    Raises:
        TypeError: If ``ldamodel`` is not provided.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires an 'ldamodel'")

    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # show_topic() result per topic, computed once
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the list of (topic id, proportion) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so the rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # max() returns the first pair with the highest proportion, the same
        # element a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(float(prop_topic), 4),
                        keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append is gone from pandas 2.x and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [32]:
# Score every document with the trained model. `texts=data_ready` is passed,
# so the last column holds the processed token lists rather than the raw posts.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, 
                                                  texts=data_ready)

# Format
# reset_index() turns the row index into a regular column, which becomes
# 'Document_No' once the columns are renamed.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 
                             'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3.0,0.8938,"organization, write, article, time, bike, new, well, look, university, bad","[irwin, arnstein, recommendation, duc, summary, worth, expire, sit, gmt, organization, computrac, inc, richardson_tx, key..."
1,1,3.0,0.6377,"organization, write, article, time, bike, new, well, look, university, bad","[eric, townsend, insurance, lotsa_point, reply, message, mon, organization, netcom_online, communication, service, cjacks..."
2,2,0.0,0.4612,"team, game, hockey, nhl, player, play, win, hawk, season, year","[gary_dare, reply, gary_dare, organization, phds, hall, caleb, cohen, write, boy, everyone, rip, espns, hockey, coverage,..."
3,3,1.0,0.675,"armenian, israel, israeli, people, state, kill, article, government, turk, center","[naftaly_stramer, peace_talk, reply, organization, israeline, today, maariv, report, yesterday, follow, egyptian, preside..."
4,4,2.0,0.7443,"people, god, christian, believe, write, time, question, law, organization, book","[question, authority, trendy, liberal, feminist, tendency, fact, basically, agree, rebut, write, chris_mussack, write, pe..."
5,5,2.0,0.6329,"people, god, christian, believe, write, time, question, law, organization, book","[harrasse, work, prayer, reply, organization, university, chicago, repeat, emphasize, someone_else, feel, horrible, worth..."
6,6,3.0,0.8103,"organization, write, article, time, bike, new, well, look, university, bad","[andrew_infante, little_hasty, organization, duke, university, durham, apparently, last, little, hasy, call, place, quote..."
7,7,2.0,0.9511,"people, god, christian, believe, write, time, question, law, organization, book","[james_sledd, afterlife, organization, social, science, computing, way, look, die, release, arc, time, able, comprehend, ..."
8,8,0.0,0.5199,"team, game, hockey, nhl, player, play, win, hawk, season, year","[canadien, stanley_cup, organization, canada, dean, pereira, write, kind, team, montreal, cup, problem, everyone, steal, ..."
9,9,0.0,0.8704,"team, game, hockey, nhl, player, play, win, hawk, season, year","[steve_gallichio, possible, canadian, wc, team, organization, cadkey_inc, point, team, canadian, nhler, playoff, bind, ce..."


### Visualize

In [33]:
import pyLDAvis
try:
    # pyLDAvis 3.x renamed the gensim adapter module to `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older pyLDAvis releases still ship it as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
