In [4]:
# Fetch the NLTK stop-word corpus (the download is skipped when it is already up to date)
import nltk
nltk.download('stopwords')

# spaCy additionally needs its English model, which is installed from a terminal
# rather than from Python (see the `spacy download` note next to spacy.load)
import spacy

[nltk_data] Downloading package stopwords to /Users/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Regex
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Silence DeprecationWarning noise raised by the libraries used in this notebook
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Optional: route gensim's log records through the root logger, showing errors only
import logging
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s : %(levelname)s : %(message)s')

In [6]:
# English stop words from NLTK, extended with tokens that are treated as noise in
# this corpus ('from', 'subject', 're', 'edu', 'use')
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [7]:
# Read the newsgroups JSON file from GitHub into a DataFrame
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# List the distinct newsgroup labels, then preview the first five rows
print(df['target_names'].unique())
df.head(5)

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [8]:
# Convert the `content` column to a plain Python list of document strings
data = df.content.values.tolist()

# NOTE: the patterns below are raw strings. '\S' and '\s' are not valid Python string
# escapes, so the non-raw spellings emit invalid-escape warnings on newer Python versions.

# Remove e-mail addresses: any run of non-space characters around '@',
# plus one trailing whitespace character if present
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every run of whitespace (including newlines) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes (plain substring replacement; no regex needed)
data = [sent.replace("'", "") for sent in data]




In [10]:
def sent_to_words(sentences):
    """Yield one token list per document, produced by gensim's ``simple_preprocess``.

    Every item of ``sentences`` is coerced to ``str`` before tokenizing.
    ``deacc=True`` asks gensim to strip accent marks from the tokens.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        tokens = tokenize(str(raw_text), deacc=True)
        yield tokens

# Materialize the tokenizer generator: one list of tokens per document
data_words = [tokens for tokens in sent_to_words(data)]

# Preview the first tokenized document
print(data_words[0:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [11]:
# Train the phrase detectors. A higher `threshold` yields fewer phrases.
# The trigram detector is trained on documents whose bigrams are already merged.
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap each detector in a Phraser, which is the faster way to apply it to a document
# (indexing `bigram` / `trigram` directly gives the same output)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document with bigrams, then trigrams, merged
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` re-tokenized with every stop word removed.

    Each document is converted with ``str()`` and passed through
    ``simple_preprocess`` again before filtering, exactly as before; tokens
    found in the module-level ``stop_words`` are dropped.

    Parameters
    ----------
    texts : iterable
        Documents (here: lists of tokens).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # `stop_words` list is linear in its length and happens for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with the allowed POS tags.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. A token is kept (as ``token.lemma_``)
    only when its ``token.pos_`` is in ``allowed_postags``.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.
    allowed_postags : list of str
        POS tags to keep. The default list is only read, never mutated.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.

    POS tag reference: https://spacy.io/api/annotation
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the overhead of one separate nlp(...) call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [13]:
# Drop stop words from the tokenized documents
data_words_nostops = remove_stopwords(data_words)

# Merge detected word pairs into single bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's English pipeline with the parser and NER components disabled (for efficiency).
# The model is loaded by its package name: the bare 'en' shortcut link no longer exists
# in spaCy v3, whereas the package name is accepted by both v2 and v3.
# Install it from a terminal with:
#   python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the first lemmatized document
print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [14]:
# The lemmatized documents are the texts the model is trained on
texts = data_lemmatized

# Map every distinct token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Preview the first document's bag of words
print(corpus[0:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)]]


In [16]:
# Fit a 20-topic LDA model on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=200,      # fixed seed so the fit is repeatable
    update_every=1,
    chunksize=200,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)

In [17]:
# Show the 5 highest-weighted keywords of each of the 20 topics
topic_summaries = lda_model.print_topics(num_words=5, num_topics=20)
pprint(topic_summaries)

# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

[(0,
  '0.039*"believe" + 0.036*"religion" + 0.034*"man" + 0.030*"faith" + '
  '0.030*"christian"'),
 (1,
  '0.023*"child" + 0.023*"people" + 0.021*"war" + 0.021*"kill" + '
  '0.019*"publish"'),
 (2,
  '0.022*"space" + 0.019*"high" + 0.016*"cost" + 0.016*"power" + 0.014*"low"'),
 (3,
  '0.022*"people" + 0.015*"say" + 0.013*"may" + 0.010*"mean" + 0.010*"many"'),
 (4,
  '0.151*"file" + 0.067*"image" + 0.039*"format" + 0.037*"page" + '
  '0.028*"picture"'),
 (5,
  '0.096*"team" + 0.082*"game" + 0.050*"year" + 0.041*"win" + 0.027*"score"'),
 (6,
  '0.020*"line" + 0.020*"thank" + 0.019*"program" + 0.017*"include" + '
  '0.016*"mail"'),
 (7,
  '0.051*"drug" + 0.040*"pin" + 0.033*"univ" + 0.031*"pain" + 0.031*"patient"'),
 (8, '0.074*"sin" + 0.069*"scsi" + 0.049*"ide" + 0.044*"pen" + 0.040*"cd"'),
 (9,
  '0.102*"entry" + 0.052*"sun" + 0.041*"rule" + 0.027*"setup" + '
  '0.027*"decent"'),
 (10,
  '0.184*"drive" + 0.121*"car" + 0.037*"bike" + 0.035*"tape" + 0.029*"ride"'),
 (11,
  '0.906*"ax" +

In [19]:
# Perplexity: log_perplexity returns gensim's per-word likelihood bound (a negative
# number here). NOTE(review): for this bound, values closer to zero indicate a better
# fit; "lower is better" applies to the perplexity derived from it — confirm in gensim docs.
log_perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', log_perplexity_bound)

# Topic coherence (c_v), scored against the lemmatized texts
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -11.273734055407212

Coherence Score:  0.5271735865343894


In [21]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a='auto'):
    """Train an LDA model with ``k`` topics and return its c_v coherence score.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words corpus to train on (may be a clipped subset).
    dictionary : gensim dictionary
        Id-to-word mapping used both to train the model and to score coherence.
        Previously this argument was ignored in favour of the global ``id2word``.
    k : int
        Number of topics.
    a : str or float or sequence, optional
        Value forwarded to ``LdaModel`` as ``alpha``. Defaults to ``'auto'``,
        the value that used to be hard-coded.

    Returns
    -------
    The c_v coherence of the trained model, computed against the module-level
    ``data_lemmatized`` texts (always the full set, even when ``corpus`` is a subset).
    """
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha=a,
                                                eta=None,
                                                per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [22]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
# NOTE(review): the recorded output of this cell lists topics 2..49 (96 fits), so it was
# presumably produced with min_topics = 2 — confirm before re-running.
min_topics = 1
max_topics = 50
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Validation sets: the first 75% of the corpus, and the full corpus
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]
corpus_title = ['75% Corpus','100% Corpus']

# Column-oriented results; one row is appended per (validation set, topic count) fit
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Coherence': []
                }

# Can take a long time to run (one full LDA fit per iteration)
pbar = tqdm.tqdm(total=len(topics_range)*len(corpus_title))
try:
    # iterate through validation corpuses
    for title, corpus_subset in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_subset, dictionary=id2word,
                                          k=k)
            # Save the model results
            model_results['Validation_Set'].append(title)
            model_results['Topics'].append(k)
            model_results['Coherence'].append(cv)

            pbar.update(1)
finally:
    # Close the progress bar even when a fit raises part-way through
    pbar.close()

# Persist the sweep only after every fit has completed
pd.DataFrame(model_results).to_csv('lda_tuning_results_3.csv', index=False)

100%|██████████| 96/96 [4:41:36<00:00, 176.00s/it]  


In [23]:
# Dump the raw tuning results: the topic counts first, then their coherence scores
for column in ('Topics', 'Coherence'):
    print(model_results[column])

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[0.5639999220453062, 0.4134510143577836, 0.4805086576384838, 0.5578726559079044, 0.5556438294903753, 0.5300922519842863, 0.4565147967675681, 0.5243530129766254, 0.49639490151158244, 0.5009399913965354, 0.48829437854210817, 0.5015877042196504, 0.4735082986399629, 0.5181655362830875, 0.49151707021242097, 0.5117149628672204, 0.5007605289569745, 0.4822819186938179, 0.460534645526535, 0.4886237697082003, 0.4893449166920446, 0.4842133360023087, 0.4472507921084079, 0.463895387078949, 0.46278428513778486, 0.43775948018184124, 0.4773205201308389, 0.5224731385815885, 0.4583037656407138, 0.47720149713174564, 0.478193648534877, 0.46907

In [46]:
# Final model: refit LDA with 4 topics (this replaces the earlier `lda_model`)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=400,      # fixed seed so the fit is repeatable
    chunksize=200,
    passes=10,
    alpha='auto',
)

In [43]:
# Show the 10 highest-weighted keywords of each topic in the final model
topic_summaries = lda_model.print_topics(num_words=10)
pprint(topic_summaries)

# Apply the final model to the whole corpus
doc_lda = lda_model[corpus]

[(0,
  '0.016*"line" + 0.014*"write" + 0.012*"go" + 0.011*"would" + 0.011*"be" + '
  '0.009*"get" + 0.009*"article" + 0.008*"think" + 0.008*"good" + '
  '0.007*"year"'),
 (1,
  '0.572*"ax" + 0.002*"rlk" + 0.002*"lq" + 0.002*"tdo" + 0.002*"um_um" + '
  '0.001*"oo" + 0.001*"dy" + 0.001*"chz" + 0.001*"mh" + 0.001*"tq"'),
 (2,
  '0.015*"line" + 0.011*"use" + 0.009*"system" + 0.007*"key" + 0.006*"program" '
  '+ 0.006*"need" + 0.006*"drive" + 0.006*"file" + 0.006*"thank" + '
  '0.005*"work"'),
 (3,
  '0.012*"people" + 0.010*"say" + 0.009*"would" + 0.007*"write" + 0.007*"may" '
  '+ 0.006*"make" + 0.006*"evidence" + 0.006*"reason" + 0.006*"think" + '
  '0.006*"believe"')]


In [44]:
# Perplexity: log_perplexity returns gensim's per-word likelihood bound (a negative
# number here). NOTE(review): for this bound, values closer to zero indicate a better
# fit; "lower is better" applies to the perplexity derived from it — confirm in gensim docs.
log_perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', log_perplexity_bound)

# Topic coherence (c_v), scored against the lemmatized texts
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.8013424363341946

Coherence Score:  0.5488185085714622


import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 14
max_topics = 50
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = ['auto']
# compute_coherence_values is called without an alpha argument below, so every fit uses
# that helper's own alpha setting ('auto'). Reject any other value here instead of
# recording it against a model that was not trained with it. A numeric sweep such as
# list(np.arange(0.05, 1, 0.3)) first needs the helper call to forward alpha.
unsupported_alpha = [a for a in alpha if a != 'auto']
if unsupported_alpha:
    raise ValueError('alpha values other than "auto" are not forwarded to the model: %r'
                     % (unsupported_alpha,))

# Validation set: the first 25% of the corpus (ClippedCorpus is given an integer length)
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25))]
corpus_title = ['25% Corpus']

# Column-oriented results; this replaces the `model_results` of the earlier tuning run
model_results = {'Topics': [],
                 'Coherence': [],
                 'Alpha':[]
                }

# Can take a long time to run: one LDA fit per (topic count, alpha) pair
pbar = tqdm.tqdm(total=len(topics_range) * len(alpha))
try:
    # iterate through number of topics
    for k in topics_range:
        print(k)
        # iterate through alpha values
        for a in alpha:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_sets[0], dictionary=id2word,
                                          k=k)
            # Save the model results
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Coherence'].append(cv)

            pbar.update(1)
finally:
    # Close the progress bar even when a fit raises part-way through
    pbar.close()

# Persist the sweep only after every fit has completed
pd.DataFrame(model_results).to_csv('lda_tuning_results1.csv', index=False)