## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

* create a variable called `'no_features'` and set its value to 100.

In [3]:
#create a varriable called no_features and set it to 100
no_features = 100

* create a variable `'no_topics'` and set its value to 100

In [4]:
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [5]:
# instatiate TfidfVectorizer and set max_df to 0.95 and min_df to 2
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

* use fit_transform method of TfidfVectorizer to transform the documents

In [6]:
# use fit_transform to fit and transform the documents
tfidf_transformed = tfidf.fit_transform(documents)

* get the features names from TfidfVectorizer

In [10]:
# get the feature names from the tfidf object

feature_names = tfidf.get_feature_names_out()
print(feature_names)

['00' '10' '12' '14' '15' '16' '20' '25' 'a86' 'available' 'ax' 'b8f'
 'believe' 'best' 'better' 'bit' 'case' 'com' 'come' 'course' 'data' 'day'
 'did' 'didn' 'different' 'does' 'doesn' 'don' 'drive' 'edu' 'fact' 'far'
 'file' 'g9v' 'god' 'going' 'good' 'got' 'government' 'help' 'information'
 'jesus' 'just' 'key' 'know' 'law' 'let' 'like' 'line' 'list' 'little'
 'll' 'long' 'look' 'lot' 'mail' 'make' 'max' 'mr' 'need' 'new' 'number'
 'people' 'point' 'power' 'probably' 'problem' 'program' 'question' 'read'
 'really' 'right' 'run' 'said' 'say' 'second' 'set' 'software' 'space'
 'state' 'sure' 'tell' 'thanks' 'thing' 'things' 'think' 'time' 'true'
 'try' 'use' 'used' 'using' 've' 'want' 'way' 'windows' 'work' 'world'
 'year' 'years']


* instantiate NMF and fit transformed data

In [12]:
#Instantiate NMF and fit the tfidf_transformed data
nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf_transformed)

## LDA w/ Sklearn

* instantiate a CountVectorizer with following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [14]:
# instantiate count vectorizer and fit the documents
vectorizer = CountVectorizer(max_df=0.95, 
                             min_df=2, 
                             max_features=no_features, 
                             stop_words='english')

* use fit_transform method of CountVectorizer to transform documents

In [15]:
# use fit_transform to fit and transform the documents
transformed = vectorizer.fit_transform(documents)

* get the features names from TfidfVectorizer

In [18]:
# get the feature names from the vectorizer object
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['00' '10' '12' '14' '15' '16' '20' '25' 'a86' 'available' 'ax' 'b8f'
 'believe' 'best' 'better' 'bit' 'case' 'com' 'come' 'course' 'data' 'day'
 'did' 'didn' 'different' 'does' 'doesn' 'don' 'drive' 'edu' 'fact' 'far'
 'file' 'g9v' 'god' 'going' 'good' 'got' 'government' 'help' 'information'
 'jesus' 'just' 'key' 'know' 'law' 'let' 'like' 'line' 'list' 'little'
 'll' 'long' 'look' 'lot' 'mail' 'make' 'max' 'mr' 'need' 'new' 'number'
 'people' 'point' 'power' 'probably' 'problem' 'program' 'question' 'read'
 'really' 'right' 'run' 'said' 'say' 'second' 'set' 'software' 'space'
 'state' 'sure' 'tell' 'thanks' 'thing' 'things' 'think' 'time' 'true'
 'try' 'use' 'used' 'using' 've' 'want' 'way' 'windows' 'work' 'world'
 'year' 'years']


* instantiate LatentDirichletAllocation and fit transformed data 

In [19]:
# Instantiate LDAModel and fit the transformed data
lda = LatentDirichletAllocation(n_components=no_topics, 
                                max_iter=5, 
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0).fit(transformed)

* create a function `display_topics` that is able to display the top words in a topic for different models

In [20]:
# create a function called display_topics that takes in a model and feature names and returns the top words for each topic

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print()


In [21]:
# call the display_topics function on the lda model and pass in the feature names and no_top_words

no_top_words = 10
display_topics(lda, feature_names, no_top_words)

Topic 0:
point state right need long second fact does people things

Topic 1:
day tell like going just read problem need think know

Topic 2:
sure point want good help did government say use question

Topic 3:
think need best 00 a86 tell want using new people

Topic 4:
ax max b8f g9v a86 14 mr ll 25 probably

Topic 5:
god true say jesus believe things people does did know

Topic 6:
said second years tell work new right ll true like

Topic 7:
probably look like tell need used point want long run

Topic 8:
available software run like new need people used probably work

Topic 9:
mail list like new time tell does different use 15

Topic 10:
think 16 program point com space case let probably use

Topic 11:
help set use like does tell things work used new

Topic 12:
com know g9v way mr let look don point try

Topic 13:
don like know just think say ll tell need try

Topic 14:
time years long like far just work make better people

Topic 15:
good like tell just day think probably lot believe ri

* display top 10 words from each topic from NMF model

In [22]:
display_topics(nmf, feature_names, no_top_words)

Topic 0:
did just ll data years going don drive edu fact

Topic 1:
thanks 14 file data years going don drive edu fact

Topic 2:
does know just ll data years god don drive edu

Topic 3:
edu 14 file just ll data line going don drive

Topic 4:
just don like tell ll ve years god doesn drive

Topic 5:
like 14 just file data years going don drive edu

Topic 6:
just a86 g9v does think doesn don drive edu fact

Topic 7:
use don ve data years going doesn drive edu fact

Topic 8:
people don just mr ll ve years going drive edu

Topic 9:
good 14 ll just data years going don drive edu

Topic 10:
think don years going doesn drive edu fact far file

Topic 11:
god don tell jesus 14 just ll 25 years going

Topic 12:
time just ll ve did want key doesn don drive

Topic 13:
windows tell data years going doesn don drive edu fact

Topic 14:
drive problem power file data years does don edu fact

Topic 15:
tell ll power mail program software data doesn did years

Topic 16:
don ll data want did years going dri

* display top 10 words from each topic from LDA model

### Stretch: Use LDA w/ Gensim to do the same thing.

In [23]:
# use LDA with Genism
# import gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [24]:
data = dataset.data

`### Sentences to words ###`

In [25]:
# create a function called sent_to_words that takes in a list of sentences and returns a list of words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True, min_len=3))  # deacc=True removes punctuations
        # yield(gensim.utils.simple_preprocess(str(sentence), deacc=True, min_len=3))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))

print(data_words[:1])

[['well', 'not', 'sure', 'about', 'the', 'story', 'nad', 'did', 'seem', 'biased', 'what', 'disagree', 'with', 'your', 'statement', 'that', 'the', 'media', 'out', 'ruin', 'israels', 'reputation', 'that', 'rediculous', 'the', 'media', 'the', 'most', 'pro', 'israeli', 'media', 'the', 'world', 'having', 'lived', 'europe', 'realize', 'that', 'incidences', 'such', 'the', 'one', 'described', 'the', 'letter', 'have', 'occured', 'the', 'media', 'whole', 'seem', 'try', 'ignore', 'them', 'the', 'subsidizing', 'israels', 'existance', 'and', 'the', 'europeans', 'are', 'not', 'least', 'not', 'the', 'same', 'degree', 'think', 'that', 'might', 'reason', 'they', 'report', 'more', 'clearly', 'the', 'atrocities', 'what', 'shame', 'that', 'austria', 'daily', 'reports', 'the', 'inhuman', 'acts', 'commited', 'israeli', 'soldiers', 'and', 'the', 'blessing', 'received', 'from', 'the', 'government', 'makes', 'some', 'the', 'holocaust', 'guilt', 'away', 'after', 'all', 'look', 'how', 'the', 'jews', 'are', 'trea

`### Remove stop words ###`

In [26]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Spacy lemmatization:
# Lemmatization is the process of converting a word to its base form.
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")


def lemmatization(texts, allowed_postags=['NOUN']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [27]:
data_words_nostops = remove_stopwords(data_words)

`### Lematization ###`

In [28]:
# Do lemmatization keeping only nouns

data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN'])

`### Create dictionary(id2word) and the corpus ###`

In [29]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
print(id2word.token2id)



In [30]:
# Create Corpus
# Term Document Frequency
texts = data_lemmatized

# Create Corpus
# Term Document Frequency

corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 4), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]]


`########################## **Building the LDA model** #############################`

In [31]:
# Baseline Model

# instantiate a baseline LDA model using gensim.models.LdaMulticore() and save it to ‘lda_model’
# Train the baseline model using gensim.models.LdaMulticore() and save it to ‘lda_model’
# Print the topics and their corresponding keywords using lda_model.print_topics() and save it to ‘baseline_topics’
# Print the coherence score of the baseline model using CoherenceModel() and save it to ‘baseline_coherence_score’

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=10,
                                        random_state=100,
                                        chunksize=150,
                                        passes=10,
                                        per_word_topics=True)

In [32]:
import pprint

In [34]:
# Print the Keyword in the 10 topics
pprint.pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.017*"car" + 0.015*"time" + 0.013*"people" + 0.011*"thing" + 0.009*"way" + '
  '0.007*"day" + 0.006*"man" + 0.006*"bike" + 0.005*"woman" + 0.005*"year"'),
 (1,
  '0.020*"people" + 0.011*"religion" + 0.011*"thing" + 0.010*"question" + '
  '0.009*"group" + 0.008*"belief" + 0.008*"church" + 0.008*"faith" + '
  '0.008*"post" + 0.007*"article"'),
 (2,
  '0.015*"chip" + 0.013*"system" + 0.012*"encryption" + 0.012*"key" + '
  '0.011*"information" + 0.010*"government" + 0.009*"clipper" + 0.008*"number" '
  '+ 0.008*"privacy" + 0.008*"technology"'),
 (3,
  '0.018*"list" + 0.015*"com" + 0.010*"thank" + 0.009*"mail" + 0.008*"edu" + '
  '0.008*"bank" + 0.008*"request" + 0.008*"address" + 0.007*"time" + '
  '0.007*"email"'),
 (4,
  '0.024*"people" + 0.017*"gun" + 0.013*"law" + 0.012*"government" + '
  '0.010*"state" + 0.009*"year" + 0.007*"weapon" + 0.007*"time" + 0.007*"case" '
  '+ 0.007*"crime"'),
 (5,
  '0.020*"space" + 0.010*"time" + 0.007*"earth" + 0.007*"launch" + '
  '0.007*"satell

In [35]:
# get coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence score: ', coherence_lda)


Coherence score:  0.46131267138861143


In [37]:
# Visualize the topics
# import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary = lda_model.id2word)
vis


`### Get Optimal number of topics ###`

In [38]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics,random_state=100,
                                                chunksize=200,passes=10,per_word_topics=True,id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [39]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, 
                                        texts=data_lemmatized, start=2, limit=10, step=1)

In [47]:
# print coherence scores for each model topic
limit=10; start=2; step=1;
x = range(start, limit, step)

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    

Num Topics = 2  has Coherence Value of 0.4764
Num Topics = 3  has Coherence Value of 0.4975
Num Topics = 4  has Coherence Value of 0.4875
Num Topics = 5  has Coherence Value of 0.4487
Num Topics = 6  has Coherence Value of 0.502
Num Topics = 7  has Coherence Value of 0.5025
Num Topics = 8  has Coherence Value of 0.4321
Num Topics = 9  has Coherence Value of 0.461


In [50]:
# # Select the model and print the topics
# optimal_model = model_list[7]
# model_topics = optimal_model.show_topics(formatted=False)
# pprint.pprint(optimal_model.print_topics(num_words=10))



`### Other hyper-parameters ###`

`1.Number of Topics (K) document word matrox---`\
`2.Dirichlet hyperparameter alpha: Document-Topic Density--`\
`3. Dirichlet hyperparameter beta: Word-Topic Density`

In [51]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           chunksize=200,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [52]:
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 8
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

In [54]:
import pandas as pd

In [55]:
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    pbar.close()

 67%|██████▋   | 360/540 [8:57:42<4:28:51, 89.62s/it]


KeyboardInterrupt: 