## Objective 01 - describe the Latent Dirichlet Allocation (LDA) process



## Objective 02 - implement a topic model using the gensim library


### Topic Modeling Steps

Prepare text: Load text file, split into documents, tokenize/lemmatize, remove stop words

Create the term dictionary for the corpus

Create a document term matrix (DTM)

Set up the LDA model and decide on the number of topics

Run and train the model

Topics!

In [1]:
# imports
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# add additional stop words
# STOPWORDS = set(STOPWORDS).union(set(['said', 'mr', 'mrs']))

# Tokenizer applied to each chunk of raw text
def tokenize(text):
    """Return the tokens of ``text`` that are not stop words.

    ``text`` is run through gensim's ``simple_preprocess``; the tokens it
    yields are kept in their original order, except for any token that is
    a member of ``STOPWORDS``.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept


In [2]:
# Read the whole text file into a single string. The encoding is passed
# explicitly because open()'s default depends on the platform locale.
# NOTE(review): assumes wonderland.txt is UTF-8 encoded - confirm.
with open('wonderland.txt', 'r', encoding='utf-8') as file:
    text_str = file.read()

# Split the string on the newline character; each resulting chunk is
# treated as one document
text = text_str.split('\n')

# Tokenize each chunk of text
text_tokens = [tokenize(chunk) for chunk in text]

# Look at the first 10 tokens of the first document
text_tokens[0][0:10]

['alice',
 'beginning',
 'tired',
 'sitting',
 'sister',
 'bank',
 'having',
 'twice',
 'peeped',
 'book']

In [3]:
# imports 
from gensim import corpora

# Build the term dictionary for the corpus:
# each unique term in text_tokens is assigned an index
dictionary = corpora.Dictionary(text_tokens)

# Turn every tokenized document into its bag-of-words form with the
# dictionary built above; together these make up the Document Term Matrix
doc_term_matrix = list(map(dictionary.doc2bow, text_tokens))

# Peek at the first 25 entries of the first document
first_doc_bow = doc_term_matrix[0]
print(first_doc_bow[:25])

[(0, 1), (1, 4), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]


In [4]:
from gensim.models.ldamulticore import LdaMulticore

# Create the object for LDA model
lda = gensim.models.ldamodel.LdaModel

# Train LDA model on the document term matrix
# topics = 5
# random_state pins the seed so that re-running this cell reproduces the
# same topics; without it every training run yields different topics.
ldamodel = lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=50,
               random_state=42)

# Print 3 of the 5 topics, showing the 5 highest-weighted words of each
print(ldamodel.print_topics(num_topics=3, num_words=5))

[(1, '0.022*"little" + 0.018*"alice" + 0.014*"thing" + 0.014*"cakes" + 0.009*"thought"'), (3, '0.017*"time" + 0.017*"executioner" + 0.013*"duchess" + 0.013*"went" + 0.013*"alice"'), (4, '0.021*"rabbit" + 0.021*"alice" + 0.014*"little" + 0.014*"sister" + 0.011*"eyes"')]


In [8]:
import re

# print_topics() yields (topic id, formatted string) pairs; the words of a
# topic are the double-quoted parts of the formatted string.
topic_entries = ldamodel.print_topics()
words = [re.findall(r'"([^"]*)"', entry[1]) for entry in topic_entries]

# Keep only the first 5 words listed for each topic
topics = [' '.join(w[0:5]) for w in words]

# Label every line with the id reported by the model itself. The loop
# variable is deliberately not called `id`: at the top level of a cell it
# would stay in the kernel namespace and shadow the builtin for later cells.
topic_ids = [entry[0] for entry in topic_entries]
for topic_id, t in zip(topic_ids, topics):
    print(f"-------Topic {topic_id}-------")
    print(t, end="\n \n")

-------Topic 0-------
little rabbit gloves fan alice
 
-------Topic 1-------
alice little ran said went
 
-------Topic 2-------
said turtle mock executioner alice
 
-------Topic 3-------
alice came rabbit hearts thought
 
-------Topic 4-------
alice little look thought king
 


## Objective 03 - interpret document topic distributions and summarize findings


In [10]:
# `warnings` is part of the Python standard library, so there is nothing to
# install for it: `import warnings` works without any `pip install` step.



In [11]:
# Ask Python to ignore DeprecationWarning messages.
# NOTE(review): the saved output of this cell still shows NumPy deprecation
# messages, so this filter does not appear to catch all of them - confirm.
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

# pyLDAvis helper module for gensim models
import pyLDAvis.gensim

# Turn on notebook display for pyLDAvis visualizations
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-note

In [14]:
# repeat the topic model from the previous objective 

#imports
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# additional stop words
# STOPWORDS = set(STOPWORDS).union(set(['said', 'mr', 'mrs']))

# Tokenizer applied to each chunk of raw text
def tokenize(text):
    """Return the tokens of ``text`` that are not stop words.

    ``text`` is run through gensim's ``simple_preprocess``; the tokens it
    yields are kept in their original order, except for any token that is
    a member of ``STOPWORDS``.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

# Read the whole text file into a single string. The encoding is passed
# explicitly because open()'s default depends on the platform locale.
# NOTE(review): assumes wonderland.txt is UTF-8 encoded - confirm.
with open('wonderland.txt', 'r', encoding='utf-8') as file:
    text_str = file.read()

# Split the string on the newline character; each resulting chunk is
# treated as one document
text = text_str.split('\n')

# Tokenize each chunk of text
text_tokens = [tokenize(chunk) for chunk in text]

# imports
from gensim import corpora

# Build the term dictionary for the corpus:
# each unique term in text_tokens is assigned an index
dictionary = corpora.Dictionary(text_tokens)

# Convert the list of documents (corpus) into a Document Term Matrix
# by mapping every tokenized document through the dictionary built above
doc_term_matrix = list(map(dictionary.doc2bow, text_tokens))

# import 
from gensim.models.ldamulticore import LdaMulticore

# Create the object for LDA model
lda = gensim.models.ldamodel.LdaModel

# Train LDA model on the document term matrix
# topics = 5
# random_state pins the seed so that re-running this cell reproduces the
# same topics; without it every training run yields different topics.
ldamodel = lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=50,
               random_state=42)

  and should_run_async(code)


In [15]:
# Prepare the interactive pyLDAvis view from the trained model, the
# document term matrix and the dictionary, then show it as the cell output
lda_vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
lda_vis

  and should_run_async(code)
