# Topic modeling test: Latent Dirichlet Allocation (LDA)

### Form a corpus

In [1]:
# Toy corpus: five short hand-written articles with overlapping vocabulary
article1 = (
    "Sugar is bad to consume. "
    "My sister likes to have sugar, but not my father."
)
article2 = (
    "My father spends a lot of time driving "
    "my sister around to dance practice."
)
article3 = (
    "Doctors suggest that driving may cause "
    "increased stress and blood pressure."
)
article4 = (
    "Sometimes I feel pressure to perform well at school, "
    "but my father never seems to drive my sister to do better."
)
article5 = (
    "Health experts say that Sugar "
    "is not good for your lifestyle."
)

# Gather the articles, in order, into the list the later cells work on
articles_complete = [
    article1,
    article2,
    article3,
    article4,
    article5,
]
print(articles_complete)

['Sugar is bad to consume. My sister likes to have sugar, but not my father.', 'My father spends a lot of time driving my sister around to dance practice.', 'Doctors suggest that driving may cause increased stress and blood pressure.', 'Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.', 'Health experts say that Sugar is not good for your lifestyle.']


### Preprocessing

In [2]:
# install and import nltk
%pip install nltk
import nltk

# Download the NLTK data files used below (a no-op when they are already up to date)
nltk.download('wordnet') # WordNet corpus: data the WordNetLemmatizer below relies on
nltk.download('stopwords') # word lists read by stopwords.words('english') below
from nltk.corpus import stopwords # corpus reader that provides the English stopword list
from nltk.stem.wordnet import WordNetLemmatizer
import string

# clean data (pre-steps): module-level resources that clean() reads
stop = set(stopwords.words('english')) # Create a set of English stopwords
exclude = set(string.punctuation) # Create a set of punctuation characters
lemma = WordNetLemmatizer() # Initialize the lemmatizer

# Define a function to clean one article
def clean(article):
    """Return ``article`` as a space-separated string of lemmatized content words.

    The text is lowercased and split on whitespace; each token is then
    processed as follows:

    1. The token is dropped if it is a stopword exactly as written. This
       keeps stopword entries that themselves contain punctuation (e.g.
       contractions stored with their apostrophe) working.
    2. Every punctuation character is removed from the token.
    3. The token is dropped if nothing is left, or if the stripped form is
       a stopword. Checking *after* stripping is what removes stopwords
       that sit next to punctuation, such as "it." or "not,"; filtering
       only the raw tokens would let those through.
    4. The remaining word is lemmatized.

    Reads the module-level ``stop`` (stopword set), ``exclude`` (punctuation
    characters) and ``lemma`` (lemmatizer) objects.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        The kept, lemmatized words joined by single spaces; an empty string
        when no word survives.
    """
    kept_words = []
    for token in article.lower().split():
        # Raw-form check first, before punctuation stripping alters the token
        if token in stop:
            continue

        word = "".join(ch for ch in token if ch not in exclude)

        # Skip pure-punctuation tokens and stopwords that carried punctuation
        if not word or word in stop:
            continue

        kept_words.append(lemma.lemmatize(word))

    return " ".join(kept_words)

# Clean every article, then break each cleaned string into its list of tokens
articles_clean = [
    cleaned_text.split()
    for cleaned_text in map(clean, articles_complete)
]
print(articles_clean)


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package wordnet to /home/ucloud/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ucloud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'], ['father', 'spends', 'lot', 'time', 'driving', 'sister', 'around', 'dance', 'practice'], ['doctor', 'suggest', 'driving', 'may', 'cause', 'increased', 'stress', 'blood', 'pressure'], ['sometimes', 'feel', 'pressure', 'perform', 'well', 'school', 'father', 'never', 'seems', 'drive', 'sister', 'better'], ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]


### Prepare Document-Term Matrix
##### aka how to convert a corpus into a document-term matrix.

In [3]:
# install numpy and scipy
%pip install numpy scipy

Note: you may need to restart the kernel to use updated packages.


In [4]:
# import numpy and scipy
import numpy as np
import scipy

In [5]:
# Importing Gensim
%pip install gensim
import gensim
from gensim import corpora

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Build the term dictionary of our corpus: every unique term gets an integer id.
dictionary = corpora.Dictionary(articles_clean)

# Turn each tokenized article into its bag-of-words row, i.e. a list of
# (term id, count) pairs, using the dictionary built above.
doc_term_matrix = list(map(dictionary.doc2bow, articles_clean))
print(doc_term_matrix)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(2, 1), (4, 1), (18, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)], [(5, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]


### Running LDA model

In [13]:
# Alias for the LDA model class from the gensim library
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix.
# LDA training starts from a random initialisation, so without a fixed seed the
# topics change from run to run. random_state pins the seed so that
# "Restart Kernel -> Run All" reproduces the same topics.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=3,        # number of topics to extract
    id2word=dictionary,  # maps term ids back to words for readable topics
    passes=50,           # number of passes over the corpus during training
    random_state=42,     # fixed seed for reproducible results
)

In [14]:
# Show the results: for each of the 3 topics, its 3 most significant words
top_words_per_topic = ldamodel.print_topics(num_topics=3, num_words=3)
print(top_words_per_topic)

[(0, '0.076*"sugar" + 0.076*"sister" + 0.076*"father"'), (1, '0.050*"pressure" + 0.050*"increased" + 0.050*"stress"'), (2, '0.065*"driving" + 0.064*"dance" + 0.064*"around"')]


### Donzo pretty cool!!