In [18]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import download

download('punkt_tab')
download('wordnet')
download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/juliusc/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/juliusc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliusc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
documents = [
    "I love to eat pizza. Pizza is my favorite food.",
    "The cat is playing with the ball.",
    "I enjoy reading books on machine learning.",
    "The dog is chasing the cat.",
    "Pizza and pasta are popular Italian dishes."
]

In [20]:
# preprocess
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    # tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

In [25]:
preprocessed_docs = [preprocess(doc) for doc in documents]

In [26]:
preprocessed_docs

[['love', 'eat', 'pizza', 'pizza', 'favorit', 'food'],
 ['cat', 'play', 'ball'],
 ['enjoy', 'read', 'book', 'machin', 'learn'],
 ['dog', 'chase', 'cat'],
 ['pizza', 'pasta', 'popular', 'italian', 'dish']]

# Algorithm

In [33]:
from gensim import corpora

dictionary = corpora.Dictionary(preprocessed_docs)

dictionary.doc2bow(preprocessed_docs[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2)]

In [34]:
preprocessed_docs[0]

['love', 'eat', 'pizza', 'pizza', 'favorit', 'food']

In [36]:
dictionary[4]

'pizza'

In [37]:
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [40]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2)],
 [(5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(6, 1), (13, 1), (14, 1)],
 [(4, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]

## Build LDA

In [54]:
from gensim.models.ldamodel import LdaModel

num_topics = 2
passes = 15

lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

In [55]:
lda_model.print_topics()

[(0,
  '0.108*"pizza" + 0.064*"eat" + 0.064*"love" + 0.064*"food" + 0.064*"favorit" + 0.064*"learn" + 0.064*"read" + 0.064*"machin" + 0.064*"enjoy" + 0.064*"book"'),
 (1,
  '0.091*"cat" + 0.085*"dish" + 0.085*"pasta" + 0.085*"popular" + 0.085*"italian" + 0.085*"dog" + 0.085*"chase" + 0.084*"pizza" + 0.029*"play" + 0.029*"ball"')]