In [2]:
# Toy corpus of five short documents used throughout the notebook.
# The list is the primary object; the individual doc_* names are kept as
# aliases for any cell that wants to reference a single document.
docs = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

doc_a, doc_b, doc_c, doc_d, doc_e = docs

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

In [4]:
def preprocess(text, lemmatizer, n, stop_words=None):
    """Turn raw text into lemmatized tokens or space-joined 1..n-grams.

    Steps: lower-case and tokenize with ``word_tokenize``; keep only purely
    alphabetic tokens that are not stop words; lemmatize each surviving token
    with ``lemmatizer.lemmatize``; optionally expand into n-grams.

    Args:
        text: Raw document string.
        lemmatizer: Object exposing ``lemmatize(token)``.
        n: Maximum n-gram order. When ``n <= 1`` the lemmatized token list is
            returned unchanged.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stop word list is used.

    Returns:
        A list of strings. For ``n <= 1`` these are the lemmatized tokens in
        document order. Otherwise all 1-grams come first, then all 2-grams,
        and so on up to ``n``-grams, each n-gram joined by a single space.
    """
    # Resolve the stop words once per call and store them in a set. Evaluating
    # stopwords.words('english') inside the comprehension condition would
    # rebuild the word list for every alphabetic token and search it linearly.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    # To lower case and tokenization
    tokens = word_tokenize(text.lower())

    # Stop word and punctuation removal (isalpha() also drops numbers)
    filtered_tokens = [
        token for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Keep the explicit guard: for n <= 0 the loop below would yield nothing,
    # whereas callers expect the plain token list.
    if n <= 1:
        return lemmatized_tokens

    # NGram generation: every order from 1 up to and including n
    ngram_set = []
    for size in range(1, n + 1):
        ngram_set.extend(' '.join(gram) for gram in ngrams(lemmatized_tokens, size))

    return ngram_set

In [5]:
# One shared lemmatizer instance is reused for every document.
lemmatizer = WordNetLemmatizer()

# Unigram tokens only (n=1), one token list per document.
token_list = []
for doc in docs:
    token_list.append(preprocess(doc, lemmatizer, 1))

print(token_list)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spends', 'lot', 'time', 'driving', 'brother', 'around', 'baseball', 'practice'], ['health', 'expert', 'suggest', 'driving', 'may', 'cause', 'increased', 'tension', 'blood', 'pressure'], ['often', 'feel', 'pressure', 'perform', 'well', 'school', 'mother', 'never', 'seems', 'drive', 'brother', 'better'], ['health', 'professional', 'say', 'brocolli', 'good', 'health']]


In [6]:
from gensim import corpora, models
import gensim

In [7]:
# Build the id <-> term mapping from the tokenized documents.
# Each distinct token across all documents is assigned an integer id.
dictionary = corpora.Dictionary(documents=token_list)

print(dictionary)

Dictionary<33 unique tokens: ['brocolli', 'brother', 'eat', 'good', 'like']...>


In [8]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(tokens) for tokens in token_list]

print(corpus)

[[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)], [(1, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(1, 1), (5, 1), (19, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)], [(0, 1), (3, 1), (16, 2), (31, 1), (32, 1)]]


In [10]:
# generate LDA model
# LDA training is randomly initialised; pinning random_state makes the fitted
# topics reproducible across kernel restarts (Restart & Run All).
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

print(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, '0.070*"driving" + 0.043*"health" + 0.042*"pressure" + 0.042*"expert"'), (1, '0.080*"brocolli" + 0.080*"good" + 0.058*"brother" + 0.058*"mother"')]
