In [1]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import glob


In [2]:
tokenizer = RegexpTokenizer(r'\w+')#Regular-Expression Tokenizers -> splits a string into substrings using a regular expression

# create English stop words list
en_stop = get_stop_words('english')#meaningless words in english eg: is, are, the

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()#removing similar objects

In [3]:
doc_set = list()
#sample documents
files = glob.glob('*.txt')
for file_name in files:
    readall = open(file_name,'r')
    doc_set.append(readall.read())
print(doc_set)

[]


In [4]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:# clean and tokenize document string
    raw = i.lower()
    print ('raw\n%s'%raw)
    tokens = tokenizer.tokenize(raw)# remove stop words from tokens
    print ('tokens\n%s'%tokens)
    stopped_tokens = [i for i in tokens if not i in en_stop]# stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]# add tokens to list
    texts.append(stopped_tokens)
print ('tokens = %s \n'%tokens)
print ('stopped_tokens = %s \n'%stopped_tokens)
print ('stemmed_tokens = %s \n'%stemmed_tokens)


raw
brocolli is good to eat. my brother likes to eat good brocolli, but not my mother.
tokens
['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']
raw
my mother spends a lot of time driving my brother around to baseball practice.
tokens
['my', 'mother', 'spends', 'a', 'lot', 'of', 'time', 'driving', 'my', 'brother', 'around', 'to', 'baseball', 'practice']
raw
some health experts suggest that driving may cause increased tension and blood pressure.
tokens
['some', 'health', 'experts', 'suggest', 'that', 'driving', 'may', 'cause', 'increased', 'tension', 'and', 'blood', 'pressure']
raw
i often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.
tokens
['i', 'often', 'feel', 'pressure', 'to', 'perform', 'well', 'at', 'school', 'but', 'my', 'mother', 'never', 'seems', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better']
raw
health professionals say that brocolli is g

In [11]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)
print (len(dictionary))
for j in  dictionary:
    print (dictionary[j])
#print (dictionary)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
print ('\n')
print (corpus)
print ('\n')
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=1, num_words=len(dictionary)))


{'brocolli': 0, 'brother': 1, 'eat': 2, 'good': 3, 'likes': 4, 'mother': 5, 'around': 6, 'baseball': 7, 'driving': 8, 'lot': 9, 'practice': 10, 'spends': 11, 'time': 12, 'blood': 13, 'cause': 14, 'experts': 15, 'health': 16, 'increased': 17, 'may': 18, 'pressure': 19, 'suggest': 20, 'tension': 21, 'better': 22, 'drive': 23, 'feel': 24, 'never': 25, 'often': 26, 'perform': 27, 'school': 28, 'seems': 29, 'well': 30, 'professionals': 31, 'say': 32}
33
brocolli
brother
eat
good
likes
mother
around
baseball
driving
lot
practice
spends
time
blood
cause
experts
health
increased
may
pressure
suggest
tension
better
drive
feel
never
often
perform
school
seems
well
professionals
say


[[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)], [(1, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(1, 1), (5, 1), (19, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), 