In [3]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Tokenizer whose tokens are runs of word characters (regex \w+); anything
# that is not a word character separates tokens and is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list; used below to drop these words from every
# tokenized document before stemming.
en_stop = get_stop_words('en')

# Porter stemmer instance; used below to reduce each remaining token to
# its stem.
p_stemmer = PorterStemmer()
    
# Sample corpus: ten short free-text passages, one per variable.
doc_a = "Tis the Season for Christmas Cookies!  My all time favorite Christmas Cookie is, and always has been, the Peanut Butter Kiss Cookie.  However, I have cut peanut butter, sugar and crap out of my life.  But it’s Christmas y’all!  If you’re going to sneak a treat, this is the time to do it!  That being said, I decided to peace out the peanut butter cookie and make a Paleo Almond Butter Cookie!"
doc_b = "I grew up with a heart for travel.  Then I grew up, got a job and realized I have also heart for vacation.  I’m the kind of traveler who is happy with a trusty backpack, comfy sneakers, a camera and a eurail train pass… I’m also one who won’t say no to luxury hotels, beach views and a fruity beverages."
doc_c = "If it were at all possible, I would pay anything and everything to rewind the clock to exactly one year ago today: November 9, 2013.  A day full of glitter and details.  A day of love, joy, and plenty of happy tears.  A day my creative party planning skills were at their finest.  A day when every single person who means the world to me, were all in the exact same room.  A day I partied like a rockstar on cold medicine.  The day I said “I do” to the love of my life."
doc_d = "It’s been one month now with the house on the market.  I would by lying if I said I wasn’t on edge about it.  Between keeping it clean and staying nearby just incase I have to swoop up the dogs for a showing… it’s just high stress.  Add to it that we have not had ONE SINGLE SHOWING!  I’m not blaming my realtor one bit.  She’s a rockstar.  She is also my mom and is also not taking a commission if when the house finally sells.  I blame the weather.  We have either had snow on the ground or temps below 40 every day this month.  Only crazy people buy houses in this weather.  Albeit I’m totally okay with selling to anyone, crazy or not."
doc_e = "As a professional event planner, I know every shindig needs a Plan A and a Plan B.  Our Plan A included the back deck, a fire pit, tons of space for tons of guests, a canvas in the middle of the yard, sunshine and eggs filled with either blue or pink paint.  Plan B?  Well Plan B involved a cake with either pink or blue on the inside and a lot less space in the house… My parents knew how much Plan A meant to me, so when the forecast was terrible we tried to work around it!"
doc_f = "Two years ago, he experienced the lowest of the lows, but now Robert Herjavec is bouncing back from a painful divorce – and moving on with his brand-new fiancée, his former Dancing with the Stars partner Kym Johnson. But it wasn't an easy road to his new-found happiness. "
doc_g = "Chere Rush was a 39-year-old mother with a 10-year-old and 8-year-old twins when she first noticed a small lump in her breast. "
doc_h = "Redmond police on Wednesday night confirmed that Seahawks safety Kam Chancellor was involved in an incident that led to a 911 call at the Redmond Athletic Club."
doc_i = "The grocery chain, which previously accepted only cash and debit cards, is now accepting all major credit cards, including Visa, MasterCard, Discover, and American Express."
doc_j = "A new witness has come forward saying he was in the looker room and saw the event at the heart of new sexual-assault allegations against Peyton Manning involving then-University of Tennessee trainer Jamie Naughright."

# Gather the samples into a single ordered list (doc_a first, doc_j last).
doc_set = [
    doc_a,
    doc_b,
    doc_c,
    doc_d,
    doc_e,
    doc_f,
    doc_g,
    doc_h,
    doc_i,
    doc_j,
]

# One entry per document in doc_set, in the same order: the document's
# lower-cased tokens with stop words removed and the remainder stemmed.
texts = []

# Build the stop-word lookup once, outside the loop: membership tests on a
# set are constant-time, whereas testing against the list scans it for
# every token.
stop_words = set(en_stop)

# loop through document list
for document in doc_set:

    # clean and tokenize document string
    raw = document.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens (distinct loop variable so the outer
    # loop variable is never shadowed or overwritten)
    stopped_tokens = [token for token in tokens if token not in stop_words]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# Build the id <-> term mapping from every token seen in the corpus.
dictionary = corpora.Dictionary(texts)

# Bag-of-words form of each tokenized document, in document order.
corpus = list(map(dictionary.doc2bow, texts))

# Fit the LDA topic model on the bag-of-words corpus.
ldamodel = models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    passes=20,
)

In [2]:
# Fetch up to 10 topics with 10 words each from the fitted model, then
# print the returned value.
topics = ldamodel.print_topics(num_topics=10, num_words=10)
print(topics)

[(0, u'0.062*plan + 0.032*b + 0.021*either + 0.021*ton + 0.021*space + 0.021*blue + 0.021*pink + 0.021*includ + 0.021*accept + 0.021*card'), (1, u'0.050*redmond + 0.026*involv + 0.026*kam + 0.026*athlet + 0.026*call + 0.026*led + 0.026*confirm + 0.026*polic + 0.026*safeti + 0.026*club'), (2, u'0.004*butter + 0.004*cooki + 0.004*christma + 0.004*peanut + 0.004*time + 0.004*almond + 0.004*decid + 0.004*peac + 0.004*re + 0.004*treat'), (3, u'0.004*m + 0.004*grew + 0.004*heart + 0.004*travel + 0.004*also + 0.004*t + 0.004*luxuri + 0.004*eurail + 0.004*pass + 0.004*sneaker'), (4, u'0.072*year + 0.072*old + 0.025*twin + 0.025*breast + 0.025*10 + 0.025*chere + 0.025*first + 0.025*small + 0.025*8 + 0.025*lump'), (5, u'0.044*new + 0.023*say + 0.023*room + 0.023*heart + 0.023*involv + 0.023*event + 0.023*wit + 0.023*forward + 0.023*peyton + 0.023*trainer'), (6, u'0.054*day + 0.045*cooki + 0.037*butter + 0.028*peanut + 0.028*christma + 0.019*parti + 0.019*love + 0.019*life + 0.019*said + 0.019*ti