In [10]:
import spacy
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import numpy as np
import gensim
from gensim import corpora

In [4]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'

In [5]:
corpus = [D1, D2, D3, D4, D5]
corpus

['I want to watch a movie this weekend.',
 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.',
 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.',
 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!',
 'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.']

In [6]:
stop = set(stopwords.words('english'))
print(len(stop))

179


In [7]:
print(stop, end = " ")

{'under', 'now', 'doing', 'can', 'further', 'ma', 'themselves', 'again', 'hers', 'herself', 'an', 'we', 'here', 'by', 'through', 'from', 'doesn', 'ain', 'yourselves', 'wasn', 's', 'as', "doesn't", 'hasn', 'i', 'same', 'nor', 'll', 'for', 'once', 'o', 'all', 'any', 'whom', 'needn', 'shouldn', 'theirs', 'wouldn', "should've", 'mightn', 'because', 'against', 'y', 'won', 'were', 'myself', 'to', "you'd", 'didn', 'was', 've', 'his', 'what', 'such', "hadn't", 'mustn', 'their', 'who', 'those', 'down', 'after', 'yourself', 'aren', "weren't", 'before', 'do', 'into', 'the', 'each', 'don', 'her', 'few', 'too', "didn't", 'been', "shouldn't", 'shan', 'is', 'these', "you've", 'up', 'between', 'in', 'not', 'just', 'isn', 'hadn', 'you', 'below', 'should', 'then', 'why', 'which', "don't", 'but', 'while', 'more', "she's", 'at', 'own', 'its', 'does', 'where', "needn't", 'a', "hasn't", 'be', 't', 'weren', 'off', 'my', 'only', 'he', 'some', 'when', 'had', 'him', 'himself', 'they', 'them', 'other', "won't", 

In [8]:
print(set(string.punctuation), end = " ")

{'\\', '%', '[', "'", '{', ')', '=', '-', '$', '|', '/', '"', '*', '}', '<', ';', ':', '(', '?', '@', '!', ']', '^', '~', '&', '`', '+', ',', '>', '.', '_', '#'} 

In [9]:
"I want to watch a movie this weekend".split()

['I', 'want', 'to', 'watch', 'a', 'movie', 'this', 'weekend']

In [12]:
stop = set(stopwords.words('english'))

exclude = set(string.punctuation) 

stem = PorterStemmer()

# One function for all the steps:
def clean(doc):
    
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    normalized = " ".join(stem.stem(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

In [13]:
print(clean_corpus, end = " ")

[['want', 'watch', 'movi', 'weekend'], ['went', 'shop', 'yesterday', 'new', 'zealand', 'world', 'test', 'championship', 'beat', 'india', 'eight', 'wicket', 'southampton'], ['don’t', 'watch', 'cricket', 'netflix', 'amazon', 'prime', 'good', 'movi', 'watch'], ['movi', 'nice', 'way', 'chill', 'howev', 'time', 'would', 'like', 'paint', 'read', 'good', 'book', 'it’', 'long'], ['blueberri', 'milkshak', 'good', 'tri', 'read', 'dr', 'joe', 'dispenza’', 'book', 'work', 'gamechang', 'book', 'help', 'learn', 'much', 'thought', 'impact', 'biolog', 'rewir', 'brain']] 

In [14]:
dict_ = corpora.Dictionary(clean_corpus)
print(dict_)

Dictionary(51 unique tokens: ['movi', 'want', 'watch', 'weekend', 'beat']...)


In [15]:
for i in dict_.values():
    print(i, end = ", ")

movi, want, watch, weekend, beat, championship, eight, india, new, shop, southampton, test, went, wicket, world, yesterday, zealand, amazon, cricket, don’t, good, netflix, prime, book, chill, howev, it’, like, long, nice, paint, read, time, way, would, biolog, blueberri, brain, dispenza’, dr, gamechang, help, impact, joe, learn, milkshak, much, rewir, thought, tri, work, 

In [16]:
doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(0, 1), (2, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)],
 [(0, 1),
  (20, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1)],
 [(20, 1),
  (23, 2),
  (31, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1)]]

In [17]:
# Creating the object for LDA model using gensim library

Lda = gensim.models.ldamodel.LdaModel

In [18]:
# Running and Training LDA model on the document term matrix.

ldamodel = Lda(doc_term_matrix, num_topics=6, id2word = dict_, passes=1, random_state=0)

In [19]:
ldamodel.print_topics()

[(0,
  '0.079*"watch" + 0.042*"movi" + 0.042*"good" + 0.042*"netflix" + 0.042*"amazon" + 0.042*"cricket" + 0.042*"prime" + 0.042*"don’t" + 0.036*"world" + 0.036*"southampton"'),
 (1,
  '0.020*"movi" + 0.020*"watch" + 0.020*"good" + 0.020*"weekend" + 0.020*"want" + 0.020*"cricket" + 0.020*"don’t" + 0.020*"prime" + 0.020*"read" + 0.020*"book"'),
 (2,
  '0.046*"went" + 0.039*"new" + 0.037*"shop" + 0.036*"yesterday" + 0.034*"wicket" + 0.034*"test" + 0.034*"championship" + 0.034*"eight" + 0.034*"india" + 0.032*"zealand"'),
 (3,
  '0.093*"want" + 0.093*"watch" + 0.093*"weekend" + 0.093*"movi" + 0.013*"good" + 0.013*"book" + 0.013*"don’t" + 0.013*"read" + 0.013*"cricket" + 0.013*"prime"'),
 (4,
  '0.076*"book" + 0.041*"read" + 0.041*"good" + 0.041*"biolog" + 0.041*"impact" + 0.041*"learn" + 0.041*"thought" + 0.041*"blueberri" + 0.041*"dr" + 0.041*"joe"'),
 (5,
  '0.052*"movi" + 0.052*"good" + 0.052*"book" + 0.052*"howev" + 0.052*"chill" + 0.052*"would" + 0.052*"nice" + 0.052*"way" + 0.052*"it

In [20]:
print(ldamodel.print_topics(num_topics=5, num_words=5))

[(0, '0.079*"watch" + 0.042*"movi" + 0.042*"good" + 0.042*"netflix" + 0.042*"amazon"'), (4, '0.076*"book" + 0.041*"read" + 0.041*"good" + 0.041*"biolog" + 0.041*"learn"'), (1, '0.020*"movi" + 0.020*"watch" + 0.020*"good" + 0.020*"weekend" + 0.020*"want"'), (3, '0.093*"want" + 0.093*"watch" + 0.093*"weekend" + 0.093*"movi" + 0.013*"good"'), (2, '0.046*"went" + 0.039*"new" + 0.037*"shop" + 0.036*"yesterday" + 0.034*"wicket"')]
