### 1. Import Libraries

In [18]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [19]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
D1 = 'I want to watch a movie this weekend.'
D2 =  'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.'
D3 =  'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.'
D4 =  'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!'
D5 =  'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.'

In [4]:
# combining all the documents into a list:

corpus = [D1, D2, D3, D4, D5]

In [5]:
# the complete corpus as below:

corpus

['I want to watch a movie this weekend.',
 'I went shopping yesterday. New Zealand won the World Test Championship by beating India by eight wickets at Southampton.',
 'I don’t watch cricket. Netflix and Amazon Prime have very good movies to watch.',
 'Movies are a nice way to chill however, this time I would like to paint and read some good books. It’s been long!',
 'This blueberry milkshake is so good! Try reading Dr. Joe Dispenza’s books. His work is such a game-changer! His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains.']

### 2. Text Preprocessing

Steps to preprocess text data:

1. Convert the text into lowercase
2. Split text into words
3. Remove the stop loss words
3. Remove the Punctuation, any symbols and special characters
4. Normalize the word (I'll be using Lemmatization for normalization)

In [6]:
# Apply Preprocessing on the Corpus

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

In [7]:
clean_corpus

[['want', 'watch', 'movie', 'weekend'],
 ['went',
  'shopping',
  'yesterday',
  'new',
  'zealand',
  'world',
  'test',
  'championship',
  'beating',
  'india',
  'eight',
  'wicket',
  'southampton'],
 ['don’t',
  'watch',
  'cricket',
  'netflix',
  'amazon',
  'prime',
  'good',
  'movie',
  'watch'],
 ['movie',
  'nice',
  'way',
  'chill',
  'however',
  'time',
  'would',
  'like',
  'paint',
  'read',
  'good',
  'book',
  'it’s',
  'long'],
 ['blueberry',
  'milkshake',
  'good',
  'try',
  'reading',
  'dr',
  'joe',
  'dispenza’s',
  'book',
  'work',
  'gamechanger',
  'book',
  'helped',
  'learn',
  'much',
  'thought',
  'impact',
  'biology',
  'rewire',
  'brain']]

### 3. Convert Text into Numerical Representation

Converting the clean preprocessed corpus to array

In [8]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [9]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(clean_corpus)

In [10]:
# this is our converted text to numerical representation from the Tf-IDF vectorizer

tf_idf_arr

<5x52 sparse matrix of type '<class 'numpy.float64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [11]:
# this is our converted text to numerical representation from the Count vectorizer
cv_arr

<5x52 sparse matrix of type '<class 'numpy.int64'>'
	with 58 stored elements in Compressed Sparse Row format>

The corpus has 52 columns and 5 rows corresponding to our document and 58 represents the unique Vocabulary present in our corpus.

In [12]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf

['amazon',
 'beating',
 'biology',
 'blueberry',
 'book',
 'brain',
 'championship',
 'chill',
 'cricket',
 'dispenza’s',
 'don’t',
 'dr',
 'eight',
 'gamechanger',
 'good',
 'helped',
 'however',
 'impact',
 'india',
 'it’s',
 'joe',
 'learn',
 'like',
 'long',
 'milkshake',
 'movie',
 'much',
 'netflix',
 'new',
 'nice',
 'paint',
 'prime',
 'read',
 'reading',
 'rewire',
 'shopping',
 'southampton',
 'test',
 'thought',
 'time',
 'try',
 'want',
 'watch',
 'way',
 'weekend',
 'went',
 'wicket',
 'work',
 'world',
 'would',
 'yesterday',
 'zealand']

In [13]:
# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocb list
vocab_cv

['amazon',
 'beating',
 'biology',
 'blueberry',
 'book',
 'brain',
 'championship',
 'chill',
 'cricket',
 'dispenza’s',
 'don’t',
 'dr',
 'eight',
 'gamechanger',
 'good',
 'helped',
 'however',
 'impact',
 'india',
 'it’s',
 'joe',
 'learn',
 'like',
 'long',
 'milkshake',
 'movie',
 'much',
 'netflix',
 'new',
 'nice',
 'paint',
 'prime',
 'read',
 'reading',
 'rewire',
 'shopping',
 'southampton',
 'test',
 'thought',
 'time',
 'try',
 'want',
 'watch',
 'way',
 'weekend',
 'went',
 'wicket',
 'work',
 'world',
 'would',
 'yesterday',
 'zealand']

In [14]:
display(len(vocab_tf_idf))
display(len(vocab_cv))

52

52

### 4. Implementation of LDA

To implement LDA, pass the corpus: document-term matrix to the model. We had above obtained the unique words of vocabulary using both TF-IDF and Count Vectorizer. We can continue with either as have the same unique words in both the obtained vocabularies.

In [15]:
 # Implementation of LDA:
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 6, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

### 4a. Retrieve the Topics


In [16]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 5

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['movie' 'good' 'watch' 'book']
Topic 2 ['zealand' 'test' 'beating' 'world']
Topic 3 ['weekend' 'want' 'watch' 'movie']
Topic 4 ['watch' 'amazon' 'cricket' 'don’t']
Topic 5 ['movie' 'good' 'watch' 'book']
Topic 6 ['however' 'chill' 'would' 'it’s']


Above is the words per topic. The result is not so accurate as the data was less. More data will give more accurate result.

### 4b. Annotating the topics the documents

In [17]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 2
Document 2  -- Topic: 1
Document 3  -- Topic: 3
Document 4  -- Topic: 5
Document 5  -- Topic: 2


This is the final output which gives us the topic along with the documents.

-----------------------