In [1]:
!python -m spacy download en

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the small English spaCy pipeline that was downloaded in the first cell.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Five short sample documents that make up the toy corpus.
# Adjacent string literals are concatenated by Python, so each document is the
# same single string as before, just wrapped sentence by sentence.
D1 = "I want to watch a movie this weekend."
D2 = (
    "I went shopping yesterday. "
    "New Zealand won the World Test Championship by beating India by eight wickets at Southampton."
)
D3 = (
    "I don’t watch cricket. "
    "Netflix and Amazon Prime have very good movies to watch."
)
D4 = (
    "Movies are a nice way to chill however, this time I would like to paint and read some good books. "
    "It’s been long!"
)
D5 = (
    "This blueberry milkshake is so good! "
    "Try reading Dr. Joe Dispenza’s books. "
    "His work is such a game-changer! "
    "His books helped to learn so much about how our thoughts impact our biology and how we can all rewire our brains."
)

In [None]:
# Exercise: combine all the documents into a single list.
# There are five blank slots, presumably one for each of D1..D5 defined in the
# previous cell, in that order (confirm).
corpus = [------------, ----------, ----------------, ----------, -----------------------]

In [None]:
# Display the assembled corpus list to check its contents.
corpus

### 2. Text Preprocessing

Steps to preprocess text data:

1. Convert the text into lowercase
2. Split text into words
3. Remove the stop words
4. Remove the punctuation, any symbols and special characters
5. Lemmatization

In [None]:
# Apply preprocessing to the corpus.
# The blanks (-----) in this cell are exercise placeholders to be filled in.

# Stop words: set of English stop words. The blank is presumably the
# `stopwords` corpus imported from nltk.corpus at the top - confirm.
stop = set(-----------------------.words('english'))

# Punctuation: set of the characters in string.punctuation.
# NOTE(review): string.punctuation is ASCII-only, so the typographic
# apostrophe (’) that appears in D3-D5 is not in this set and is not removed.
exclude = set(string.punctuation) 

# Lemmatizer instance. The blank is presumably the WordNetLemmatizer class
# imported at the top - confirm.
lemma = ------------------------------------() 

# One function that runs all preprocessing steps on a single document.
# Takes one document string and returns the cleaned text as one
# space-separated string.
def clean(doc):
    
    # Step 1: call a str method on the document (blank; presumably lower()),
    # split it on whitespace, and keep only the tokens that are not stop words.
    # NOTE(review): this filter runs before punctuation is stripped, so a stop
    # word with punctuation attached (e.g. "so!") does not match and is kept.
    stop_free = " ".join([i for i in doc.----------------------().split() if i not in stop])
    
    # Step 2: drop every character that is in the punctuation set.
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # Step 3: pass each remaining word through the lemmatizer (blank;
    # presumably lemmatize()) and re-join the results with single spaces.
    normalized = " ".join(lemma.-------------------------(word) for word in punc_free.split())  
    return normalized

# Clean every document and split the result into tokens:
# clean_corpus is a list with one list of tokens per document.
clean_corpus = [clean(doc).split() for doc in corpus]   

In [None]:
# Display the tokenised, cleaned corpus (one list of tokens per document).
clean_corpus

### 3. Convert Text into Numerical Representation

Convert the cleaned, preprocessed corpus into a numerical (TF-IDF) array.

In [None]:
# Converting text into a numerical representation.
# The blank is presumably TfidfVectorizer (imported at the top) - confirm.
# clean_corpus already holds lists of tokens, so the tokenizer is an identity
# function that returns each document unchanged, and lowercase=False turns off
# the vectorizer's own lowercasing step.
tf_idf_vectorizer = ----------------------(tokenizer=lambda doc: doc, lowercase=False)

# Display the configured vectorizer.
tf_idf_vectorizer

In [None]:
# Document-term matrix produced by the TF-IDF vectorizer from clean_corpus.
# The blank is presumably fit_transform (learn the vocabulary and transform in
# one call) - confirm.
tf_idf_arr = tf_idf_vectorizer.--------------------(clean_corpus)

In [None]:
# Vocabulary of the whole corpus as learned by the vectorizer.
# The blank is presumably get_feature_names_out() (get_feature_names() in
# older scikit-learn releases) - confirm against the installed version.
vocab_tf_idf = tf_idf_vectorizer.------------------------()

# Display the vocabulary list.
vocab_tf_idf

In [None]:
# Show the vocabulary size (number of entries in vocab_tf_idf).
display(len(vocab_tf_idf))

### 4. Implementation of LDA

To implement LDA, pass the document-term matrix built from the corpus to the model. Above, we obtained the vocabulary (the unique words of the corpus) with the TF-IDF vectorizer; a Count Vectorizer would yield the same unique words, so we could continue with either representation.

In [None]:
# Create the LDA model object and define its settings.
# The blank is presumably LatentDirichletAllocation (imported at the top) - confirm.
# random_state is fixed so that repeated runs give the same result.
lda_model = -------------------------(n_components = 6, max_iter = 20, random_state = 20)

# Fit the model on the TF-IDF matrix (tf_idf_arr, not a count-vectorizer output).
# The blank is presumably fit_transform, which would return the per-document
# topic weights - confirm.
X_topics = lda_model. -------------------------(tf_idf_arr)

# The blank is presumably the fitted `components_` attribute: the cell in 4a
# iterates over topic_words one row per topic and uses each row to index the
# vocabulary - confirm.
topic_words = lda_model. -------------------------

### 4a. Retrieve the Topics

In [None]:
# Number of highest-weighted words to print for every topic.
n_top_words = 5

for i, topic_dist in enumerate(topic_words):

    # np.argsort returns the vocabulary indices ordered by ascending weight,
    # so the most important words of this topic are at the end.
    sorted_topic_dist = np.argsort(topic_dist)

    # Reverse to heaviest-first and keep exactly n_top_words indices.
    # (The earlier slice [:-n_top_words:-1] stopped one element short and
    # produced only n_top_words - 1 words.)
    top_indices = sorted_topic_dist[::-1][:n_top_words]

    # Look the words up in the vocabulary created earlier. The result is stored
    # under its own name so that `topic_words`, the matrix being iterated over,
    # is not overwritten and the cell can be re-run safely.
    top_words = np.array(vocab_tf_idf)[top_indices]
    print("Topic", str(i+1), top_words)

### 4b. Annotating the documents with their topics

In [None]:
# To view which topic is assigned to each document:

# Topic weights for every document: rows are documents, columns are topics.
doc_topic = lda_model.transform(tf_idf_arr)

# Iterate over every document (row) of the matrix.
for n in range(doc_topic.shape[0]):

    # argmax() returns the index of the largest weight in the row,
    # i.e. the 0-based index of the dominant topic for this document.
    topic_doc = doc_topic[n].argmax()

    # Both numbers are printed 1-based so the topic number matches the
    # "Topic k" labels printed when the topics were listed in section 4a
    # (previously the 0-based index was shown, which was off by one).
    print ("Document", n+1, " -- Topic:" ,topic_doc + 1)