<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/topic_model_LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

import re
from nltk.corpus import stopwords

# Fetch only the NLTK corpora this notebook uses so that a fresh kernel
# (e.g. a new Colab runtime) can run top to bottom. "stopwords" backs the list
# below; "wordnet" backs the WordNetLemmatizer used during preprocessing, and
# "omw-1.4" is additionally required by some NLTK releases for WordNet lookups.
# quiet=True suppresses the downloader's progress output.
for _nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource, quiet=True)

# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

from nltk.stem.wordnet import WordNetLemmatizer 

In [None]:
# Load the review dataset; the relative path is resolved against the kernel's
# current working directory.
movie = pd.read_csv('imdb1.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
movie.head()

Unnamed: 0,review,sentiment
0,"Ron Hall pulls a triple threat as he writes, d...",negative
1,"The first in the series was brilliant, easily ...",negative
2,I LOVED this movie because Bobbie Phillips can...,positive
3,I was @ 13 yrs of age when I saw this greatly ...,positive
4,The Coen Brothers have truly outdone themselve...,positive


In [None]:
# (n_rows, n_columns) of the loaded frame.
movie.shape

(2000, 2)

In [None]:
# Corpus-specific filler terms to drop on top of NLTK's English stop words:
# generic review vocabulary ("movie", "film", "good", ...) plus "br"
# (presumably residue of <br /> markup in the raw reviews - confirm against the data).
new_words = [
    "some", "one", "like", "time", "br", "movie", "film", "could", "good", "even", "get", "would",
    "make", "really", "see", "well", "much", "great", "first", "people", "also", "bad",
    "show", "way", "thing", "made", "go", "think", "know", "watch", "look", "many",
]

# Set union, so re-running this cell leaves stop_words unchanged.
stop_words = stop_words | set(new_words)

In [None]:
#Text pre-processing
# Turns each raw review into a space-joined string of lemmatised, stop-word-free
# tokens and collects the results in `corpus` (one entry per row of `movie`).

# Built once: the lemmatizer and the pattern do not change between reviews.
lm = WordNetLemmatizer()
non_letter = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame having a 0..n-1 index.
for review in movie['review']:
    # Replace every non-letter (punctuation, digits, markup symbols) with a
    # space, lowercase, then split on whitespace.
    tokens = non_letter.sub(' ', review).lower().split()

    # Stop words are filtered BEFORE lemmatising, so an inflected form can pass
    # the filter and then be reduced to a stop word by the lemmatizer.
    # lemmatize() is called without a POS tag, so NLTK applies its default POS.
    lemmas = [lm.lemmatize(word) for word in tokens if word not in stop_words]

    corpus.append(" ".join(lemmas))

In [None]:
## TF-IDF vectoriser
# max_df=0.9 ignores terms that occur in more than 90% of the documents.
# stop_words is applied again here because the preprocessing cell filters stop
# words before lemmatising, so lemmatised forms can still land on a stop word.
# scikit-learn validates `stop_words` as 'english', a list, or None; the set is
# therefore converted to a (sorted, for determinism) list.
tfidf = TfidfVectorizer(max_df=0.9,
                        stop_words=sorted(stop_words))

# Sparse document-term matrix: one row per entry in `corpus`.
data_vectorized = tfidf.fit_transform(corpus)

In [None]:
# (n_documents, n_vocabulary_terms) of the TF-IDF matrix.
print(data_vectorized.shape)

(2000, 21984)


In [None]:
# Build a Latent Semantic Analysis (LSA) model: a truncated SVD of the TF-IDF
# matrix. (LatentDirichletAllocation is imported above but is not used here.)
# n_components is the number of latent topics kept; random_state fixes the seed
# of the randomized solver so re-runs produce the same components.

lsa_model = TruncatedSVD(n_components=5, 
                         algorithm='randomized', 
                         n_iter=100, 
                         random_state=122)

# Fit on the TF-IDF matrix and return each document's coordinates in topic space.
lsa_movie = lsa_model.fit_transform(data_vectorized)

print(lsa_movie.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(2000, 5)


In [None]:
#Function to print topic and top 8 words in the topic
def print_topics(model, vectorizer, top_n=8):
    """Print every component of a fitted model with its highest-weighted terms.

    Parameters
    ----------
    model : fitted estimator exposing ``components_`` (rows = topics,
        columns = vocabulary terms), e.g. TruncatedSVD.
    vectorizer : fitted vectorizer whose feature names label the columns of
        ``model.components_``.
    top_n : int, default 8
        Number of terms to show per topic.

    Prints, for each topic, a ``Topic <idx>:`` header followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    Returns None.
    """
    # Resolve the vocabulary once, from the vectorizer that was passed in.
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back only for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))

        # Indices of the top_n largest weights, largest first.
        top_indices = topic.argsort()[::-1][:top_n]

        # Cast to builtin str/float so the printed list stays readable
        # regardless of how NumPy renders its scalar types.
        print([(str(feature_names[i]), float(topic[i]))
               for i in top_indices])
 
# Show the highest-weighted terms (default top_n=8) of each LSA component.
print_topics(lsa_model, tfidf)

Topic 0:
[('character', 0.14566058660680198), ('story', 0.12720111978111562), ('scene', 0.11512288654653778), ('acting', 0.09938736787428365), ('actor', 0.09687422452193512), ('plot', 0.09637554805006708), ('seen', 0.09421751479425959), ('life', 0.09296823192592164)]
Topic 1:
[('worst', 0.19755692284191495), ('acting', 0.17836265491996142), ('ever', 0.17518276829469645), ('waste', 0.17310339384349455), ('horror', 0.1550266489615715), ('stupid', 0.12352639064618959), ('terrible', 0.11667020553802594), ('awful', 0.11625179044320433)]
Topic 2:
[('horror', 0.43830887116797496), ('zombie', 0.12638104365445096), ('house', 0.10362522999147163), ('vampire', 0.09579483403618068), ('effect', 0.0956790203833514), ('scary', 0.09381140492708936), ('creepy', 0.08939879299467823), ('ghost', 0.08601968911547767)]
Topic 3:
[('funny', 0.26547537470198634), ('kid', 0.23698350462245457), ('child', 0.14627452768058125), ('year', 0.13108401389224478), ('family', 0.12098520968502469), ('fun', 0.1201253367242

In [None]:
# Compact view: one line per topic listing its n_top_words highest-weighted terms.
n_top_words = 10

# Resolve the vocabulary once instead of once per printed term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back only for older releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for topic_idx, topic in enumerate(lsa_model.components_):
    # argsort is ascending, so walk the last n_top_words indices backwards to
    # get the largest weights first.
    top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic {}:".format(topic_idx), " ".join(top_terms))

Topic 0: character story scene acting actor plot seen life better love
Topic 1: worst acting ever waste horror stupid terrible awful watching seen
Topic 2: horror zombie house vampire effect scary creepy ghost gore blood
Topic 3: funny kid child year family fun love old remember guy
Topic 4: funny comedy role laugh play fun actor episode joke cast
