# Import the required libraries

In [1]:
import pandas as pd
import numpy as np

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spaCy
import spacy
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#vis 
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

  from imp import reload


# Import data

In [2]:
imdb_data = pd.read_csv("imdb_labelled.txt",  names=['review','label'], delimiter=".\t")
imdb_data

  imdb_data = pd.read_csv("imdb_labelled.txt",  names=['review','label'], delimiter=".\t")


Unnamed: 0,review,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


In [3]:
review = imdb_data.review.tolist()
review[:10]

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man. ',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out. ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent. ',
 'Very little music or anything to speak of. ',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head. ',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty. ",
 'Wasted two hours. ',
 'Saw the movie today and thought it was a good effort, good messages for kids. ',
 'A bit predictable. ',
 'Loved the casting of Jimmy Buffet as the science teacher. ']

# Prepare data

In [4]:
# create a spacy object, disable parser and ner for the script to run a bit faster
nlp = spacy.load('en_core_web_sm', disable=['parser','ner']) 

#get the list of stop words
stopwords = stopwords.words('english')
stopwords.append('movie')
stopwords.append('film')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# function to preprocess the review
def preprocess(texts, allowed_postags=['NOUN','VERB']):    
    lemmatizer = WordNetLemmatizer()
    filter_sentence = []
    temp = []
    words = gensim.utils.simple_preprocess(str(texts), deacc=True) # gensim - lowercase, tokenize, de-accents
    words = [w for w in words if not w in stopwords] # stopwords removal
    temp = ' '.join(x for x in words) # combine the tokens into a sentence again
    doc = nlp(temp)
    for word in doc: # go through each words in that clean sentence
        if word.pos_ in allowed_postags: # remove words that are not verb or noun
            filter_sentence.append(word.lemma_) # lemmatization
    return filter_sentence

In [6]:
# build the dictionary
corpus = [preprocess(line) for line in review]
corpus[:3]

[['move', 'drift', 'man'],
 ['lose', 'character', 'audience', 'half', 'walk'],
 ['attempt',
  'artiness',
  'camera',
  'angle',
  'disappoint',
  'become',
  'act',
  'plot',
  'line',
  'existent']]

In [7]:
# build the dicationary with gensim
dictionary = corpora.Dictionary(corpus)
len(dictionary)

1435

In [8]:
# convert corpus into bag-of-words format
bow = [dictionary.doc2bow(line) for line in corpus]
print(bow[0][0:20])

[(0, 1), (1, 1), (2, 1)]


# Build model with 2 topics

In [9]:
lda_model_1 = gensim.models.LdaModel(corpus=bow, id2word=dictionary, num_topics=2, passes=8, alpha="auto")

In [10]:
lda_model_1.show_topics(formatted=True, num_topics=2, num_words=20)

[(0,
  '0.025*"see" + 0.013*"make" + 0.012*"scene" + 0.010*"think" + 0.010*"go" + 0.009*"script" + 0.008*"movie" + 0.008*"give" + 0.007*"film" + 0.007*"end" + 0.007*"know" + 0.007*"find" + 0.006*"recommend" + 0.006*"line" + 0.006*"love" + 0.006*"plot" + 0.006*"thing" + 0.005*"people" + 0.005*"suck" + 0.005*"cast"'),
 (1,
  '0.018*"character" + 0.015*"time" + 0.015*"watch" + 0.013*"act" + 0.010*"make" + 0.010*"get" + 0.010*"work" + 0.007*"play" + 0.007*"way" + 0.007*"show" + 0.007*"waste" + 0.007*"story" + 0.007*"love" + 0.006*"actor" + 0.006*"look" + 0.006*"write" + 0.006*"performance" + 0.005*"screen" + 0.005*"feel" + 0.005*"see"')]

In [11]:
# Visualize model with 2 topics
pyLDAvis.enable_notebook() #allow us to see in jupyter notebook
vis1 = pyLDAvis.gensim_models.prepare(lda_model_1, bow, dictionary)
vis1

  default_term_info = default_term_info.sort_values(


# Build model with 5 topics

In [12]:
lda_model_2 = gensim.models.LdaModel(corpus=bow, id2word=dictionary, num_topics=5, passes=8, alpha="auto")

In [13]:
lda_model_2.show_topics(formatted=True, num_topics=5, num_words=20)

[(0,
  '0.012*"act" + 0.010*"scene" + 0.010*"part" + 0.010*"play" + 0.010*"year" + 0.007*"leave" + 0.006*"comedy" + 0.006*"fan" + 0.006*"way" + 0.006*"fail" + 0.005*"watch" + 0.005*"story" + 0.005*"drama" + 0.005*"scamp" + 0.005*"sound" + 0.005*"death" + 0.005*"know" + 0.005*"rating" + 0.005*"dislike" + 0.005*"suspense"'),
 (1,
  '0.027*"watch" + 0.026*"time" + 0.021*"love" + 0.016*"character" + 0.014*"line" + 0.014*"script" + 0.013*"waste" + 0.011*"cast" + 0.010*"enjoy" + 0.010*"story" + 0.007*"make" + 0.007*"performance" + 0.006*"way" + 0.006*"see" + 0.006*"look" + 0.006*"show" + 0.006*"write" + 0.006*"know" + 0.006*"game" + 0.005*"level"'),
 (2,
  '0.021*"get" + 0.014*"see" + 0.013*"make" + 0.012*"plot" + 0.012*"thing" + 0.011*"character" + 0.009*"work" + 0.008*"scene" + 0.007*"script" + 0.007*"director" + 0.007*"show" + 0.006*"say" + 0.006*"actor" + 0.006*"play" + 0.006*"watch" + 0.006*"seem" + 0.006*"garbage" + 0.005*"act" + 0.005*"world" + 0.005*"place"'),
 (3,
  '0.019*"act" + 0

In [14]:
# Visualize model with 5 topics
vis2 = pyLDAvis.gensim_models.prepare(lda_model_2, bow, dictionary)
vis2

  default_term_info = default_term_info.sort_values(


# Build model with 10 topics

In [15]:
lda_model_3 = gensim.models.LdaModel(corpus=bow, id2word=dictionary, num_topics=10, passes=8, alpha="auto")

In [16]:
lda_model_3.show_topics(formatted=True, num_topics=10, num_words=20)

[(0,
  '0.029*"give" + 0.028*"character" + 0.026*"make" + 0.022*"actor" + 0.020*"cast" + 0.019*"performance" + 0.018*"watch" + 0.017*"play" + 0.012*"job" + 0.012*"feel" + 0.012*"line" + 0.010*"story" + 0.009*"think" + 0.009*"show" + 0.007*"plot" + 0.007*"time" + 0.007*"scamp" + 0.007*"go" + 0.007*"deliver" + 0.006*"script"'),
 (1,
  '0.033*"end" + 0.026*"act" + 0.017*"go" + 0.015*"think" + 0.012*"see" + 0.012*"make" + 0.011*"music" + 0.010*"people" + 0.010*"sound" + 0.010*"time" + 0.009*"movie" + 0.009*"find" + 0.008*"love" + 0.008*"write" + 0.008*"death" + 0.007*"budget" + 0.007*"scene" + 0.007*"live" + 0.006*"experience" + 0.006*"portrayal"'),
 (2,
  '0.024*"make" + 0.019*"screen" + 0.016*"character" + 0.015*"plot" + 0.012*"scene" + 0.012*"see" + 0.012*"act" + 0.011*"say" + 0.011*"use" + 0.010*"get" + 0.010*"lack" + 0.009*"actor" + 0.008*"way" + 0.008*"write" + 0.007*"work" + 0.007*"time" + 0.007*"cinematography" + 0.006*"cartoon" + 0.006*"fact" + 0.006*"go"'),
 (3,
  '0.045*"see" + 

In [17]:
# Visualize model with 10 topics
vis3 = pyLDAvis.gensim_models.prepare(lda_model_3, bow, dictionary)
vis3

  default_term_info = default_term_info.sort_values(
