In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the movie reviews. Columns labelled '0' and '1' are given descriptive
# names; rename() leaves the frame untouched if those labels are absent.
df = (
    pd.read_csv('movie_data.csv', encoding='utf-8')
    .rename(columns={'0': 'review', '1': 'sentiment'})
)

In [3]:
# Bag-of-words settings:
#   stop_words='english' -> drop common English stop words.
#   max_df=0.1           -> ignore words that appear in more than 10% of the
#                           documents, i.e. words that occur too frequently.
#   max_features=5000    -> limit the vocabulary to the 5000 most frequently
#                           occurring words.
count = CountVectorizer(
    max_features=5000,
    max_df=0.1,
    stop_words='english',
)

# Fit the vocabulary and build the document-term count matrix in one step.
x = count.fit_transform(df['review'].values)

In [4]:
# n_components=10: infer 10 different topics from the documents.
# learning_method='batch': train on all of the training data at once, which is
#   slower than 'online' learning but can give more accurate results.
# random_state=123: fixed seed so the fit is repeatable.
# n_jobs=-1: use all available CPU cores.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='batch',
    random_state=123,
    n_jobs=-1,
)

# Fit the model and get the per-document topic matrix.
x_topics = lda.fit_transform(x)

In [5]:
# Show, one per line: the shape of the document-topic matrix, the shape of the
# topic-word matrix, and the word weights of the first topic.
print(x_topics.shape, lda.components_.shape, lda.components_[0], sep='\n')

# Word indices of the first topic ordered from largest to smallest weight
# (argsort is ascending, hence the reversal).
lda.components_[0].argsort()[::-1]

(50000, 10)
(10, 5000)
[ 81.77512903  91.53368997 355.05632265 ... 256.7160334  176.68221008
  34.36647622]


array([4962, 2896, 3891, ..., 1130, 2917, 3346], dtype=int64)

In [6]:
# Vocabulary learned by the vectorizer; used to turn word indices back into
# the actual words.
feature_names = count.get_feature_names_out()

# Number of most important words to print for each of the 10 topics.
n_top_words = 5


In [7]:
# argsort() orders the word indices of a topic by increasing importance, so the
# result is reversed to put the most important words first before taking the
# top n_top_words of them.
for topic_number, word_weights in enumerate(lda.components_, start=1):
    print(f'Topic {topic_number}:')
    ranked = word_weights.argsort()[::-1]
    top_words = [feature_names[ranked[k]] for k in range(n_top_words)]
    # Each word is followed by a single space, all on one line.
    print(''.join(f'{word} ' for word in top_words))


Topic 1:
worst minutes script awful stupid 
Topic 2:
family mother father children girl 
Topic 3:
american dvd music tv war 
Topic 4:
human audience cinema art feel 
Topic 5:
police guy car dead murder 
Topic 6:
horror house sex woman girl 
Topic 7:
role performance comedy actor performances 
Topic 8:
series episode war episodes season 
Topic 9:
book version original effects special 
Topic 10:
action fight guy guys fun 


<b>The above cell lists the 5 most relevant words for each of the 10 topics generated from the documents.</b>

In [8]:
# Topic 6 (column index 5 of x_topics) looks like a horror category. Rank all
# reviews by their value for that topic so a few top-ranked ones can be read to
# verify that the grouping made by LDA is logically sensible.
horror = x_topics[:, 5].argsort()   # ascending order
horror = horror[::-1]               # flip: strongest horror reviews first
print(horror)
horror.shape

[44203 29185 35613 ... 31616 48404 25114]


(50000,)

In [9]:
# Preview the first 300 characters of the three reviews ranked highest for the
# horror topic.
# `horror` holds positional row numbers produced by argsort, so the review
# column is indexed with .iloc (position-based) rather than a label lookup;
# the two only coincide while df keeps its default 0..n-1 index.
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'Horror movie #{iter_idx + 1}: ')
    print(df['review'].iloc[movie_idx][:300], '...\n')

Horror movie #1: 
Once upon a time in a castle...... Two little girls are playing in the garden's castle. They are sisters. A blonde little girl (Kitty) and a brunette one (Evelyn). Evelyn steals Kitty's doll. Kitty pursues Evelyn. Running through long corridors, they reach the room where their grandfather, sitting o ...

Horror movie #2: 
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

Horror movie #3: 
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a total freakfest from start to finish. A fun freakfest at that, but at times it was a tad too reliant on kitsch rather than the horror. The story is difficult to summarize succinctly: a carefree, normal teenage girl starts coming fac ...



In [10]:
# `horror` is sorted from the highest to the lowest value for the horror topic,
# so its first and last entries identify the two extreme reviews. Taking them
# from `horror` (instead of hard-coding row numbers copied from an earlier
# output) keeps this cell correct if the data or the fitted model changes.

# probability of belonging to horror category is highest
print(df['review'].iloc[horror[0]])

# probability is lowest
print(f'\n{df["review"].iloc[horror[-1]]}')

Once upon a time in a castle...... Two little girls are playing in the garden's castle. They are sisters. A blonde little girl (Kitty) and a brunette one (Evelyn). Evelyn steals Kitty's doll. Kitty pursues Evelyn. Running through long corridors, they reach the room where their grandfather, sitting on an armchair, reads the newspaper. Kitty complains about Evelyn, while Evelyn is looking interestedly at a picture hanging on the wall. Evelyn begins to say repeatedly: "I am the red lady and Kitty is the black lady". Suddenly Evelyn grabs a dagger lying nearby and stabs Kitty's doll and then cuts her (the doll's) head. A fight ensues. And Evelyn almost uses the dagger against Kitty. The grandfather intervenes and the worst is avoided.<br /><br />Later on, their grandfather tells them the legend related to the picture hanging on the wall in front of them, in which a lady dressed in black is stabbing a lady dressed in red:<br /><br />"A long time ago, a red lady and a black lady lived in the