In [2]:
# importing all the dependencies
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import string
import pickle
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
#cleaning the dataset
with open("reviews.pkl", 'rb') as data:
    reviews = pickle.load(data)
print(str(len(reviews)))

stopwords_set = set(stopwords.words('english'))
print(stopwords_set)
punctuations = set(string.punctuation)
print(punctuations)
lemmatizer = WordNetLemmatizer()

def clean_review(review):
    # Removing stopwords
    s = " ".join([word for word in review.lower().split() if word not in stopwords_set])
    # Removing punctuations
    p = ''.join(letter for letter in s if letter not in punctuations)
    # Lemmatizing words eg. loves -> love
    out = " ".join(lemmatizer.lemmatize(word) for word in p.split())
    return out


cleaned_reviews = [clean_review(review).split() for review in reviews]

1222
{'now', 'not', 'being', 'hers', 'its', 'himself', 'too', 'more', 'me', 'a', 'aren', 'or', 'again', 'once', 'hadn', 'such', 'but', 'should', 'where', 'all', 'd', 'ourselves', 'further', 'didn', 'out', 'into', 'over', 'most', 'few', 'after', 'can', 'mustn', 'those', 'll', 'yours', 'did', 'having', 'themselves', 'doesn', 'i', 'so', 'an', 'him', 'isn', 'each', 'herself', 'they', 'until', 'no', 'wouldn', 'was', 'other', 'ain', 'then', 'is', 'yourselves', 'any', 'our', 'this', 'were', 'as', 'shan', 'be', 'am', 'haven', 'in', 'what', 'both', 'had', 'will', 'and', 'who', 's', 'wasn', 'them', 'these', 'her', 'for', 'myself', 'some', 'than', 'couldn', 'does', 'don', 'it', 'of', 'only', 'itself', 'during', 'there', 'just', 'if', 'are', 'that', 'while', 'm', 'how', 'down', 'whom', 'yourself', 've', 'we', 'she', 'doing', 'with', 'below', 'weren', 'mightn', 'theirs', 'y', 'needn', 'by', 'shouldn', 'which', 'been', 're', 'he', 'do', 'above', 'o', 'against', 'nor', 'your', 'has', 'you', 'between'

In [8]:
# length of cleaned reviews
len(cleaned_reviews)

1222

In [9]:
# Creating a dictionary
dictionary = corpora.Dictionary(cleaned_reviews)

In [10]:
inp = [dictionary.doc2bow(review) for review in cleaned_reviews]

In [11]:
# Creating lda model
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(inp, num_topics=5, id2word=dictionary, passes=75)
a = ldamodel.print_topics(num_topics=5, num_words=5)

In [20]:
print(a)

[(0, 0.03292397), (2, 0.06108385), (1, 0.089471988), (4, 0.25228694), (3, 0.56423324)]


In [14]:
# inference
# Copy the input review in the test variable

test = "So, Airlift has one of Akshay Kumar's best performances - along with other sterling acts. Rich Indian-origin businessman Ranjit Katyal (Akshay) loves profits and dislikes the idea of India. But Ranjit's golden life in the desert crumbles when Iraqi forces invade Kuwait. Suddenly, people are savagely shot in the streets, houses looted, buildings blown up, tanks taking over, choppers hovering maliciously amidst minarets. Airlift features Akshay at his best - based on real-life characters, there's little khiladi-wala swag in Akshay's performance and more mature control. As Ranjit, who goes from protecting his kin to his countrymen, Akshay does a polished, restrained, powerful job. Nimrat conveys brittle edginess - she asks Ranjit, Ye Indian-Indian khelna, kya drama ho raha hai? - but grows into a woman who loves her husband's humanitarian heart. Certain cameos are outstanding. Prakash Belawadi brings alive surly, suspicious George, Kumud Mishra deeply impresses as a quietly determined MEA babu and Inaamulhuq oozes smooth menace as he quotes 'Amytabh Bachchan' to Ranjit - before showing him his business partner, hanging from a crane outside a palace full of bloodied marble and broken glass. Airlift's scale is impressive and editing (Hemanti Sarkar) deft. Some sequences - Iraqi soldiers brutally molesting an Indian girl, looting even onions, cheerily singing 'Ek Do Teen', dragging out a young mother, so the boys can have some fun with her - are intense. But the movie could've increased this intensity, the horror and taut, time-ticking dread that typify unforgettable siege/rescue films like Hotel Rwanda (2004), where you vividly felt humanity running out each second. Airlift depicts desperation but with more sound and light than darkness, more broad strokes than fine detail. Yet, Airlift works because it conveys a time when armies will attack civilians - you're struck by how IS was born from the Iraqi army's core - and raises Bollywood's generic bar. Plus, it movingly celebrates the most beautiful flag in the world."
test = clean_review(test).split()
test

['so',
 'airlift',
 'one',
 'akshay',
 'kumars',
 'best',
 'performance',
 'along',
 'sterling',
 'act',
 'rich',
 'indianorigin',
 'businessman',
 'ranjit',
 'katyal',
 'akshay',
 'love',
 'profit',
 'dislike',
 'idea',
 'india',
 'ranjits',
 'golden',
 'life',
 'desert',
 'crumbles',
 'iraqi',
 'force',
 'invade',
 'kuwait',
 'suddenly',
 'people',
 'savagely',
 'shot',
 'street',
 'house',
 'looted',
 'building',
 'blown',
 'up',
 'tank',
 'taking',
 'over',
 'chopper',
 'hovering',
 'maliciously',
 'amidst',
 'minaret',
 'airlift',
 'feature',
 'akshay',
 'best',
 'based',
 'reallife',
 'character',
 'there',
 'little',
 'khiladiwala',
 'swag',
 'akshays',
 'performance',
 'mature',
 'control',
 'ranjit',
 'go',
 'protecting',
 'kin',
 'countryman',
 'akshay',
 'polished',
 'restrained',
 'powerful',
 'job',
 'nimrat',
 'conveys',
 'brittle',
 'edginess',
 'asks',
 'ranjit',
 'ye',
 'indianindian',
 'khelna',
 'kya',
 'drama',
 'ho',
 'raha',
 'hai',
 'grows',
 'woman',
 'love',
 '

In [15]:

test = dictionary.doc2bow(test)
print(ldamodel[test])
a = list(sorted(ldamodel[test], key=lambda x: x[1]))

[(0, 0.032923248), (1, 0.089477375), (2, 0.061083086), (3, 0.56423235), (4, 0.25228399)]


In [16]:
a

[(0, 0.03292397),
 (2, 0.06108385),
 (1, 0.089471988),
 (4, 0.25228694),
 (3, 0.56423324)]

In [17]:
# Least related topic to the test review is the first element of the sorted list a
print("The words associated with least related topic to the test review are")
ldamodel.print_topic(a[0][0])


The words associated with least related topic to the test review are


'0.003*"life" + 0.002*"character" + 0.002*"love" + 0.001*"film" + 0.001*"food" + 0.001*"aur" + 0.001*"though" + 0.001*"one" + 0.001*"look" + 0.001*"hai"'

In [18]:
# Most probable topic related to the test review is the last element of the sorted list a
print("The words associated with most probable topic related to the test review are")
ldamodel.print_topic(a[-1][0])

The words associated with most probable topic related to the test review are


'0.017*"film" + 0.010*"one" + 0.010*"character" + 0.009*"like" + 0.007*"make" + 0.007*"story" + 0.006*"even" + 0.006*"it" + 0.006*"audience" + 0.005*"part"'