In [26]:
import numpy as np
import csv
from nltk.corpus import stopwords
from collections import Counter
import re

In [27]:
train = dict()
train['positive'] = []
train['negative'] = []
with open('IMDB Dataset.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for review_sentiment in reader:
        review = review_sentiment[0]
        review = review.lower()
        review = re.sub(r'[^a-z\s]', '', review)
        review = re.sub(r'\s+', ' ', review)
        review = review.split()
        sentiment = review_sentiment[1]
        train[sentiment].append(review)



In [28]:
# CREATE FREQUENCY TABLE
frequencies = dict()
for class_name, texts in train.items():
    bigram_tokens = []
    for text in texts:
        for i in range(len(text)-1):
            bigram_tokens.append(text[i] + " " + text[i+1])
    frequencies[class_name] = Counter(bigram_tokens)

In [29]:
# REMOVE STOPWORDS FROM TABLE
stop_words = set(stopwords.words('english'))
cleaned_frequencies = {}
for class_name, counts in frequencies.items():
    cleaned_frequencies[class_name] = Counter()
    for bigram, count in counts.items():
        token_pair = bigram.split(" ")
        if (token_pair[0] not in stop_words or token_pair[1] not in stop_words):
            cleaned_frequencies[class_name][bigram] = count

In [30]:
# SMOOTH THE FREQUENCIES USING LAPLACE
smoothing_constant = 0.01
likelihood = {}
for class_name, counts in cleaned_frequencies.items():
    likelihood[class_name] = {}
    count_total = sum(counts.values())
    for bigram, count in counts.items():
        likelihood[class_name][bigram] = (count+smoothing_constant)/(count_total + (smoothing_constant*(len(counts)+1)))
    likelihood[class_name]['OOV'] = smoothing_constant/(count_total + (smoothing_constant*(len(counts)+1)))

In [31]:
# CLASSIFY A REVIEW
def classify(review, likelihood, prior):
    pos_total = np.log(prior)
    neg_total = np.log(1-prior)
    for k in range(len(text)-1):
        if (text[k] not in stop_words or text[k+1] not in stop_words):
            if ((text[k] + " " + text[k+1]) in likelihood["positive"]):
                pos_total += np.log(likelihood["positive"][text[k] + " " + text[k+1]])
            else:
                pos_total += np.log(likelihood["positive"]["OOV"])
            if ((text[k] + " " + text[k+1]) in likelihood["negative"]):
                neg_total += np.log(likelihood["negative"][text[k] + " " + text[k+1]])
            else:
                neg_total += np.log(likelihood["negative"]["OOV"])
    if (pos_total > neg_total):
        return ("Positive", pos_total, neg_total)
    else:
        return ("Negative", pos_total, neg_total)

In [32]:
review = "For anyone with a hunger for real science fiction rather than the crowd-pleasing, watered-down version Hollywood typically offers (and that I often enjoy immensely), Interstellar is a satisfying entrée. I'd rank this alongside Memento and The Dark Knight as the best Nolan has done, and it's an immediate contender for one of 2014's best. The film deserves the label of an experience and the bigger the venue, the more immersive it will be. As event movies go, this is one of the most unique and mesmerizing."
print(classify(review, likelihood=likelihood, prior=0.5))

('Negative', -1260.0778418740128, -1179.8617773048038)


In [36]:
reviews = ["Hello World"]
pos_count = 0
neg_count = 0
for r1 in reviews:
    if (classify(r1, likelihood, prior = 0.5)[0] == "Positive"):
        pos_count += 1
    else:
        neg_count += 1
if (pos_count/(pos_count+neg_count) > 0.85):
    print("We recommend this movie!")
else:
    print("We do not recommend this movie")
print(f'Approximately {(pos_count/(pos_count+neg_count)) * 100}% of this movie\'s reviews had positive sentiment')

We do not recommend this movie
Approximately 0.0% of this movie's reviews had positive sentiment
