In [1]:
import re
import math
import operator
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'pandas'

### I have downloaded the complete dataset from Kaggle: https://www.kaggle.com/datasets/ulrikthygepedersen/rotten-tomatoes-reviews

In [274]:
# Ref.: https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte
df = pd.read_csv("Downloads/rt_reviews.csv", encoding="latin-1")

In [275]:
# A peek at the Dataset
df.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [276]:
print("Number of Reviews: ", df.shape[0])

Number of Reviews:  480000


In [277]:
print("Are there any null values?")
for column in df.columns:
    print(column, ":  number of null values=", df[column].isna().sum())

Are there any null values?
Freshness :  number of null values= 0
Review :  number of null values= 0


### Divide the dataset as train, development, and test.
- Please note that the dataset size is huge.
- To avoid computational resource constrainings, I am considering 20% of the entire dataset for this assignment.

In [278]:
# Splitting the Data into Training and Test Sets
# As the dataset is large I am selecting 20% reviews to train, dev and test the model.

df1, df2 = train_test_split(df, stratify=df["Freshness"], test_size=0.8)

In [279]:
# I will be developing the model and testing it with df1 data only.

print("Number of rows in df1 DataFrame: ", df1.shape)

Number of rows in df1 DataFrame:  (96000, 2)


In [280]:
# Distrutbution of Freshness column
df1["Freshness"].value_counts()

rotten    48000
fresh     48000
Name: Freshness, dtype: int64

In [281]:
df1 = df1.reset_index(drop=True)

In [282]:
df1

Unnamed: 0,Freshness,Review
0,rotten,Drive a stake through the heart of this stink...
1,fresh,A political drama that assumes you'll be able...
2,rotten,"How about next time, instead of telling us ho..."
3,fresh,Perhaps the most salient and unsung thread in...
4,fresh,This battle of the voices [Steve Coogan and R...
...,...,...
95995,rotten,"As Cuarï¿½ï¿½n spells out in his int""Life in ..."
95996,fresh,This is just the movie for two hours of mindl...
95997,fresh,"Certainly an enjoyable experiment, but ultima..."
95998,rotten,"Again and again, the I Love Trouble script ta..."


#### Splitting the Dataset into Train and Test: 80-20 Ratio

In [283]:
# Actual Train Test Split
train_df, test_df = train_test_split(df1, stratify=df1["Freshness"], test_size=0.2)

# Resetting the indices.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

#### Splitting the Train Dataset into Train and Dev: 80-20 Ratio

In [284]:
# Actual Train Dev Split
train_df, dev_df = train_test_split(train_df, stratify=train_df["Freshness"], test_size=0.2)

# Resetting the indices.
train_df = train_df.reset_index(drop=True)
dev_df = dev_df.reset_index(drop=True)

In [285]:
print("Training DataSize: ", train_df.shape[0])
print("Dev DataSize: ", dev_df.shape[0])
print("Testing DataSize: ", test_df.shape[0])

Training DataSize:  61440
Dev DataSize:  15360
Testing DataSize:  19200


#### Preprocessing the Train, Dev and Test Set Reviews

In [286]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_review(review):
    # Stripping leading and trailing empty space characters
    # Converting the text to lower case letters
    review = review.strip().lower().split()
    
    # Removing stopwords
    review = " ".join([word for word in review if word not in stop_words])
    
    # Selecting only word with alphabet characters
    review = re.findall(r'[a-z]+', review)
    
    # Stemming the words and including a word if it has more than two characters
    review = [ps.stem(word) for word in review if len(ps.stem(word)) > 2]
    
    # Returning the list of words
    return review

##### Train Data Preprocessing

In [287]:
train_df["preprocessed_review"] = train_df["Review"].apply(preprocess_review)

##### Dev Data Preprocessing

In [288]:
dev_df["preprocessed_review"] = dev_df["Review"].apply(preprocess_review)

##### Test Data Preprocessing

In [289]:
test_df["preprocessed_review"] = test_df["Review"].apply(preprocess_review)

#### Building Vocabulary List

In [290]:
all_reviews_string = ""


# Iterating through all the samples in the training DataFrame
for i in tqdm(range(len(train_df))):
    
    # Retrieve the review
    review = train_df.loc[i, "preprocessed_review"]
    
    all_reviews_string = all_reviews_string + " " + " ".join(review)
    
    
vocabulary_list = list(set(all_reviews_string.split()))
print("Total Number of words in vocabulary list: ", len(vocabulary_list))

100%|██████████| 61440/61440 [00:49<00:00, 1245.21it/s] 


Total Number of words in vocabulary list:  29037


##### Omitting rare words if the occurrence is less than five times.

In [291]:
vocab_count = {}

for word in all_reviews_string.split():
    if word not in vocab_count.keys():
        vocab_count[word] = 1
    else:
        vocab_count[word] += 1
        
vocab_count = {k:v for k, v in vocab_count.items() if v<5}

print("Total Number of words in Vocabulary List after removing Rare Words: ", len(vocab_count.keys()))

Total Number of words in Vocabulary List after removing Rare Words:  17748


#### Calculate the following probability

##### Probability of the Occurrence

In [292]:
# Counting the total review documents in the training set
total_reviews = len(train_df)

# Computing the probability of occurrence of each word across all the documents
prob_of_occurrence = {k:v/total_reviews for k, v in vocab_count.items()}

##### Conditional Probability of the Sentiment

In [293]:
# Dictionary to keep track of the words that appear in "fresh" class and their probabilities
fresh_class_words = {}

fresh_df = train_df[train_df["Freshness"]=="fresh"]
for word in tqdm(vocab_count.keys()):
    mask = fresh_df["preprocessed_review"].apply(lambda x: True if word in x else False)
    if sum(mask) > 0:
        fresh_class_words[word] = sum(mask)/len(fresh_df)

100%|██████████| 17748/17748 [06:09<00:00, 48.07it/s]


In [296]:
# Dictionary to keep track of the words that appear in "rotten" class and their probabilities
rotten_class_words = {}

rotten_df = train_df[train_df["Freshness"]=="rotten"]
for word in tqdm(vocab_count.keys()):
    mask = rotten_df["preprocessed_review"].apply(lambda x: True if word in x else False)
    if sum(mask) > 0:
        rotten_class_words[word] = sum(mask)/len(rotten_df)

100%|██████████| 17748/17748 [06:05<00:00, 48.54it/s]


##### Computing Class Probabilities

In [297]:
data_distribution = dict(train_df["Freshness"].value_counts())

print("Training Class Dirstirbution Counts: ", data_distribution)

# Storing the total number of reviews in a variable
total_samples = train_df.shape[0]

# Computing the Class Probabilities
class_probabilities = {k:(v/total_samples) for k, v in data_distribution.items()}

print("Class-wise Probability Distributions")
for key, value in class_probabilities.items():
    print("Class=", key, " Probability=", value)
    
    
# Computing the Log of Class-wise probabilities
fresh_prob = math.log(class_probabilities["fresh"])
rotten_prob = math.log(class_probabilities["rotten"])

print("\n")
print("Probabilities after applying Log Transformations")
print("Class= rotten, Probability= ", rotten_prob)
print("Class= fresh, Probability= ", fresh_prob)

Training Class Dirstirbution Counts:  {'rotten': 30720, 'fresh': 30720}
Class-wise Probability Distributions
Class= rotten  Probability= 0.5
Class= fresh  Probability= 0.5


Probabilities after applying Log Transformations
Class= rotten, Probability=  -0.6931471805599453
Class= fresh, Probability=  -0.6931471805599453


#### Calculate accuracy using dev dataset

In [298]:
y_pred = []

# Iterating through each of the dev set reviews.
for review in tqdm(dev_df["preprocessed_review"]):
        
    # Initializing the prob_fresh and prob_rotten with the class probabilities.
    prob_fresh = fresh_prob
    prob_rotten = rotten_prob
    
    # Iterating through each word in the review.
    for word in review:
        
        # If the word is in fresh class words...
        if word in fresh_class_words.keys():
            prob_fresh += math.log(fresh_class_words[word])
            
        # If the word is in rotten class words..    
        elif word in rotten_class_words.keys():
            prob_rotten += math.log(rotten_class_words[word])
            
    # If the probability of belonging to fresh class is greater than rotten class probability, assign it to fresh class.
    if prob_fresh > prob_rotten:
        y_pred.append("fresh")
        
    # Assign rotten class as target label.
    else:
        y_pred.append("rotten")

100%|██████████| 15360/15360 [00:00<00:00, 195985.38it/s]


In [299]:
count_of_correct_predictions = 0

for actual_class, predicted_class in zip(dev_df["Freshness"], y_pred):
    if actual_class==predicted_class:
        count_of_correct_predictions += 1

print("Accuracy of the Naive Bayes Model on the Dev Set is: ", count_of_correct_predictions/len(y_pred))

Accuracy of the Naive Bayes Model on the Dev Set is:  0.47194010416666665


#### Compare the Effect of Smoothing

In [302]:
alpha = 1

fresh_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in fresh_class_words.items()}
rotten_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in rotten_class_words.items()}

y_pred = []

# Iterating through each of the dev set reviews.
for review in tqdm(dev_df["preprocessed_review"]):
    
    # Initializing the prob_fresh and prob_rotten with the class probabilities.
    prob_fresh = fresh_prob
    prob_rotten = rotten_prob
    
    # Iterating through each word in the review.
    for word in review:
        
        # If the word is in fresh class words...
        if word in fresh_class_words2.keys():
            prob_fresh += math.log(fresh_class_words2[word])
            
        # If the word is in rotten class words..    
        elif word in rotten_class_words2.keys():
            prob_rotten += math.log(rotten_class_words2[word])
            
    # If the probability of belonging to fresh class is greater than rotten class probability, assign it to fresh class.
    if prob_fresh > prob_rotten:
        y_pred.append("fresh")
        
    # Assign rotten class as target label.
    else:
        y_pred.append("rotten")
        

count_of_correct_predictions = 0

for actual_class, predicted_class in zip(dev_df["Freshness"], y_pred):
    if actual_class==predicted_class:
        count_of_correct_predictions += 1

print("Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is: ", count_of_correct_predictions/len(y_pred))

100%|██████████| 15360/15360 [00:00<00:00, 237484.32it/s]

Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is:  0.47194010416666665





In [303]:
alpha = 100

fresh_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in fresh_class_words.items()}
rotten_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in rotten_class_words.items()}

y_pred = []

# Iterating through each of the dev set reviews.
for review in tqdm(dev_df["preprocessed_review"]):
    
    # Initializing the prob_fresh and prob_rotten with the class probabilities.
    prob_fresh = fresh_prob
    prob_rotten = rotten_prob
    
    # Iterating through each word in the review.
    for word in review:
        
        # If the word is in fresh class words...
        if word in fresh_class_words2.keys():
            prob_fresh += math.log(fresh_class_words2[word])
            
        # If the word is in rotten class words..    
        elif word in rotten_class_words2.keys():
            prob_rotten += math.log(rotten_class_words2[word])
            
    # If the probability of belonging to fresh class is greater than rotten class probability, assign it to fresh class.
    if prob_fresh > prob_rotten:
        y_pred.append("fresh")
        
    # Assign rotten class as target label.
    else:
        y_pred.append("rotten")
        

count_of_correct_predictions = 0

for actual_class, predicted_class in zip(dev_df["Freshness"], y_pred):
    if actual_class==predicted_class:
        count_of_correct_predictions += 1

print("Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is: ", count_of_correct_predictions/len(y_pred))

100%|██████████| 15360/15360 [00:00<00:00, 233104.57it/s]

Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is:  0.47194010416666665





In [304]:
alpha = 1000

fresh_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in fresh_class_words.items()}
rotten_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in rotten_class_words.items()}

y_pred = []

# Iterating through each of the dev set reviews.
for review in tqdm(dev_df["preprocessed_review"]):
    
    # Initializing the prob_fresh and prob_rotten with the class probabilities.
    prob_fresh = fresh_prob
    prob_rotten = rotten_prob
    
    # Iterating through each word in the review.
    for word in review:
        
        # If the word is in fresh class words...
        if word in fresh_class_words2.keys():
            prob_fresh += math.log(fresh_class_words2[word])
            
        # If the word is in rotten class words..    
        elif word in rotten_class_words2.keys():
            prob_rotten += math.log(rotten_class_words2[word])
            
    # If the probability of belonging to fresh class is greater than rotten class probability, assign it to fresh class.
    if prob_fresh > prob_rotten:
        y_pred.append("fresh")
        
    # Assign rotten class as target label.
    else:
        y_pred.append("rotten")
        

count_of_correct_predictions = 0

for actual_class, predicted_class in zip(dev_df["Freshness"], y_pred):
    if actual_class==predicted_class:
        count_of_correct_predictions += 1

print("Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is: ", count_of_correct_predictions/len(y_pred))

100%|██████████| 15360/15360 [00:00<00:00, 231875.46it/s]

Accuracy of the Naive Bayes Model on the Dev Set with Laplacian Smoothing is:  0.47194010416666665





##### Top 10 words that predicts each class - without Laplacian Smoothing

In [305]:
# Printing top-10 words in the fresh_class_words
sorted_fcw = dict(sorted(fresh_class_words.items(), key=operator.itemgetter(1), reverse=True))

# top-10 most probable words "fresh" class and their probabilities
for word in list(sorted_fcw.keys())[:10]:
    print(word, sorted_fcw[word])

burli 0.00013020833333333333
hardscrabbl 0.00013020833333333333
silicon 0.00013020833333333333
whoa 0.00013020833333333333
paparazzi 0.00013020833333333333
chad 0.00013020833333333333
rousingli 0.00013020833333333333
havana 0.00013020833333333333
colman 0.00013020833333333333
ennobl 0.00013020833333333333


In [306]:
# Printing top-10 words in the rotten_class_words
sorted_rcw = dict(sorted(rotten_class_words.items(), key=operator.itemgetter(1), reverse=True))

# top-10 most probable words "rotten" class and their probabilities
for word in list(sorted_rcw.keys())[:10]:
    print(word, sorted_rcw[word])

bangkok 0.00013020833333333333
sox 0.00013020833333333333
ter 0.00013020833333333333
grime 0.00013020833333333333
objectif 0.00013020833333333333
marmaduk 0.00013020833333333333
hoser 0.00013020833333333333
diretor 0.00013020833333333333
oldi 0.00013020833333333333
pretext 0.00013020833333333333


#### Using the test dataset

In [307]:
alpha = 1

fresh_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in fresh_class_words.items()}
rotten_class_words2 = {k:(v+alpha)/(total_samples + alpha*len(vocab_count.keys())) for k, v in rotten_class_words.items()}

y_pred = []

# Iterating through each of the test set reviews.
for review in tqdm(test_df["preprocessed_review"]):
    
    # Initializing the prob_fresh and prob_rotten with the class probabilities.
    prob_fresh = fresh_prob
    prob_rotten = rotten_prob
    
    # Iterating through each word in the review.
    for word in review:
        
        # If the word is in fresh class words...
        if word in fresh_class_words2.keys():
            prob_fresh += math.log(fresh_class_words2[word])
            
        # If the word is in rotten class words..    
        elif word in rotten_class_words2.keys():
            prob_rotten += math.log(rotten_class_words2[word])
            
    # If the probability of belonging to fresh class is greater than rotten class probability, assign it to fresh class.
    if prob_fresh > prob_rotten:
        y_pred.append("fresh")
        
    # Assign rotten class as target label.
    else:
        y_pred.append("rotten")
        

count_of_correct_predictions = 0

for actual_class, predicted_class in zip(test_df["Freshness"], y_pred):
    if actual_class==predicted_class:
        count_of_correct_predictions += 1

print("Accuracy of the Naive Bayes Model on the Test Set with Laplacian Smoothing is: ", count_of_correct_predictions/len(y_pred))

100%|██████████| 19200/19200 [00:00<00:00, 200183.54it/s]

Accuracy of the Naive Bayes Model on the Test Set with Laplacian Smoothing is:  0.4742708333333333





References:
1. https://towardsdatascience.com/laplace-smoothing-in-na%C3%AFve-bayes-algorithm-9c237a8bdece
2. https://www.kaggle.com/datasets/ulrikthygepedersen/rotten-tomatoes-reviews
3. https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
4. https://www.geeksforgeeks.org/python-stemming-words-with-nltk/