In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Read the review CSV (decoded as ISO-8859-1) through the single `filename`
# setting so the path is defined in exactly one place.
filename = "rt_reviews.csv"
df = pd.read_csv(filename, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [None]:
#df = df.sample(96000)

In [3]:
# Draw every row once in a seeded random order, then renumber the index from 0.
df_shuffled = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# Sizes of the train, dev, and test partitions:
#   80% for training
#   10% for dev
#   10% for testing (whatever remains after the int() truncation above)
# The sizes are taken from the frame that is actually sliced, so they cannot
# drift apart from it.
num_rows = len(df_shuffled)
train_size = int(0.8 * num_rows)
dev_size = int(0.1 * num_rows)
test_size = num_rows - train_size - dev_size

# Positional slices: three contiguous, non-overlapping partitions
train_df = df_shuffled.iloc[:train_size]
dev_df = df_shuffled.iloc[train_size : train_size + dev_size]
test_df = df_shuffled.iloc[train_size + dev_size :]

# Every row must land in exactly one partition
assert len(test_df) == test_size
assert len(train_df) + len(dev_df) + len(test_df) == num_rows



In [5]:
# Total occurrences of every lower-cased, whitespace-separated token in the
# training reviews.
word_counts = Counter(
    token
    for review in train_df['Review']
    for token in review.lower().split()
)

# Vocabulary = tokens seen at least `min_occurrences` times;
# reverse_index maps each kept word to its position in `vocabulary`.
min_occurrences = 5
vocabulary = [word for word in word_counts if word_counts[word] >= min_occurrences]
reverse_index = dict(zip(vocabulary, range(len(vocabulary))))


In [6]:
# Document frequencies: for each token, the number of training reviews that
# contain it at least once, overall and per sentiment label.
word_counts = Counter()
sentiment_word_counts = {'fresh': Counter(), 'rotten': Counter()}

for sentiment, review in zip(train_df['Freshness'], train_df['Review']):
    # A set, so a token repeated inside one review is counted once
    unique_tokens = set(review.lower().split())
    word_counts.update(unique_tokens)
    sentiment_word_counts[sentiment].update(unique_tokens)

# Number of training reviews, in total and per label
num_documents = len(train_df)
is_fresh = train_df['Freshness'] == 'fresh'
is_rotten = train_df['Freshness'] == 'rotten'
num_fresh_documents = sum(is_fresh)
num_rotten_documents = sum(is_rotten)

# Fraction of reviews containing "the", overall and within each class
the_in_fresh = sentiment_word_counts['fresh']['the']
the_in_rotten = sentiment_word_counts['rotten']['the']
p_the = word_counts['the'] / num_documents
p_the_given_fresh = the_in_fresh / num_fresh_documents
p_the_given_rotten = the_in_rotten / num_rotten_documents

print(f"P(the) = {p_the}")
print(f"P(the|fresh) = {p_the_given_fresh}")
print(f"P(the|rotten) = {p_the_given_rotten}")


P(the) = 0.6342083333333334
P(the|fresh) = 0.6352426992279883
P(the|rotten) = 0.6331743337117386


In [7]:
def calculate_conditional_probabilities(vocabulary, sentiment_word_counts, num_fresh_documents, num_rotten_documents, smoothing=1):
    """Estimate P(word | class) for every vocabulary word with additive smoothing.

    Each estimate is
    ``(count + smoothing) / (num_class_documents + len(vocabulary) * smoothing)``.

    Args:
        vocabulary: Sized iterable of words to produce estimates for.
        sentiment_word_counts: Mapping with ``'fresh'`` and ``'rotten'`` entries,
            each giving a per-word count when indexed by a word.
        num_fresh_documents: Denominator base for the ``'fresh'`` class.
        num_rotten_documents: Denominator base for the ``'rotten'`` class.
        smoothing: Pseudo-count added to every word count; 0 disables smoothing.

    Returns:
        Tuple ``(fresh_probs, rotten_probs)`` of dicts keyed by word.
    """
    # The smoothing mass added to the denominator is the same for both classes
    pseudo_total = len(vocabulary) * smoothing

    fresh_probs = {
        word: (sentiment_word_counts['fresh'][word] + smoothing) / (num_fresh_documents + pseudo_total)
        for word in vocabulary
    }
    rotten_probs = {
        word: (sentiment_word_counts['rotten'][word] + smoothing) / (num_rotten_documents + pseudo_total)
        for word in vocabulary
    }
    return fresh_probs, rotten_probs

def naive_bayes_classifier(review, vocabulary, fresh_probs, rotten_probs):
    """Label a review 'fresh' or 'rotten' by comparing per-class log-likelihoods.

    The review is lower-cased, split on whitespace and de-duplicated. Every
    token that has an entry in both probability tables contributes
    ``log P(token | class)`` to that class's score; other tokens are ignored.
    Scores are summed in log space because multiplying many small
    probabilities underflows to 0.0 on long reviews, which would turn the
    comparison into a meaningless tie.

    Args:
        review: Review text.
        vocabulary: Known words. Kept for interface compatibility; membership
            is tested against the probability tables (constant-time dict
            lookups) rather than by scanning this collection. For tables
            produced by ``calculate_conditional_probabilities`` the table keys
            are exactly the vocabulary.
        fresh_probs: Mapping word -> P(word | fresh).
        rotten_probs: Mapping word -> P(word | rotten).

    Returns:
        ``'fresh'`` when the fresh score is strictly greater, otherwise
        ``'rotten'`` (ties, including a review with no known tokens, go to
        ``'rotten'``).
    """
    tokens = [
        token
        for token in set(review.lower().split())
        if token in fresh_probs and token in rotten_probs
    ]

    def log_likelihood(probs):
        # A zero probability (possible when smoothing is 0) rules the class
        # out entirely: the log-space counterpart of a product equal to 0.
        total = 0.0
        for token in tokens:
            p = probs[token]
            if p <= 0:
                return float('-inf')
            total += np.log(p)
        return total

    return 'fresh' if log_likelihood(fresh_probs) > log_likelihood(rotten_probs) else 'rotten'



In [8]:
# Conditional probability tables without smoothing (smoothing=0)
fresh_probs, rotten_probs = calculate_conditional_probabilities(
    vocabulary,
    sentiment_word_counts,
    num_fresh_documents,
    num_rotten_documents,
    smoothing=0,
)

# Classify each review in the development set and count predictions that
# match the recorded label
correct_predictions = sum(
    naive_bayes_classifier(review, vocabulary, fresh_probs, rotten_probs) == sentiment
    for sentiment, review in zip(dev_df['Freshness'], dev_df['Review'])
)

accuracy = correct_predictions / len(dev_df)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8030208333333333


In [9]:


# Two sets of probability tables: Laplace smoothing (smoothing=1) and none (smoothing=0)
fresh_probs_smoothed, rotten_probs_smoothed = calculate_conditional_probabilities(
    vocabulary, sentiment_word_counts, num_fresh_documents, num_rotten_documents,
    smoothing=1,
)
fresh_probs_unsmoothed, rotten_probs_unsmoothed = calculate_conditional_probabilities(
    vocabulary, sentiment_word_counts, num_fresh_documents, num_rotten_documents,
    smoothing=0,
)

# Score both variants in a single pass over the development set
correct_predictions_smoothed = 0
correct_predictions_unsmoothed = 0

for sentiment, review in zip(dev_df['Freshness'], dev_df['Review']):
    # A matching prediction adds 1 (True), a mismatch adds 0 (False)
    correct_predictions_smoothed += (
        naive_bayes_classifier(review, vocabulary, fresh_probs_smoothed, rotten_probs_smoothed) == sentiment
    )
    correct_predictions_unsmoothed += (
        naive_bayes_classifier(review, vocabulary, fresh_probs_unsmoothed, rotten_probs_unsmoothed) == sentiment
    )

num_dev_reviews = len(dev_df)
accuracy_smoothed = correct_predictions_smoothed / num_dev_reviews
accuracy_unsmoothed = correct_predictions_unsmoothed / num_dev_reviews

print(f"Accuracy with Laplace smoothing: {accuracy_smoothed}")
print(f"Accuracy without smoothing: {accuracy_unsmoothed}")


Accuracy with Laplace smoothing: 0.8052291666666667
Accuracy without smoothing: 0.8030208333333333


In [10]:
def grid_search_alpha(vocabulary, sentiment_word_counts, num_fresh_documents, num_rotten_documents, dev_df, alphas):
    """Pick the smoothing value that gives the highest accuracy on ``dev_df``.

    For every candidate in ``alphas`` the conditional probability tables are
    rebuilt with that smoothing value, each review in ``dev_df`` is classified,
    and the share of predictions equal to the ``'Freshness'`` label is measured.

    Args:
        vocabulary: Words passed through to the probability and classifier helpers.
        sentiment_word_counts: Per-class word counts used to build the tables.
        num_fresh_documents: Number of 'fresh' training documents.
        num_rotten_documents: Number of 'rotten' training documents.
        dev_df: Data with ``'Freshness'`` and ``'Review'`` columns.
        alphas: Iterable of smoothing values to try, in order.

    Returns:
        Tuple ``(best_alpha, best_accuracy)``. A candidate replaces the current
        best only when strictly more accurate, so the earliest of equally good
        candidates wins; ``best_alpha`` stays ``None`` if no candidate scores
        above 0.
    """
    best_alpha, best_accuracy = None, 0

    for alpha in alphas:
        # Probability tables for this candidate smoothing value
        fresh_table, rotten_table = calculate_conditional_probabilities(
            vocabulary, sentiment_word_counts, num_fresh_documents, num_rotten_documents,
            smoothing=alpha,
        )

        # Number of dev reviews whose predicted label equals the true label
        hits = sum(
            naive_bayes_classifier(review, vocabulary, fresh_table, rotten_table) == sentiment
            for sentiment, review in zip(dev_df['Freshness'], dev_df['Review'])
        )
        accuracy = hits / len(dev_df)

        # Keep the candidate only if it strictly improves on the best so far
        if accuracy > best_accuracy:
            best_alpha, best_accuracy = alpha, accuracy

    return best_alpha, best_accuracy

# Candidate smoothing values to compare (0 means no smoothing)
alphas = [0, 0.1, 0.5, 1, 2, 5, 10]

# Evaluate every candidate on the development set and keep the most accurate
best_alpha, best_accuracy = grid_search_alpha(
    vocabulary=vocabulary,
    sentiment_word_counts=sentiment_word_counts,
    num_fresh_documents=num_fresh_documents,
    num_rotten_documents=num_rotten_documents,
    dev_df=dev_df,
    alphas=alphas,
)

print(f"Best alpha value: {best_alpha}")
print(f"Best accuracy: {best_accuracy}")


Best alpha value: 0.5
Best accuracy: 0.8056666666666666


In [11]:
def calculate_class_given_word_probabilities(vocabulary, fresh_probs, rotten_probs, num_fresh_documents, num_rotten_documents):
    """Convert P(word | class) into P(class | word) for every vocabulary word.

    Each conditional probability is weighted by its class's document count and
    the two weighted values are normalised by their sum, so for every word the
    fresh and rotten results add up to 1.

    Args:
        vocabulary: Iterable of words to convert.
        fresh_probs: Mapping word -> P(word | fresh).
        rotten_probs: Mapping word -> P(word | rotten).
        num_fresh_documents: Weight applied to the fresh probabilities.
        num_rotten_documents: Weight applied to the rotten probabilities.

    Returns:
        Tuple ``(fresh_given_word_probs, rotten_given_word_probs)`` of dicts
        keyed by word.
    """
    fresh_given_word_probs, rotten_given_word_probs = {}, {}

    for word in vocabulary:
        # Class-weighted likelihoods (numerators of the two posteriors)
        fresh_mass = fresh_probs[word] * num_fresh_documents
        rotten_mass = rotten_probs[word] * num_rotten_documents

        # Shared normaliser
        evidence = fresh_mass + rotten_mass

        fresh_given_word_probs[word] = fresh_mass / evidence
        rotten_given_word_probs[word] = rotten_mass / evidence

    return fresh_given_word_probs, rotten_given_word_probs

# P[class | word] for each vocabulary word, derived from the smoothed tables
fresh_given_word_probs, rotten_given_word_probs = calculate_class_given_word_probabilities(
    vocabulary,
    fresh_probs_smoothed,
    rotten_probs_smoothed,
    num_fresh_documents,
    num_rotten_documents,
)

# Rank words from highest to lowest P[class | word] and keep the first ten per class
fresh_ranking = sorted(fresh_given_word_probs.items(), key=lambda item: item[1], reverse=True)
rotten_ranking = sorted(rotten_given_word_probs.items(), key=lambda item: item[1], reverse=True)
top_fresh_words = [word for word, _ in fresh_ranking[:10]]
top_rotten_words = [word for word, _ in rotten_ranking[:10]]

print("Top 10 words that predict 'fresh':")
print(top_fresh_words)

print("\nTop 10 words that predict 'rotten':")
print(top_rotten_words)

Top 10 words that predict 'fresh':
['heartbreakingly', 'razor-sharp', 'beautifully.', 'flawless,', 'gem.', 'skilful', 'joyous,', 'cannily', 'rewarded.', 'petzold']

Top 10 words that predict 'rotten':
['charmless', 'unfunny', 'limp,', 'mirthless', 'lifeless.', 'flavorless', 'tediously', 'charmless,', 'yawn.', 'uninteresting,']


In [12]:
# Rebuild the probability tables with the smoothing value selected by the grid search
fresh_probs_best_alpha, rotten_probs_best_alpha = calculate_conditional_probabilities(
    vocabulary,
    sentiment_word_counts,
    num_fresh_documents,
    num_rotten_documents,
    smoothing=best_alpha,
)

# Count the test reviews whose predicted label matches the recorded one
correct_predictions_test = sum(
    naive_bayes_classifier(review, vocabulary, fresh_probs_best_alpha, rotten_probs_best_alpha) == sentiment
    for sentiment, review in zip(test_df['Freshness'], test_df['Review'])
)

accuracy_test = correct_predictions_test / len(test_df)
print(f"Final accuracy: {accuracy_test}")


Final accuracy: 0.8032708333333334
