#  Rotten Tomatoes Reviews prediction : Naive-Bayes Classifier

This dataset is a compilation of movie reviews that were obtained from the well-known movie review website Rotten Tomatoes. The dataset consists of the reviews' text and a corresponding label that specifies whether the review was classified as "fresh" or "rotten", based on Rotten Tomatoes' proprietary review aggregation system. 

This dataset is a highly valuable resource for individuals interested in conducting sentiment analysis and natural language processing, including researchers, data analysts, and machine learning practitioners. It contains reviews from a diverse group of critics and publications, encompassing a wide range of movies across various genres and languages.

In [128]:
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravirajpurohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ravirajpurohit/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## -------------------------------------- Load the data --------------------------------------

In [130]:
def read_data(filename='./data/rt_reviews.csv'):
    """
    Load the dataset
    
    Parameters
    ----------
    filename - string
    
    Returns
    -------
    data - numpy array of floats
    labels - numpy array of integers
    """
    reviews = []
    labels = []
    with open(filename, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.split(',')
            label, review = line[0], ''.join(line[1:])

            labels.append(label)
            reviews.append(review)

    ## returning from 1st index, 1st line is just column names in the dataset
    return np.array(labels[1:]), np.array(reviews[1:])

In [131]:
labels, reviews = read_data()

In [132]:
labels.shape, reviews.shape

((480000,), (480000,))

## -------------------------------------- Split the data --------------------------------------

In [133]:
def train_test_val_split(labels, reviews):
    """
    DESCRIPTION
    """
    ## assume that labels and reviews are numpy arrays with the same length
    data_size = len(labels)

    ## shuffle the indices of the data
    shuffled_indices = np.random.RandomState(seed=21).permutation(data_size)

    ## split the indices into train, validation, and test sets
    train_indices = shuffled_indices[:int(0.7 * data_size)]
    val_indices = shuffled_indices[int(0.7 * data_size):int(0.8 * data_size)]
    test_indices = shuffled_indices[int(0.8 * data_size):]

    ## use the indices to extract the corresponding data
    train_data = reviews[train_indices]
    train_labels = labels[train_indices]

    val_data = reviews[val_indices]
    val_labels = labels[val_indices]

    test_data = reviews[test_indices]
    test_labels = labels[test_indices]

    return train_data, train_labels, test_data, test_labels, val_data, val_labels

In [134]:
train_data, train_labels, test_data, test_labels, val_data, val_labels = train_test_val_split(labels, reviews)

## -------------------------------------- Data Exploration --------------------------------------

In [135]:
def get_data_dist(labels):
    """
    DESCRIPTION
    """
    # assume that train_labels is a numpy array
    unique_labels, label_counts = np.unique(labels, return_counts=True)

    for label, count in zip(unique_labels, label_counts):
        print(f"- {label} : {round(100*count/len(labels),2)}%")
        
    return None

In [136]:
for name, labs in zip(['Train','Test','Validation'],[train_labels, test_labels, val_labels]):
    print(f'\nClass distribution in {name} dataset')
    get_data_dist(labs)


Class distribution in Train dataset
- fresh : 50.04%
- rotten : 49.96%

Class distribution in Test dataset
- fresh : 49.75%
- rotten : 50.25%

Class distribution in Validation dataset
- fresh : 50.2%
- rotten : 49.8%


## -------------------------------------- Data Preprocessing --------------------------------------

In [137]:
def clean_data(data):
    """
    
    """
    data = np.array([i.replace('"','').replace('\n','').strip() for i in data])
    return data

In [138]:
print(f'----------------------------- Training data before cleaning ----------------------------- \n{train_data}')

----------------------------- Training data before cleaning ----------------------------- 
['" Gloriously daft but with a good deal of heart Fanged Up\'s Hammer in the slammer shtick has a surprising amount of bite. It\'s great entertainment for a night in with good friends and a couple of crates of beer -- unless of course you only drink wine."\n'
 '" The Back-Up Plan represents a major comeback for Jennifer Lopez. Unfortunately she\'s come back to making crap. "\n'
 '" The acting can be so-so the story implausible the camerawork stolid -- none of that really matters if you care about what happens to the characters."\n'
 ...
 '" The film is indeed a bit pat. Sweet and funny - largely thanks to James Corden in the lead role - it\'s never particularly surprising."\n'
 ' More concerned with recruiting the testosterone troubled boys of today than it is rewarding fans of yesteryear.\n'
 '" It strives awfully hard for depth but more often than not comes off too shallow."\n']


In [139]:
train_data = clean_data(train_data)
test_data = clean_data(test_data)
val_data = clean_data(val_data)

In [140]:
print(f'----------------------------- Training data after cleaning ----------------------------- \n{train_data}')

----------------------------- Training data after cleaning ----------------------------- 
["Gloriously daft but with a good deal of heart Fanged Up's Hammer in the slammer shtick has a surprising amount of bite. It's great entertainment for a night in with good friends and a couple of crates of beer -- unless of course you only drink wine."
 "The Back-Up Plan represents a major comeback for Jennifer Lopez. Unfortunately she's come back to making crap."
 'The acting can be so-so the story implausible the camerawork stolid -- none of that really matters if you care about what happens to the characters.'
 ...
 "The film is indeed a bit pat. Sweet and funny - largely thanks to James Corden in the lead role - it's never particularly surprising."
 'More concerned with recruiting the testosterone troubled boys of today than it is rewarding fans of yesteryear.'
 'It strives awfully hard for depth but more often than not comes off too shallow.']


In [141]:
def preprocess(text):
    stop_words = stopwords.words('english')
    text = text.lower()
    text = word_tokenize(text)
    text = [word for word in text if word.isalpha()]
    text = [word for word in text if word not in stop_words]
    text = ' '.join(text)
    return text


In [147]:
train_data = np.array([preprocess(i) for i in train_data])
test_data = np.array([preprocess(i) for i in test_data])
# val_data = np.array([preprocess(i) for i in val_data])

## -------------------------------------- Feature Engineering --------------------------------------

In [None]:
def get_features(reviews):
    word_counts = []
    for review in reviews:
        words = review.split()
        word_counts.append(dict(nltk.FreqDist(words)))
    feature_matrix = np.zeros((len(reviews), len(vocab)))
    for i in range(len(reviews)):
        for j, word in enumerate(vocab):
            if word in word_counts[i]:
                feature_matrix[i, j] = word_counts[i][word]
    return feature_matrix

all_words = ' '.join(reviews).split()
freq_dist = nltk.FreqDist(all_words)
vocab = freq_dist.keys()

X_train = get_features(train_data)
X_test = get_features(test_data)


In [None]:
X_val = get_features(val_data)

## -------------------------------------- Model Training --------------------------------------

In [None]:
def train(X_train, y_train):
    n_samples, n_features = X_train.shape
    classes = np.unique(y_train)
    n_classes = len(classes)
    prior_probs = np.zeros(n_classes)
    conditional_probs = np.zeros((n_classes, n_features))
    for i, label in enumerate(classes):
        X_train_label = X_train[y_train == label]
        prior_probs[i] = X_train_label.shape[0] / n_samples
        conditional_probs[i, :] = (X_train_label.sum(axis=0) + 1) / (X_train_label.sum() + n_features)
    return classes, prior_probs, conditional_probs

classes, prior_probs, conditional_probs = train(X_train, train_labels)


## -------------------------------------- Performance Analysis --------------------------------------

In [None]:
def predict(X_test, classes, prior_probs, conditional_probs):
    n_samples, n_features = X_test.shape
    y_pred = np.zeros(n_samples)
    for i in range(n_samples):
        probs = np.zeros(len(classes))
        for j, label in enumerate(classes):
            probs[j] = np.log(prior_probs[j]) + (np.log(conditional_probs[j, :]) * X_test[i]).sum()
        y_pred[i] = classes[np.argmax(probs)]
    return y_pred

y_pred = predict(X_test, classes, prior_probs, conditional
