In [67]:
import math
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split

In [68]:
with open('./rt_reviews.csv', 'r', encoding='ISO-8859-1') as file:
    reader = csv.reader(file, delimiter=',')
    headers = next(reader)  # read the header row
    data = [dict(zip(headers, row)) for row in reader]

In [69]:
#printing the first row of data
print("Freshness: ",data[0]["Freshness"])
print("Review: ",data[0]["Review"])

Freshness:  fresh
Review:   Manakamana doesn't answer any questions, yet makes its point: Nepal, like the rest of our planet, is a picturesque but far from peaceable kingdom.


In [70]:
#Splitting into train and test data
data_size = len(data)
train_size = math.ceil(0.8 * data_size)
dev_size = math.ceil(0.1 * data_size)
test_size = math.ceil(0.1 * data_size)

train_data = data[:train_size]
dev_data = data[train_size:train_size+dev_size]
test_data = data[train_size+dev_size:]

print("The size of the train data is:", len(train_data))
print("The size of the development data is:", len(dev_data))
print("The size of the test data is:", len(test_data))

print("Sample for Train: ", train_data[0])
print("Sample for Development: ", dev_data[0])
print("Sample for Test: ", test_data[0])


The size of the train data is: 384000
The size of the development data is: 48000
The size of the test data is: 48000
Sample for Train:  {'Freshness': 'fresh', 'Review': " Manakamana doesn't answer any questions, yet makes its point: Nepal, like the rest of our planet, is a picturesque but far from peaceable kingdom."}
Sample for Development:  {'Freshness': 'rotten', 'Review': ' A film that abuses its ridiculous premise and never gives necessary time to delve into the motivations of its human characters who are overshadowed by the monster. [Full review in Spanish]'}
Sample for Test:  {'Freshness': 'fresh', 'Review': " It's the directorial debut of Clea Duvall and it's very much an actors' showcase: eight characters with heavy baggage doing their best to ignore the weight."}


In [71]:
#Create a function to preprocess the text data
import re

def preprocess_text(text):
# Convert text to lowercase
    text = text.lower()
# Remove non-alphabetic characters
    text = re.sub('[^A-Za-z]+', ' ', text)
# Split text into words
    words = text.split()
# Remove stop words
    stopwords = ['the', 'a', 'an', 'and', 'or', 'in', 'of', 'to', 'for', 'with', 'on', 'at', 'that', 'this']
    words = [w for w in words if w not in stopwords]
    return words

#Create the vocabulary and word count dictionary
vocabulary = []
v_count = {}

for row in train_data:
# Preprocess the text
    words = preprocess_text(row["Review"])
for word in words:
# If the word is not already in the dictionary, add it with a count of 1
    if word not in v_count:
        vocabulary.append(word)
        v_count[word] = 1
# If the word is already in the dictionary, increment its count
    else:
        v_count[word] += 1

#Remove rare occurring words
rare_count = 5
filtered_vocabulary = []
filtered_v_count = {}
for vocab in v_count:
    if v_count[vocab] >= rare_count and vocab != "":
        filtered_vocabulary.append(vocab)
        filtered_v_count[vocab] = v_count[vocab]

#Display one Sample:
for word in vocabulary:
    print("Sample of Vocabulary list before filtering: ", word)
    print("Sample of Word count before filtering: ", word, ":", v_count[word])
    break
for word in filtered_vocabulary:
    print("\nSample of Vocabulary list after filtering: ", word)
    print("Sample of Word count after filtering: ", word, ":", filtered_v_count[word])
    break



Sample of Vocabulary list before filtering:  those
Sample of Word count before filtering:  those : 1


In [72]:
#Positive Documents
Positive_reviews = []

for row in train_data:
    if row["Freshness"] == "fresh":
        Positive_reviews.append(row["Review"].lower())

#Display number of positive documents:
print("Number of Positive documents: ", len(Positive_reviews))


Number of Positive documents:  192306


In [73]:
def count_word_in_reviews(word, reviews):
    """
    Count the number of times a given word appears in a list of reviews.

    Args:
    word (str): The word to count.
    reviews (list): A list of reviews.

    Returns:
    int: The number of times the word appears in the reviews.
    """
    count = 0
    for review in reviews:
        words = review.lower().split()
        if word.lower() in words:
            count += 1
    return count

Positive_reviews = [row['Review'] for row in train_data if row['Freshness'] == 'fresh']
word = 'Excellent'
count = count_word_in_reviews(word, Positive_reviews)
print(f'Number of "{word}" in positive reviews: {count}')


Number of "Excellent" in positive reviews: 834


In [74]:
Negetive_reviews = []

for row in train_data:
    if row["Freshness"]=="rotten":
        Negetive_reviews.append([row["Review"].lower()])

#Display number of negative documents:
print("Number of Negative documents: ", len(Negetive_reviews))

Number of Negative documents:  191694


In [75]:
def count_word_occurrences(word, reviews):
    word = word.lower()
    word_count = {}
    word_count[word] = 0
    for row in reviews:
        review = list(map(str, str(row).split(" ")))
        if word in review:
            word_count[word] += 1
            return word_count[word]
print('Number of "Excellent" in Negative documents: ', count_word_occurrences("Excellent", Negetive_reviews))


Number of "Excellent" in Negative documents:  1


In [76]:
# Calculating the probability and conditional probablity of the words occuring
def count_word_in_reviews(word, reviews):
    """
    Count the number of times a given word appears in a list of reviews.

    Args:
    word (str): The word to count.
    reviews (list): A list of reviews.

    Returns:
    int: The number of times the word appears in the reviews.
    """
    count = 0
    for review in reviews:
        words = review.lower().split()
        if word.lower() in words:
            count += 1
    return count


Positive_reviews = [row['Review'] for row in train_data if row['Freshness'] == 'fresh']
word = 'Excellent'
count = count_word_in_reviews(word, Positive_reviews)
print(f'Number of "{word}" in positive reviews: {count}')


def P(word):
    word = word.lower()
    count = 0
    doc_count = len(train_data)
    for review in train_data:
        words = review['Review'].lower().split()
        if word in words:
            count += 1

    P_of_word = f'{count/doc_count:.10f}'
    return P_of_word


def P_CP(word):
    P_CP_of_vocabulary = {}
    a = count_word_in_reviews(word, Positive_reviews) / len(Positive_reviews)
    P_CP_of_vocabulary[word] = f'{a:.10f}'
    return P_CP_of_vocabulary[word]


def N_CP(word):
    N_CP_of_vocabulary = {}
    a = count_word_in_reviews(word, Negative_reviews) / len(Negative_reviews)
    N_CP_of_vocabulary[word] = f'{a:.10f}'
    return N_CP_of_vocabulary[word]


Negative_reviews = [row['Review'] for row in train_data if row['Freshness'] == 'rotten']

# OUTPUT
print("The Probability of 'the' occurring in dataset: ", P("the"))
print("Conditional Probability P(the|Positive):", P_CP("the"))
print("Conditional Probability P(the|Negative):", N_CP("the"))




Number of "Excellent" in positive reviews: 834
The Probability of 'the' occurring in dataset:  0.6345859375
Conditional Probability P(the|Positive): 0.6357939950
Conditional Probability P(the|Negative): 0.6333740232


In [77]:
#calculate using development data
def predict_class(review):
    classes = {}
    words = set(map(str, str(review).lower().split(" ")))
    words = [word for word in words if word in filtered_vocabulary and len(word)>3]

    pp = 1
    np = 1

    for word in words:
        pp *= float(P_CP(word, Positive_reviews))
        np *= float(N_CP(word, Negative_reviews))

    if np < pp:
        return "fresh"
    else:
        return "rotten"


In [78]:
import pandas as pd
def accuracy(kstart, kend):
    equal=0
    unequal = 0
    iterations = 1
    accuracy = []
    loss = []
    df = pd.DataFrame(development)
    for row in df.values:
        print("Iteration",iterations-kstart,"of", k)
        iterations +=1
        if iterations >= kstart:

            if predict_class(row[1]) == row[0]:
                equal+=1
                accuracy.append((equal/iterations)*100)
            else:
                unequal +=1
                loss.append((unequal/iterations)*100)
            if iterations==k:
                break
            print("Classified",iterations," and current accuracy is:",accuracy[iterations-1])
    
        if len(accuracy)==0:
            accuracy.append(0)
        if len(loss)==0:
            loss.append(0)

    return [accuracy,loss]

In [79]:
print("The accuracy of k=10 reviews in dev set:", accuracy(10,20))

NameError: name 'development' is not defined