In [29]:
import pandas as pd

In [30]:
df = pd.read_csv("DS_group_project.csv")

In [31]:
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import defaultdict

def clean_text(text):
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove emojis (if applicable)
    # Add the code to remove emojis here
    # Convert to lowercase
    text = text.lower()
    return text


def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df["REVIEW"] = df["REVIEW"].apply(lambda x: remove_emojis(x))



df["REVIEW"] = df["REVIEW"].apply(lambda x: clean_text(x))
df["REVIEW"] = df["REVIEW"].apply(lambda x: word_tokenize(x))

stop_words = set(stopwords.words("english"))

def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]

df["REVIEW"] = df["REVIEW"].apply(lambda x: remove_stop_words(x))

def build_vocabulary(reviews):
    vocabulary = defaultdict(int)
    for review in reviews:
        for word in review:
            vocabulary[word] += 1
    return vocabulary

vocabulary = build_vocabulary(df["REVIEW"])

def review_to_bow(review, vocabulary):
    bow = defaultdict(int)
    for word in review:
        if word in vocabulary:
            bow[word] += 1
    return bow

df["REVIEW"] = df["REVIEW"].apply(lambda x: review_to_bow(x, vocabulary))

import numpy as np

# replace NaN values with 0
df.fillna(value=0, inplace=True)

# replace infinite values with a large finite value
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(value=1e10, inplace=True)

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# convert bag of words to a numerical representation using DictVectorizer
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(df["REVIEW"])
y = df["RATING"]

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# train the logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# train a naive bayes model on the training data
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train , y_train)

# evaluate the model on the test set
accuracy = clf.score(X_test, y_test)
naivebayesaccuracy = naive_bayes.score(X_test, y_test)
print("Logistic Regression Accuracy: ", accuracy,"\nNaive Bayes Accuracy: ",naivebayesaccuracy)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\benmv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benmv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Logistic Regression Accuracy:  0.30458715596330277 
Naive Bayes Accuracy:  0.3275229357798165


import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# load the data into a pandas data frame
df = pd.read_csv("DS_group_project.csv")

# replace NaN values with empty strings
df["REVIEW"].fillna("", inplace=True)

# remove emoji characters from the reviews
df["REVIEW"] = df["REVIEW"].str.encode('ascii', 'ignore').str.decode('ascii')

# convert the reviews to lowercase
df["REVIEW"] = df["REVIEW"].str.lower()

# tokenize the reviews and convert them into a bag of words representation
vectorizer = CountVectorizer()
reviews_bow = vectorizer.fit_transform(df["REVIEW"])

df.fillna(value=0, inplace=True)

# replace infinite values with a large finite value
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(value=1e10, inplace=True)

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews_bow, df["RATING"], test_size=0.2, random_state=0)

# train a logistic regression model on the training data
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

 # train a naive bayes model on the training data
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train , y_train)

# evaluate the model on the test data
logregaccuracy = clf.score(X_test, y_test)
naivebayesaccuracy = naive_bayes.score(X_test, y_test)

print("Logistic Regression Accuracy: ", logregaccuracy, "\nNaive Bayes Accuracy: ",naivebayesaccuracy)


In [33]:
import re
import string

def preprocess_review(review):
    # Remove punctuation and convert to lowercase
    review = review.translate(str.maketrans('', '', string.punctuation))
    review = review.lower()
    # Remove emojis
    review = re.sub(r'[^\x00-\x7F]+', '', review)
    return review

df['REVIEW'] = df['REVIEW'].apply(preprocess_review)

def tokenize_review(review):
    # Split the review into tokens
    tokens = review.split()
    return tokens

df['TOKENS'] = df['REVIEW'].apply(tokenize_review)

def create_vocabulary(tokens_list):
    # Create a set of all unique tokens
    vocabulary = set()
    for tokens in tokens_list:
        vocabulary |= set(tokens)
    return vocabulary

vocabulary = create_vocabulary(df['TOKENS'].tolist())

from sklearn.feature_extraction.text import CountVectorizer

def review_to_bow(review, vocabulary):
    # Create a count vectorizer using the vocabulary
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    bow = vectorizer.fit_transform([review]).toarray()[0]
    return bow

df['BOW'] = df['REVIEW'].apply(lambda x: review_to_bow(x, vocabulary))

In [34]:
df.head(60)

Unnamed: 0,ID,REVIEW,RATING,AUTHOR,TITLE,TOKENS,BOW
0,1,i dont get all the terrible reviews for this m...,5.0,margarida-44311,Not Bad\n,"[i, dont, get, all, the, terrible, reviews, fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,i cannot believe anyone could give this film l...,8.0,joemay-2,What are all the bad reviews about is it a wo...,"[i, cannot, believe, anyone, could, give, this...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,great white is not the worst way to spend 90 m...,4.0,nebk,Great White=Jaws Lite\n,"[great, white, is, not, the, worst, way, to, s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,great white is as basic of a killer shark film...,4.0,kuarinofu,Bare-bones killer shark film\n,"[great, white, is, as, basic, of, a, killer, s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,terrible story dialogue and cgi the film has a...,4.0,Horror_Flick_Fanatic,"Terrible story, dialogue, and CGI\n","[terrible, story, dialogue, and, cgi, the, fil...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,6,whilst the shark survival sub genre has plenty...,6.0,NickyDee07938,A decent effort\n,"[whilst, the, shark, survival, sub, genre, has...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,7,much better than the ratings suggest its on pa...,9.0,Novelwolf,Nice Shark movie!\n,"[much, better, than, the, ratings, suggest, it...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,8,first of all i love the film locations drone f...,5.0,mbnn,"Nice, but could be so much better\n","[first, of, all, i, love, the, film, locations...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,9,the film is meh when it comes to these types o...,4.0,phobicsq,Typical movie for the genre\n,"[the, film, is, meh, when, it, comes, to, thes...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,10,thought it was a great shark movie special ef...,7.0,rotini-52586,Liked it !\n,"[thought, it, was, a, great, shark, movie, spe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
