# CS 525 Assignment 2
Sirut Buasai, sbuasai2@wpi.edu

### Imports and Downloads

In [1]:
import pandas as pd
import nltk
import numpy as np
import gensim
import gensim.downloader as gensim_api
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Fetch the NLTK resources this notebook relies on (tokenizer model,
# stop-word list, and the WordNet data used for lemmatization).
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Load the pretrained Google News word2vec vectors through the gensim downloader.
embeddings = gensim_api.load('word2vec-google-news-300')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Data Retrieval and Processing

#### Create Labels Based on Rating Score

In [2]:
# Read the review dataset from the CSV file.
raw_data = pd.read_csv('Reviews.csv')

# Derive a binary label from the rating: Score >= 3 -> 1, Score < 3 -> 0.
is_high_score = raw_data['Score'] >= 3
raw_data['labels'] = is_high_score.astype(int)

# Group the rows by label so each class can be sampled separately.
groups = raw_data.groupby('labels')

#### Sample Balanced Data

In [3]:
# Down-sample every label group to the size of the smallest group so both
# classes end up with the same number of rows.
# The target size is computed once instead of once per group inside the lambda.
min_group_size = groups.size().min()

# A fixed seed makes the same rows get drawn on every run, so the notebook
# is reproducible under Restart & Run All.
sampled_data = groups.apply(
    lambda group: group.sample(n=min_group_size, random_state=1).reset_index(drop=True)
)
sampled_data['labels'].value_counts()

0    82037
1    82037
Name: labels, dtype: int64

#### Remove Punctuation, Tokenize, Remove Stop Words, and Lemmatize Text

In [4]:
# Replace every run of non-alphanumeric characters with a single space, then
# lower-case the review text.
# NOTE: the regex replace is applied to the whole frame, so every string
# column is affected, not only 'Text'.
sampled_data = sampled_data.replace(r'[^A-Za-z0-9]+', ' ', regex=True)
sampled_data['Text'] = sampled_data['Text'].str.lower()

# tokenize text
sampled_data['tokenized_text'] = sampled_data['Text'].apply(nltk.tokenize.word_tokenize)

# remove stop words
# A set gives constant-time membership checks; testing each token against the
# list would scan the whole stop-word list for every token of every review.
stop_words = nltk.corpus.stopwords.words('english')
stop_word_set = set(stop_words)
sampled_data['stop_removed_text'] = sampled_data['tokenized_text'].apply(
    lambda sentence: [word for word in sentence if word not in stop_word_set]
)

# lemmatize tokens
lemmatizer = nltk.stem.WordNetLemmatizer()
sampled_data['lemmatized_text'] = sampled_data['stop_removed_text'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence]
)

# join the lemmatized tokens back into one space-separated string per review
sampled_data['cleaned_text'] = sampled_data['lemmatized_text'].apply(' '.join)
sampled_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,labels,tokenized_text,stop_removed_text,lemmatized_text,cleaned_text
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,100942,B0016ZU83M,A3E3EQ39FZTZEK,sh sh,0,0,2,1347667200,Much less flaovr nearly same calories,these are for people that cannot tolerate suga...,0,"[these, are, for, people, that, can, not, tole...","[people, tolerate, sugar, low, cal, slightly, ...","[people, tolerate, sugar, low, cal, slightly, ...",people tolerate sugar low cal slightly lower c...
0,1,480042,B003IHO8OG,A11JJPVSHVJM0S,Nicole Taylor,1,7,1,1287705600,Awful,i tried this and it was just awful it tasted l...,0,"[i, tried, this, and, it, was, just, awful, it...","[tried, awful, tasted, like, fizzy, cough, syr...","[tried, awful, tasted, like, fizzy, cough, syr...",tried awful tasted like fizzy cough syrup even...
0,2,49516,B001E5E268,A3LOIAZYY3U9V9,Adam Z,10,11,2,1172707200,Go for Pinhead Gunpowder instead,having purchased a package of this tea and one...,0,"[having, purchased, a, package, of, this, tea,...","[purchased, package, tea, one, stash, pinhead,...","[purchased, package, tea, one, stash, pinhead,...",purchased package tea one stash pinhead gunpow...
0,3,546407,B0014GHZ3O,A3UHFQT4E3R2D3,a consumer,2,2,1,1305936000,Poor excuse for an Easter gift,hopefully by the time you consider this produc...,0,"[hopefully, by, the, time, you, consider, this...","[hopefully, time, consider, product, vendor, c...","[hopefully, time, consider, product, vendor, c...",hopefully time consider product vendor changed...
0,4,357948,B0015J7BG6,AFX0Z8Q4XXW0F,Michael,0,0,1,1297123200,Big disappointment,these marshmallows are no larger than standard...,0,"[these, marshmallows, are, no, larger, than, s...","[marshmallows, larger, standard, jet, puffed, ...","[marshmallow, larger, standard, jet, puffed, c...",marshmallow larger standard jet puffed campfir...


### TF-IDF Feature Set

In [5]:
# create tf-idf feature set
tfidf_vect = TfidfVectorizer()

# Split data into training and testing sets with a 70-30 ratio.
# random_state is fixed (same seed as the Word2Vec split) so the split, and
# therefore the reported metrics, do not change from run to run.
train_x, test_x, train_y, test_y = train_test_split(
    sampled_data['cleaned_text'],
    sampled_data['labels'],
    test_size=0.3,
    random_state=1,
)

# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text.
tfidf_train_x = tfidf_vect.fit_transform(train_x)
tfidf_test_x = tfidf_vect.transform(test_x)

In [6]:
# Show the dimensions of the training TF-IDF matrix
# (presumably rows = training documents, columns = vocabulary terms).
tfidf_train_x.shape

(114851, 54734)

#### Logistic Regression on TF-IDF Feature Set

In [7]:
# Logistic regression (liblinear solver) fitted on the TF-IDF training features.
tfidf_log = LogisticRegression(solver='liblinear').fit(tfidf_train_x, train_y)

# Predict labels for the TF-IDF test features.
prediction_y = tfidf_log.predict(tfidf_test_x)

# Report each evaluation metric against the true test labels.
for metric_label, metric_fn in (
    ("Precision score:\t", precision_score),
    ("Recall score:\t\t", recall_score),
    ("Accuracy score:\t\t", accuracy_score),
    ("F1 Score:\t\t", f1_score),
):
    print(f"{metric_label}{metric_fn(test_y, prediction_y)}")

Precision score:	0.8814983443708609
Recall score:		0.8672123137063279
Accuracy score:		0.8755866160128395
F1 Score:		0.8742969744242375


#### Random Forest Classifier on TF-IDF Feature Set

In [8]:
# Random forest classifier on the TF-IDF feature set.
# random_state is fixed so the randomized tree construction, and therefore
# the reported metrics, are reproducible across runs.
tfidf_rfc = RandomForestClassifier(random_state=1)

# train model on training set
tfidf_rfc.fit(tfidf_train_x, train_y)

# test model on testing set
prediction_y = tfidf_rfc.predict(tfidf_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8814304144492824
Recall score:		0.890259793142764
Accuracy score:		0.8855006805761534
F1 Score:		0.8858231027916211


#### Support Vector Machine on TF-IDF Feature Set

In [9]:
# Linear support vector machine on the TF-IDF feature set.
# random_state pins the solver's pseudo-random data shuffling (per the
# scikit-learn docs this matters when the dual formulation is used), so
# repeated runs give the same model.
tfidf_svm = LinearSVC(random_state=1)

# train model on training set
tfidf_svm.fit(tfidf_train_x, train_y)

# test model on testing set
prediction_y = tfidf_svm.predict(tfidf_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8847169499917478
Recall score:		0.8731167033146021
Accuracy score:		0.8799341771123256
F1 Score:		0.8788785506414722


### Word2Vec Feature Set

In [10]:
# Look up the pretrained vector of every lemma found in the embedding
# vocabulary; lemmas that are not in the vocabulary are skipped.
sampled_data['embedding_text'] = sampled_data['lemmatized_text'].apply(
    lambda sentence: [embeddings[word] for word in sentence if word in embeddings]
)


def _mean_embedding(word_vectors):
    """Average a review's word vectors into one fixed-length vector.

    Args:
        word_vectors: list of embedding vectors for one review (may be empty).

    Returns:
        The element-wise mean of the vectors, or an all-zero vector of the
        embedding size when the list is empty. Without this fallback
        np.mean([]) yields a scalar NaN, which makes the later np.vstack fail
        because the rows no longer share a common length.
    """
    if len(word_vectors) == 0:
        return np.zeros(embeddings.vector_size, dtype=embeddings.vectors.dtype)
    return np.mean(word_vectors, axis=0)


# One averaged vector per review, stacked into a (reviews x dimensions) frame.
sampled_data['word2vec_text'] = sampled_data['embedding_text'].apply(_mean_embedding)
word2vec_data = pd.DataFrame(np.vstack(sampled_data['word2vec_text'].values))

# split data into training and testing set with 70-30 split ratio
word2vec_train_x, word2vec_test_x, train_y, test_y = train_test_split(
    word2vec_data, sampled_data['labels'], test_size=0.3, random_state=1
)

#### Logistic Regression on Word2Vec Feature Set

In [11]:
# Logistic regression (liblinear solver) on the Word2Vec feature set.
word2vec_log = LogisticRegression(solver='liblinear')
word2vec_log.fit(word2vec_train_x, train_y)

# Predict labels for the Word2Vec test features.
prediction_y = word2vec_log.predict(word2vec_test_x)

# Compute the evaluation metrics first, then print them in a fixed order.
precision = precision_score(test_y, prediction_y)
recall = recall_score(test_y, prediction_y)
accuracy = accuracy_score(test_y, prediction_y)
f1 = f1_score(test_y, prediction_y)

print(f"Precision score:\t{precision}")
print(f"Recall score:\t\t{recall}")
print(f"Accuracy score:\t\t{accuracy}")
print(f"F1 Score:\t\t{f1}")

Precision score:	0.8233045212765957
Recall score:		0.8071376191640186
Accuracy score:		0.8174430652337322
F1 Score:		0.8151409175066859


#### Random Forest Classifier on Word2Vec Feature Set

In [12]:
# Random forest classifier on the Word2Vec feature set.
# random_state is fixed so the randomized tree construction, and therefore
# the reported metrics, are reproducible across runs.
word2vec_rfc = RandomForestClassifier(random_state=1)

# train model on training set
word2vec_rfc.fit(word2vec_train_x, train_y)

# test model on testing set
prediction_y = word2vec_rfc.predict(word2vec_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8352299856756327
Recall score:		0.8551698851136641
Accuracy score:		0.8436503260670825
F1 Score:		0.84508233020653


#### Support Vector Machine on Word2Vec Feature Set

In [13]:
# Linear support vector machine on the Word2Vec feature set.
# random_state pins the solver's pseudo-random data shuffling (per the
# scikit-learn docs this matters when the dual formulation is used), so
# repeated runs give the same model.
word2vec_svm = LinearSVC(random_state=1)

# train model on training set
word2vec_svm.fit(word2vec_train_x, train_y)

# test model on testing set
prediction_y = word2vec_svm.predict(word2vec_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8241203935300984
Recall score:		0.8053858062413428
Accuracy score:		0.8172399081730085
F1 Score:		0.8146454032224832
