# CS 525 Assignment 2
Sirut Buasai, sbuasai2@wpi.edu

### Imports and Downloads

In [110]:
import gensim
import gensim.downloader as gensim_api
import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.utils import resample
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import pipeline
from transformers import TrainingArguments, Trainer

# NLTK downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
embeddings = gensim_api.load('word2vec-google-news-300')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sirutbuasai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Data Retrieval and Processing
#### Remove Unused Columns and Create Labels Based On Rating score

In [111]:
# load data from csv file
raw_data = pd.read_csv('Reviews.csv')

# drop unused columns
raw_data.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1, inplace=True)

# create labels based on score (label 1 when score >= 3, label 0 when score < 3)
raw_data['labels'] = np.where(raw_data.Score >= 3, 1, 0)
raw_data['labels'].value_counts()

1    486417
0     82037
Name: labels, dtype: int64

In [112]:
raw_data['lmao'] = raw_data['Text'].apply(lambda row: row.lower())
raw_data

Unnamed: 0,Score,Text,labels,lmao
0,5,I have bought several of the Vitality canned d...,1,i have bought several of the vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled as jumbo salted peanut...
2,4,This is a confection that has been around a fe...,1,this is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...,0,if you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...,1,great taffy at a great price. there was a wid...
...,...,...,...,...
568449,5,Great for sesame chicken..this is a good if no...,1,great for sesame chicken..this is a good if no...
568450,2,I'm disappointed with the flavor. The chocolat...,0,i'm disappointed with the flavor. the chocolat...
568451,5,"These stars are small, so you can give 10-15 o...",1,"these stars are small, so you can give 10-15 o..."
568452,5,These are the BEST treats for training and rew...,1,these are the best treats for training and rew...


#### Sample Balanced Data

In [113]:
# sample balanced data
ones = raw_data[raw_data['labels'] == 1]
zeros = raw_data[raw_data['labels'] == 0]

# balance ones and zeros
resampled_ones = resample(ones, replace=True, n_samples=int(40000))
resampled_zeros = resample(zeros, replace=True, n_samples=int(40000))

sampled_data = pd.concat([resampled_ones, resampled_zeros])
sampled_data['labels'].value_counts()

1    40000
0    40000
Name: labels, dtype: int64

#### Shuffle the Data and Reset the Index

In [114]:
# shuffle and reset index
sampled_data = sampled_data.sample(frac=1).reset_index(drop=True)
sampled_data.head()

Unnamed: 0,Score,Text,labels,lmao
0,2,While the flowers were cute...they looked noth...,0,while the flowers were cute...they looked noth...
1,5,All the seeds started are growing. The first ...,1,all the seeds started are growing. the first ...
2,5,"I love the Keurig coffee maker, but it seemed ...",1,"i love the keurig coffee maker, but it seemed ..."
3,5,A great way to buy groceries if u cannot get o...,1,a great way to buy groceries if u cannot get o...
4,1,A number of us earnestly tried this product bu...,0,a number of us earnestly tried this product bu...


#### Remove Punctuations, Tokenize, Remove Stop Words, and Lemmatize Text

In [115]:
# clean text by removing punctuations and special characters and convert string to lower case
sampled_data = sampled_data.replace(r'[^A-Za-z0-9]+', ' ', regex=True)
sampled_data['Text'] = sampled_data['Text'].str.lower()

# tokenize text
sampled_data['tokenized_text'] = sampled_data['Text'].apply(nltk.tokenize.word_tokenize)

# remove stop words
stop_words = nltk.corpus.stopwords.words('english')
sampled_data['stop_removed_text'] = sampled_data['tokenized_text'].apply(lambda sentence: [word for word in sentence if word not in stop_words])

# lemmatize tokens
lemmatizer = nltk.stem.WordNetLemmatizer()
sampled_data['lemmatized_text'] = sampled_data['stop_removed_text'].apply(lambda sentence: [lemmatizer.lemmatize(word) for word in sentence])

# clean tokens into one string
sampled_data['cleaned_text'] = sampled_data['lemmatized_text'].apply(lambda sentence: ' '.join([word for word in sentence]))
sampled_data.head()

Unnamed: 0,Score,Text,labels,lmao,tokenized_text,stop_removed_text,lemmatized_text,cleaned_text
0,2,while the flowers were cute they looked nothin...,0,while the flowers were cute they looked nothin...,"[while, the, flowers, were, cute, they, looked...","[flowers, cute, looked, nothing, like, picture...","[flower, cute, looked, nothing, like, picture,...",flower cute looked nothing like picture none f...
1,5,all the seeds started are growing the first se...,1,all the seeds started are growing the first se...,"[all, the, seeds, started, are, growing, the, ...","[seeds, started, growing, first, set, leaves, ...","[seed, started, growing, first, set, leaf, sta...",seed started growing first set leaf started an...
2,5,i love the keurig coffee maker but it seemed s...,1,i love the keurig coffee maker but it seemed s...,"[i, love, the, keurig, coffee, maker, but, it,...","[love, keurig, coffee, maker, seemed, silly, s...","[love, keurig, coffee, maker, seemed, silly, s...",love keurig coffee maker seemed silly spend ex...
3,5,a great way to buy groceries if u cannot get o...,1,a great way to buy groceries if u cannot get o...,"[a, great, way, to, buy, groceries, if, u, can...","[great, way, buy, groceries, u, get, need, shi...","[great, way, buy, grocery, u, get, need, ship,...",great way buy grocery u get need ship someone ...
4,1,a number of us earnestly tried this product bu...,0,a number of us earnestly tried this product bu...,"[a, number, of, us, earnestly, tried, this, pr...","[number, us, earnestly, tried, product, trying...","[number, u, earnestly, tried, product, trying,...",number u earnestly tried product trying anothe...


## Task 1: TF-IDF Approach
### TF-IDF Feature Set

In [116]:
# create tf-idf feature set
tfidf_vect = TfidfVectorizer()

# split data into training and testing set with 70-30 split ratio
train_x, test_x, train_y, test_y = train_test_split(sampled_data['cleaned_text'], sampled_data['labels'], test_size=0.3)

tfidf_train_x = tfidf_vect.fit_transform(train_x)
tfidf_test_x = tfidf_vect.transform(test_x)

In [117]:
tfidf_train_x.shape

(56000, 38753)

### Logistic Regression on TF-IDF Feature Set

In [118]:
# perform logistic regresstion model on TF-IDF feature
tfidf_log = LogisticRegression(solver='liblinear')

# train model on training set
tfidf_log.fit(tfidf_train_x, train_y)

# test model on testing set
prediction_y = tfidf_log.predict(tfidf_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8754214430209035
Recall score:		0.8677416659704236
Accuracy score:		0.8724583333333333
F1 Score:		0.8715646372676541


### Random Forest Classifier on TF-IDF Feature Set

In [119]:
# perform multinomial naive bayes model on TF-IDF feature
tfidf_rfc = RandomForestClassifier()

# train model on training set
tfidf_rfc.fit(tfidf_train_x, train_y)

# test model on testing set
prediction_y = tfidf_rfc.predict(tfidf_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8908497825359653
Recall score:		0.8898821956721531
Accuracy score:		0.8907083333333333
F1 Score:		0.8903657262277953


### Support Vector Machine on TF-IDF Feature Set

In [120]:
# perform support vector machine model on TF-IDF feature
tfidf_svm = LinearSVC()

# train model on training set
tfidf_svm.fit(tfidf_train_x, train_y)

# test model on testing set
prediction_y = tfidf_svm.predict(tfidf_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8836696625402747
Recall score:		0.8707494360431114
Accuracy score:		0.878375
F1 Score:		0.8771619744981694


## Task 2: Word2Vec Approach
### Word2Vec Feature Set

In [121]:
# create a word2vec feature from google embeddings
sampled_data['embedding_text'] = sampled_data['lemmatized_text'].apply(lambda sentence: [embeddings[word] for word in sentence if word in embeddings])
sampled_data['word2vec_text'] = sampled_data['embedding_text'].apply(lambda arr: np.mean(arr, axis=0))
word2vec_data = pd.DataFrame(np.vstack(sampled_data['word2vec_text'].values))

# split data into training and testing set with 70-30 split ratio
word2vec_train_x, word2vec_test_x, train_y, test_y = train_test_split(word2vec_data, sampled_data['labels'], test_size=0.3, random_state=1)

### Logistic Regression on Word2Vec Feature Set

In [122]:
# perform logistic regresstion model on TF-IDF feature
word2vec_log = LogisticRegression(solver='liblinear')

# train model on training set
word2vec_log.fit(word2vec_train_x, train_y)

# test model on testing set
prediction_y = word2vec_log.predict(word2vec_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8214743315964808
Recall score:		0.8016170709344003
Accuracy score:		0.81375
F1 Score:		0.8114242321970975


### Random Forest Classifier on Word2Vec Feature Set

In [123]:
# perform multinomial naive bayes model on TF-IDF feature
word2vec_rfc = RandomForestClassifier()

# train model on training set
word2vec_rfc.fit(word2vec_train_x, train_y)

# test model on testing set
prediction_y = word2vec_rfc.predict(word2vec_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8507561358565408
Recall score:		0.8581311994665333
Accuracy score:		0.8538333333333333
F1 Score:		0.8544277533405261


### Support Vector Machine on Word2Vec Feature Set

In [124]:
# perform support vector machine model on TF-IDF feature
word2vec_svm = LinearSVC()

# train model on training set
word2vec_svm.fit(word2vec_train_x, train_y)

# test model on testing set
prediction_y = word2vec_svm.predict(word2vec_test_x)

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8203465050780917
Recall score:		0.8012003000750187
Accuracy score:		0.8129166666666666
F1 Score:		0.8106603694020411


## Task 3: BERT Without Tuning Approach
### Initialize and Run Untuned BERT

In [125]:
# truncate raw text since BERT only allows 512 string length
BERT_MAX_STR = 512
sampled_data['truncated_text'] = sampled_data['Text'].str.slice(0, BERT_MAX_STR)

# initialize pretrained BERT sentiment classifier
untuned_bert = pipeline('sentiment-analysis')

# predict using BERT model
result = untuned_bert(sampled_data['truncated_text'].tolist())
bert_predictions = pd.DataFrame(result)

# convert POSITIVE and NEGATIVE result into 1s and 0s
bert_predictions['predictions'] = bert_predictions['label'].map({'POSITIVE' : 1, 'NEGATIVE' : 0})
bert_predictions['labels'] = sampled_data['labels']
bert_predictions.head()

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
loading configuration file config.json from cache at /Users/sirutbuasai/.cache/huggingface/hub/models--distilbert-base-uncased-finetuned-sst-2-english/snapshots/324d3097568e82724d53d7ac1d312aa719d48037/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "disti

Unnamed: 0,label,score,predictions,labels
0,NEGATIVE,0.994321,0,0
1,NEGATIVE,0.946179,0,1
2,POSITIVE,0.996443,1,1
3,POSITIVE,0.990148,1,1
4,NEGATIVE,0.997801,0,0


### Analyze Untuned BERT

In [126]:
# retrive test and prediction labels
prediction_y = bert_predictions['predictions']
test_y = bert_predictions['labels']

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")

Precision score:	0.8902278290340803
Recall score:		0.7092
Accuracy score:		0.810875
F1 Score:		0.7894692900676259


## Task 4: BERT With Tuning Approach
### Initialize and Tune Tuned BERT

In [None]:
# initialized based uncased BERT tokenizer and classification model
bert_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_name, do_lower_case=True)
classifier = BertForSequenceClassification.from_pretrained(bert_name, num_labels=2)

# split data into training and testing set with 70-30 split ratio
train_x, validate_x, train_y, validate_y = train_test_split(sampled_data['Text'], sampled_data['labels'], test_size=0.3)
train_x = train_x.reset_index(drop=True)
validate_x = validate_x.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)
validate_y = validate_y.reset_index(drop=True)

### Process Data into Compatible Tensors

In [None]:
# tokenize the dataset
train_tokens = tokenizer(train_x.tolist(), truncation=True, padding=True, max_length=BERT_MAX_STR)
validate_tokens = tokenizer(validate_x.tolist(), truncation=True, padding=True, max_length=BERT_MAX_STR)

# class for token arr to tensors conversion
class NewsGroupsDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

# convert tokenized list into torch tensors
train_tensors = NewsGroupsDataset(train_tokens, train_y)
validate_tensors = NewsGroupsDataset(validate_tokens, validate_y)

### Initialize Training and Trainer for BERT

In [None]:
# metrics computation function
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
    'accuracy': acc,
  }

# initialize training arguments
training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=1,              # total number of training epochs
  per_device_train_batch_size=8,  # batch size per device during training
  per_device_eval_batch_size=10,   # batch size for evaluation
  warmup_steps=500,                # number of warmup steps for learning rate scheduler
  weight_decay=0.01,               # strength of weight decay
  logging_dir='./logs',            # directory for storing logs
  load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
  # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
  logging_steps=400,               # log & save weights each logging_steps
  save_steps=400,
  evaluation_strategy="steps",     # evaluate each `logging_steps`
)

# initialize trainer
trainer = Trainer(
  model=classifier,                    # the instantiated Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=train_tensors,         # training dataset
  eval_dataset=validate_tensors,       # evaluation dataset
  compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

### Fine-tune BERT Model

In [None]:
# train BERT
trainer.train()

### Create Test Data to Evaluate Tuned BERT

In [None]:
# balance ones and zeros
resampled_ones = resample(ones, replace=True, n_samples=int(20000))
resampled_zeros = resample(zeros, replace=True, n_samples=int(20000))

test_data = pd.concat([resampled_ones, resampled_zeros])
test_data['labels'].value_counts()

# shuffle and reset index
test_data = test_data.sample(frac=1).reset_index(drop=True)
test_data.head()

### Evaluate Tuned BERT

In [None]:
# create function to get BERT predictions
def get_prediction(text):
  # prepare our text into tokenized sequence
  inputs = tokenizer(text, padding=True, truncation=True, max_length=BERT_MAX_STR, return_tensors="pt").to("cuda")
  # perform inference to our model
  outputs = classifier(**inputs)
  # get output probabilities by doing softmax
  probs = outputs[0].softmax(1)
  # executing argmax function to get the candidate label
  return int(probs.argmax())

# evaluate testing set on BERT
test_data['predictions'] = test_data['Text'].apply(lambda text: get_prediction(text))

### Analyze Tuned BERT

In [None]:
# retrive test and prediction labels
prediction_y = test_data['predictions']
test_y = test_data['labels']

print(f"Precision score:\t{precision_score(test_y, prediction_y)}")
print(f"Recall score:\t\t{recall_score(test_y, prediction_y)}")
print(f"Accuracy score:\t\t{accuracy_score(test_y, prediction_y)}")
print(f"F1 Score:\t\t{f1_score(test_y, prediction_y)}")