In [1]:
import pandas as pd
import contractions
import re
import numpy as np

# Dataset Generation

In [2]:
df = pd.read_csv(
    "./amazon_reviews_us_Office_Products_v1_00.tsv", sep="\t", on_bad_lines="skip"
)

  df = pd.read_csv(


In [3]:
df["review"] = df["review_headline"] + " " + df["review_body"]
df = df.drop(
    [
        "marketplace",
        "customer_id",
        "review_id",
        "product_id",
        "product_parent",
        "product_title",
        "product_category",
        "helpful_votes",
        "total_votes",
        "vine",
        "verified_purchase",
        "review_date",
        "review_headline",
        "review_body",
    ],
    axis=1,
)

def safe_convert(x):
    try:
        return int(x)
    except ValueError:
        return pd.NA


df["star_rating"] = df["star_rating"].apply(safe_convert)
# Drop rows with NaN
df = df.dropna(subset=["star_rating"])
# Remove rows with non strings for reviews
df = df[df["review"].apply(lambda x: isinstance(x, str))]

In [4]:
one_star = df[df["star_rating"] == 1][0:50000]
two_star = df[df["star_rating"] == 2][0:50000]
three_star = df[df["star_rating"] == 3][0:50000]
four_star = df[df["star_rating"] == 4][0:50000]
five_star = df[df["star_rating"] == 5][0:50000]
reviews = pd.concat([one_star, two_star, three_star, four_star, five_star])

In [5]:
def binning(row):
    if row["star_rating"] >= 4:
        return 1
    elif row["star_rating"] <= 2:
        return 0
    else:
        return 2


reviews["label"] = reviews.apply(binning, axis=1)

In [6]:
def cleaning(row):
    review = row["review"]
    # Lower case
    review = review.lower()
    # Remove html tags
    review = re.sub(r"<.*?>", "", review)
    # Remove urls
    review = re.sub(r"http[s]?://\S+", "", review)
    # Remove non alphanumeric characters
    review = re.sub(r"[^a-zA-Z\s]", "", review)
    # Remove extra whitespaces
    review = re.sub(r"[\s]+", " ", review)
    # Remove contractions
    review = contractions.fix(review)

    return review


reviews["review"] = reviews.apply(cleaning, axis=1)

In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download("wordnet")
nltk.download("stopwords")

words = stopwords.words("english")

def remove_stopwords(row):
    review = row["review"]
    review = review.split(" ")
    review = [word for word in review if word not in words]
    review = " ".join(review)
    return review


reviews["review"] = reviews.apply(remove_stopwords, axis=1)

[nltk_data] Downloading package wordnet to /home/sujay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sujay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

nltk.download("universal_tagset")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")

lemmatizer = WordNetLemmatizer()


def lemmatize(row):
    review = row["review"]
    words = nltk.word_tokenize(review)
    lemmatized_words = []
    for word, tag in pos_tag(words, tagset="universal"):
        if tag == "NOUN":
            lemmatized_words.append(lemmatizer.lemmatize(word, "n"))
        elif tag == "VERB":
            lemmatized_words.append(lemmatizer.lemmatize(word, "v"))
        elif tag == "ADJ":
            lemmatized_words.append(lemmatizer.lemmatize(word, "a"))
        elif tag == "ADV":
            lemmatized_words.append(lemmatizer.lemmatize(word, "r"))
        else:
            lemmatized_words.append(word)
    lemmatized_review = " ".join(lemmatized_words)
    return lemmatized_review


reviews["review"] = reviews.apply(lemmatize, axis=1)

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/sujay/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sujay/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/sujay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [61]:
# Drop rows with empty reviews
reviews = reviews[reviews["review"].apply(lambda x: len(x) > 0)]

# Word Embedding

In [23]:
import gensim.downloader as api
from gensim import utils, matutils

google_wv = api.load("word2vec-google-news-300")

In [24]:
# print(wv.similarity(wv['king'] - wv['man'] + wv['woman'], wv['queen']))
def similarity_score(w1, w2):
    return np.dot(matutils.unitvec(w1), matutils.unitvec(w2))

In [25]:
print(google_wv.similarity("excellent", "outstanding"))
print(
    similarity_score(
        google_wv["king"] - google_wv["man"] + google_wv["woman"], google_wv["queen"]
    )
)

0.55674857
0.73005176


In [26]:
import gensim.models

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        for r in reviews['review']:
            # assume there's one document per line, tokens separated by whitespace
            yield r

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences, min_count=10, vector_size=200)

In [27]:
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(29348452, 170707160)

# Simple models

In [62]:
from sklearn.model_selection import train_test_split

# Drop neutral reviews
reviews_no_neutral = reviews[reviews["label"] != 2]


x_train_og, x_test_og, y_train, y_test = train_test_split(
    reviews_no_neutral["review"],
    reviews_no_neutral["label"],
    test_size=0.2,
    random_state=42,
)
x_train_og = x_train_og.to_frame()
x_test_og = x_test_og.to_frame()
y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [64]:
def get_features(row):
    review = row["review"]
    combined_vector = np.zeros(google_wv.vector_size)
    if len(review) == 0:
        print(review)
    for w in review:
        if w in google_wv:
            combined_vector += google_wv[w]
    return combined_vector/len(review)


x_train = np.vstack(x_train_og.apply(get_features, axis=1).to_numpy())
x_test = np.vstack(x_test_og.apply(get_features, axis=1).to_numpy())

In [75]:
print(len(x_test))
print(len(y_test))

39997
39997


In [67]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

p_model = Perceptron(tol=1e-3, random_state=0)
p_model.fit(x_train, y=y_train)

y_test_pred = p_model.predict(x_test)

# Test
# print(
#     f"Accuracy, Precision, Recall, F1 (Test Set): {accuracy_score(y_test, y_test_pred)}, {precision_score(y_test, y_test_pred)}, {recall_score(y_test, y_test_pred)}, {f1_score(y_test, y_test_pred)}"
# )

  y = column_or_1d(y, warn=True)


Accuracy, Precision, Recall, F1 (Test Set): 0.6412980973573018, 0.9243211158925657, 0.31042559553473537, 0.4647640365603432


In [68]:
from sklearn.svm import LinearSVC

model = LinearSVC(
    dual="auto", random_state=0, tol=1e-05
)  # You can change the kernel and other parameters

# Train the model
model.fit(x_train, y_train)

# Make predictions
y_test_pred = model.predict(x_test)

# Test
print(
    f"Accuracy, Precision, Recall, F1 (Test Set): {accuracy_score(y_test, y_test_pred)}, {precision_score(y_test, y_test_pred)}, {recall_score(y_test, y_test_pred)}, {f1_score(y_test, y_test_pred)}"
)

  y = column_or_1d(y, warn=True)


Accuracy, Precision, Recall, F1 (Test Set): 0.7169287696577243, 0.751959428307976, 0.6502541612678162, 0.6974183548025015
