## Read data

In [21]:
import pandas as pd

file_path = './amazon_reviews_us_Office_Products_v1_00.tsv'
full_df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')
full_df.dropna()

print(full_df.head(100))

  full_df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')


   marketplace  customer_id       review_id  product_id  product_parent  \
0           US     43081963  R18RVCKGH1SSI9  B001BM2MAC       307809868   
1           US     10951564  R3L4L6LW1PUOFY  B00DZYEXPQ        75004341   
2           US     21143145  R2J8AWXWTDX2TF  B00RTMUHDW       529689027   
3           US     52782374  R1PR37BR7G3M6A  B00D7H8XB6       868449945   
4           US     24045652  R3BDDDZMZBZDPU  B001XCWP34        33521401   
..         ...          ...             ...         ...             ...   
95          US     43069257  R2Y8H6IMJICNHE  B00E7W6SIU       649134050   
96          US      4219837  R3BOZ2S3XKQQDQ  B005XBFYY8       292062400   
97          US     10021573  R2EWS2YM55KC99  B007AJ92T4       172016162   
98          US     24270459  R3SW4W88I5NUWB  B000E25X92        19288137   
99          US     15354510  R15QQMDRBSSDGY  B00N1Q70GM       137395659   

                                        product_title product_category  \
0      Scotch Cushion Wra

## Keep reviews and ratings

In [22]:
cols = ['review_body', 'star_rating']
reviews_ratings_df = full_df[cols]

print(reviews_ratings_df)

                                               review_body star_rating
0                                           Great product.           5
1        What's to say about this commodity item except...           5
2          Haven't used yet, but I am sure I will like it.           5
3        Although this was labeled as &#34;new&#34; the...           1
4                          Gorgeous colors and easy to use           4
...                                                    ...         ...
2640249  I can't live anymore whithout my Palm III. But...           4
2640250  Although the Palm Pilot is thin and compact it...           4
2640251  This book had a lot of great content without b...           4
2640252  I am teaching a course in Excel and am using t...           5
2640253  A very comprehensive layout of exactly how Vis...           5

[2640254 rows x 2 columns]


 ## Form two classes and select 50000 reviews randomly from each class

In [23]:
class_1 = reviews_ratings_df[reviews_ratings_df['star_rating'].isin([1, 2, 3])].copy()
class_2 = reviews_ratings_df[reviews_ratings_df['star_rating'].isin([4, 5])].copy()

sample_size = 50_000
class_1 = class_1.sample(n=min(len(class_1), sample_size))
class_2 = class_2.sample(n=min(len(class_2), sample_size))
classified_df = pd.concat([class_1, class_2])

classified_df.loc[:, 'class'] = classified_df['star_rating'].apply(lambda x: 1 if x in [1,2,3] else 2)

print(classified_df.tail(50))

                                               review_body star_rating  class
1427149  It's difficult to get a telephone/fax/answerin...           5      2
2333726  First of all, I ordered this thing at 3:30pm o...         5.0      2
43704                               I couldn't be happier.           5      2
2284893  This is by far the coolest note pad I have eve...         4.0      2
2632318  I stand by my decision to buy it. The case loo...           4      2
1052648  This product does a good job. No problems in u...         4.0      2
523414              Just as described like the style of it           4      2
394123   Very easy to use magnetic sheet. You can write...           5      2
971798                                         Easy to use           5      2
1267915  meet expectation.  delivery was timely, produc...           4      2
410864   Seems to be of good quality and coordinates wi...           5      2
1930978  i wanted this phones for an older person that ...      

## Data cleaning

In [24]:
import re
from contractions import fix

cleaned_df = classified_df.dropna(subset=['review_body'])
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].astype(str)
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].str.lower()
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].apply(lambda x: re.sub(r'<.*?>', '', x))
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].apply(lambda x: re.sub(r'http\S+', '', x))
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].apply(lambda x: ' '.join(x.split()))
cleaned_df.loc[:, 'review_body'] = cleaned_df['review_body'].apply(lambda x: fix(x))

ave_len_bef = classified_df['review_body'].str.len().mean()
ave_len_aft = cleaned_df['review_body'].str.len().mean()

print(f'Average length of reviews before vs after: {ave_len_bef:.1f}, {ave_len_aft:.1f}')

Average length of reviews before vs after: 321.5, 305.1


## Preprocessing

In [25]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/scottsus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/scottsus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/scottsus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Remove stop words 

In [26]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

no_stopwords_df = cleaned_df
no_stopwords_df.loc[:, 'review_body'] = no_stopwords_df['review_body'].apply(remove_stop_words)

print(no_stopwords_df)

                                               review_body star_rating  class
939996   soon put mfcdw printer signaled jam inside eve...           1      1
1698116  dealt machine years hated every minute yes eat...           1      1
1965940  let first saythis biggest junk ever purchased ...           1      1
2600383  like reviewers also replace mine unit would re...           2      1
2281202  printers last spews ink crimps page corners cr...         1.0      1
...                                                    ...         ...    ...
1313431                                          good unit           5      2
1207699  received yesterday meets needs speak hold long...           4      2
53121    gave journal gift little girl loves writing st...           5      2
45356    arrived perfect conditionjust put printer comm...           5      2
591625                                hard fine work great           5      2

[99993 rows x 3 columns]


### Perform lemmatization  

In [27]:
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

lemmatized_df = no_stopwords_df
lemmatized_df.loc[:, 'review_body'] = lemmatized_df['review_body'].apply(lemmatize)

print(lemmatized_df)

                                               review_body star_rating  class
939996   soon put mfcdw printer signaled jam inside eve...           1      1
1698116  dealt machine year hated every minute yes eat ...           1      1
1965940  let first saythis biggest junk ever purchased ...           1      1
2600383  like reviewer also replace mine unit would rec...           2      1
2281202  printer last spews ink crimp page corner crump...         1.0      1
...                                                    ...         ...    ...
1313431                                          good unit           5      2
1207699  received yesterday meet need speak hold long t...           4      2
53121    gave journal gift little girl love writing sto...           5      2
45356    arrived perfect conditionjust put printer comm...           5      2
591625                                hard fine work great           5      2

[99993 rows x 3 columns]


### Print results

In [28]:
ave_len_bef = cleaned_df['review_body'].str.len().mean()
ave_len_aft = classified_df['review_body'].str.len().mean()

print(f'Average length of reviews before vs after data preprocessing: {ave_len_bef:.1f}, {ave_len_aft:.1f}')

Average length of reviews before vs after data preprocessing: 189.4, 321.5


# TF-IDF and BoW Feature Extraction

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

processed_df = lemmatized_df
max_features = 5000

bow_vectorizer = CountVectorizer(max_features=max_features)
X_bow = bow_vectorizer.fit_transform(processed_df['review_body'])

tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
X_tfidf = tfidf_vectorizer.fit_transform(processed_df['review_body'])

y = processed_df['class']
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.20)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.20)

print(X_train_bow, X_test_bow, y_train_bow, y_test_bow)
print(X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)

  (0, 2814)	1
  (0, 1246)	1
  (0, 4263)	1
  (0, 4482)	1
  (0, 2975)	1
  (0, 1757)	1
  (1, 3305)	1
  (1, 4927)	1
  (1, 1869)	1
  (2, 3305)	1
  (2, 3311)	2
  (2, 3302)	1
  (2, 2188)	1
  (2, 3045)	1
  (2, 2582)	1
  (2, 2783)	1
  (2, 1906)	1
  (2, 2490)	1
  (2, 3394)	1
  (2, 4349)	1
  (2, 2535)	1
  (2, 3893)	1
  (2, 784)	1
  (2, 3150)	1
  (2, 574)	1
  :	:
  (79992, 2089)	1
  (79993, 3305)	1
  (79993, 602)	1
  (79993, 1463)	1
  (79993, 4500)	2
  (79993, 3890)	2
  (79993, 2200)	2
  (79993, 3333)	1
  (79993, 2202)	1
  (79993, 4528)	1
  (79993, 558)	1
  (79993, 1815)	1
  (79993, 4523)	1
  (79993, 557)	1
  (79993, 169)	1
  (79993, 3326)	1
  (79993, 2972)	1
  (79993, 4608)	1
  (79993, 1399)	1
  (79993, 1444)	1
  (79993, 3666)	1
  (79993, 3629)	1
  (79993, 3884)	1
  (79993, 772)	1
  (79993, 2934)	1   (0, 4722)	1
  (0, 2077)	1
  (0, 2841)	1
  (0, 2310)	1
  (0, 270)	1
  (0, 2636)	1
  (0, 4077)	1
  (0, 2754)	1
  (0, 4272)	1
  (0, 2368)	1
  (0, 263)	1
  (1, 3305)	1
  (1, 3302)	1
  (1, 602)	2
  (1, 16

# Perceptron Using Both Features

In [30]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_score, recall_score, f1_score

max_iters = 10_000

perc_bow = Perceptron(max_iter=max_iters)
perc_bow.fit(X_train_bow, y_train_bow)
y_pred_bow = perc_bow.predict(X_test_bow)

precision_bow = precision_score(y_test_bow, y_pred_bow)
recall_bow = recall_score(y_test_bow, y_pred_bow)
f1_bow = f1_score(y_test_bow, y_pred_bow)

print('Bag of Words Perceptron')
print(f'Precision: {precision_bow:.2f}, Recall: {recall_bow:.2f}, F1: {f1_bow:.2f}')

perc_tfidf = Perceptron(max_iter=max_iters)
perc_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = perc_tfidf.predict(X_test_tfidf)

precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_tfidf = recall_score(y_test_tfidf, y_pred_tfidf)
f1_tfidf = f1_score(y_test_tfidf, y_pred_tfidf)

print('TF-IDF Perceptron')
print(f'Precision: {precision_bow:.2}, Recall: {recall_tfidf:.2}, F1: {f1_tfidf:.2}')

Bag of Words Perceptron
Precision: 0.76, Recall: 0.81, F1: 0.79
TF-IDF Perceptron
Precision: 0.76, Recall: 0.79, F1: 0.79


# SVM Using Both Features

In [31]:
from sklearn.svm import LinearSVC

svm_bow = LinearSVC(dual=False, max_iter=max_iters)
svm_bow.fit(X_train_bow, y_train_bow)
y_pred_bow = svm_bow.predict(X_test_bow)

precision_bow = precision_score(y_test_bow, y_pred_bow)
recall_bow = recall_score(y_test_bow, y_pred_bow)
f1_bow = f1_score(y_test_bow, y_pred_bow)

print('Bag of Words SVM')
print(f'Precision: {precision_bow:.2}, Recall: {recall_bow:.2}, F1: {f1_bow:.2}')

svm_tfidf = LinearSVC(dual=False, max_iter=max_iters)
svm_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_tfidf = recall_score(y_test_tfidf, y_pred_tfidf)
f1_tfidf = f1_score(y_test_tfidf, y_pred_tfidf)

print('TF-IDF SVM')
print(f'Precision: {precision_tfidf:.2}, Recall: {recall_tfidf:.2}, F1: {f1_tfidf:.2}')

Bag of Words SVM
Precision: 0.84, Recall: 0.81, F1: 0.82
TF-IDF SVM
Precision: 0.84, Recall: 0.84, F1: 0.84


# Logistic Regression Using Both Features

In [32]:
from sklearn.linear_model import LogisticRegression

logreg_bow = LogisticRegression(max_iter=max_iters)
logreg_bow.fit(X_train_bow, y_train_bow)
y_pred_bow = logreg_bow.predict(X_test_bow)

precision_bow = precision_score(y_test_bow, y_pred_bow)
recall_bow = recall_score(y_test_bow, y_pred_bow)
f1_bow = f1_score(y_test_bow, y_pred_bow)

print('Bag of Words Logistic Regression')
print(f'Precision: {precision_bow:.2}, Recall: {recall_bow:.2}, F1: {f1_bow:.2}')

logreg_tfidf = LogisticRegression(max_iter=max_iters)
logreg_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_bow = recall_score(y_test_tfidf, y_pred_tfidf)
f1_bow = f1_score(y_test_tfidf, y_pred_tfidf)

print('TF-IDF Logistic Regression')
print(f'Precision: {precision_tfidf:.2}, Recall: {recall_tfidf:.2}, F1: {f1_tfidf:.2}')

Bag of Words Logistic Regression
Precision: 0.84, Recall: 0.82, F1: 0.83
TF-IDF Logistic Regression
Precision: 0.84, Recall: 0.84, F1: 0.84


# Naive Bayes Using Both Features

In [33]:
from sklearn.naive_bayes import MultinomialNB

nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train_bow)
y_pred_bow = nb_bow.predict(X_test_bow)

precision_bow = precision_score(y_test_bow, y_pred_bow)
recall_bow = recall_score(y_test_bow, y_pred_bow)
f1_bow = f1_score(y_test_bow, y_pred_bow)

print('Bag of Words Multinomial Naive Bayes')
print(f'Precision: {precision_bow:.2}, Recall: {recall_bow:.2}, F1: {f1_bow:.2}')

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

precision_tfidf = precision_score(y_test_tfidf, y_pred_tfidf)
recall_bow = recall_score(y_test_tfidf, y_pred_tfidf)
f1_bow = f1_score(y_test_tfidf, y_pred_tfidf)

print('TF-IDF Multinomial Naive Bayes')
print(f'Precision: {precision_tfidf:.2}, Recall: {recall_tfidf:.2}, F1: {f1_tfidf:.2}')

Bag of Words Multinomial Naive Bayes
Precision: 0.83, Recall: 0.74, F1: 0.78
TF-IDF Multinomial Naive Bayes
Precision: 0.82, Recall: 0.84, F1: 0.84
