In [1]:
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('/Users/mzheng/stat3494-paper/data/amazon_reviews.txt', delimiter = '\t')

In [4]:
df.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [6]:
# re-encode 'LABEL' values
df.loc[df["LABEL"] == "__label1__", "LABEL"] = '1'
df.loc[df["LABEL"] == "__label2__", "LABEL"] = '0'

In [7]:
# get information about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   DOC_ID             21000 non-null  int64 
 1   LABEL              21000 non-null  object
 2   RATING             21000 non-null  int64 
 3   VERIFIED_PURCHASE  21000 non-null  object
 4   PRODUCT_CATEGORY   21000 non-null  object
 5   PRODUCT_ID         21000 non-null  object
 6   PRODUCT_TITLE      21000 non-null  object
 7   REVIEW_TITLE       21000 non-null  object
 8   REVIEW_TEXT        21000 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


1. Exploring the dataset

In [11]:
products_by_labels = df.groupby(df["LABEL"]).PRODUCT_CATEGORY.value_counts()
products_by_labels

LABEL  PRODUCT_CATEGORY      
0      Apparel                   350
       Automotive                350
       Baby                      350
       Beauty                    350
       Books                     350
       Camera                    350
       Electronics               350
       Furniture                 350
       Grocery                   350
       Health & Personal Care    350
       Home                      350
       Home Entertainment        350
       Home Improvement          350
       Jewelry                   350
       Kitchen                   350
       Lawn and Garden           350
       Luggage                   350
       Musical Instruments       350
       Office Products           350
       Outdoors                  350
       PC                        350
       Pet Products              350
       Shoes                     350
       Sports                    350
       Tools                     350
       Toys                      350
       V

In [15]:
ratings_by_labels = df.groupby(df["LABEL"]).RATING.value_counts()
ratings_by_labels

LABEL  RATING
0      5         6151
       4         1974
       3          942
       1          868
       2          565
1      5         6059
       4         1999
       3          926
       1          889
       2          627
Name: RATING, dtype: int64

In [16]:
products_by_ratings = df.groupby(df["RATING"]).PRODUCT_CATEGORY.value_counts()
products_by_ratings

RATING  PRODUCT_CATEGORY  
1       Wireless              103
        Office Products        91
        PC                     84
        Lawn and Garden        77
        Electronics            74
                             ... 
5       Watches               375
        Shoes                 371
        Home Entertainment    359
        Wireless              357
        Furniture             342
Name: PRODUCT_CATEGORY, Length: 150, dtype: int64

In [17]:
labels_by_purchases = df.groupby(df["VERIFIED_PURCHASE"]).LABEL.value_counts()
labels_by_purchases

VERIFIED_PURCHASE  LABEL
N                  1        7623
                   0        1679
Y                  0        8821
                   1        2877
Name: LABEL, dtype: int64

2. Create new features

In [20]:
# create new feature: length of review_texts
df['TEXT_LENGTH'] = df['REVIEW_TEXT'].apply(len)
textLengths_by_labels = df.groupby(df["LABEL"]).TEXT_LENGTH.agg(lambda x: sum(x)/len(x))
textLengths_by_labels

LABEL
0    428.064571
1    316.538857
Name: TEXT_LENGTH, dtype: float64

In [21]:
# create new feature: number of sentences in review_text
df['NUM_SENTENCES'] = df['REVIEW_TEXT'].apply(lambda x: len(str(x).split('.')))

In [25]:
# get a list of common english words ('like', 'and', 'I')
import nltk
nltk.download('stopwords')
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mzheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def stop_count(x):
    total = 0
    for char in x.split():
        total += char in stop_words
    return total

# create new feature: number of stop words
df['STOP_COUNT'] = df['REVIEW_TEXT'].apply(stop_count)

In [27]:
def caps_count(x):
    total = 0
    for char in x:
        total += char in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    return total

# create new feature: number of capital letters
df['CAPS_COUNT'] = df['REVIEW_TEXT'].apply(caps_count)

In [29]:
import string

count = lambda l1, l2: sum([1 for x in l1 if x in l2])

def punct_count(x):
    return count(x, set(string.punctuation))

# create new feature: number of punctuation symbols
df['PUNCT_COUNT'] = df['REVIEW_TEXT'].apply(punct_count)

In [30]:
# create new feature: number of emojis used
df["EMOJIS"] = df["REVIEW_TEXT"].apply(lambda x: 1 if ";)" in x.split() or ":)" in x.split() or ":-)" in x.split() else 0)

In [31]:
# create new feature: sentiment classifier

# any rating < 3 is a negative review
df.loc[df["RATING"] < 3, "RATING"] = 0

# a review of 3 is neutral and doesn't fall into either category

# any rating > 3 is a positive review
df.loc[df["RATING"] > 3, "RATING"] = 1

In [33]:
# rating 1 is over-represented compared to rating 0 and rating 3 should be ignored
df.RATING.value_counts()

1    16183
0     2949
3     1868
Name: RATING, dtype: int64

In [34]:
# df with all RATING = 1
df1 = df.loc[df['RATING'] == 1]

# want to make RATING = 1 more proportional to RATING = 0 (select 20% of the data with RATING = 1)
df2 = df1.sample(frac=0.2, replace=True)

# df with all RATING = 0
df3 = df.loc[df['RATING'] == 0]

# combining df2 with df3 to make a resulting df that contains proportional amounts of RATING = 1 and RATING = 0
df4 = pd.concat([df2, df3], ignore_index=True)
df4

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TEXT_LENGTH,NUM_SENTENCES,STOP_COUNT,CAPS_COUNT,PUNCT_COUNT,EMOJIS
0,6596,1,1,N,Toys,B00M159GCK,Ginzick 4ch Rc Remote Control Speed Zoom Race ...,Wow! It's cool,My son loves this... it works awesome & goes r...,150,6,9,3,11,0
1,7654,1,1,N,Pet Products,B0114CAWBY,"★★ MASSIVE 32OZ GLUCOSAMINE★★Best Glucosamine,...",Has really helped,"A great product, I totally recommend it to you...",433,6,38,8,12,0
2,6800,1,1,Y,Camera,B00GXGZ6UY,HDView 2.4MP HD-AHD 1080P Outdoor Turbo Platin...,Caught a thief,"Installed about 3 months, works great. It caug...",157,2,9,2,4,0
3,11867,0,1,Y,Home,B0097GMHXG,Seville Classics WEB162 Mobile Laptop Desk Cart,Stable cart with a solid top.,This laptop cart has a one piece level top tha...,995,11,73,14,30,0
4,12086,0,1,N,Baby,B00A4B35I4,Fisher-Price Deluxe Newborn Rock 'N Play Sleep...,Great Product!,This is a must-have for those who want to have...,388,6,35,5,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6181,20961,0,0,Y,Shoes,B0069F61NU,MG Collection Lucca Designer Inspired Glamour ...,not same,"the bag is not same as the picture, nothing is...",109,2,13,1,5,0
6182,20967,0,0,Y,Shoes,B005B9GFUY,Fila Women's Memory Flux Slip Resistant Traini...,"Too man""ish""",These are so manish looking I sent them back. ...,146,4,9,5,7,0
6183,20970,0,0,Y,Shoes,B008MI08ZO,Stride Rite Star Wars Morphing Light-Up Sneake...,JUNK!,We are on our third pair in less than 2 months...,485,4,52,8,7,0
6184,20983,0,0,Y,Shoes,B00IA6US7G,West Blvd Womens LIMA MOCCASIN Boots 3-Layer F...,Good thing they are only for one outfit to hav...,These run I would say two sizes smaller than w...,487,8,41,14,16,0


In [40]:
# processing data to be split into test and training sets by isolating 2 key columns
raw_data = df4[['REVIEW_TEXT', 'RATING']]
raw_data = [tuple(x) for x in raw_data.values]

3. Data pre-processing

In [44]:
# process review text for model

# returns a table mapping each punctuation symbol to None; for use with translate() later
table = str.maketrans({key: None for key in string.punctuation})

def processor(text):
    # converts a word to its base form
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # stores bigrams (a pair of consecutive words)
    filtered_tokens = []

    # contains all the base words (converted from their original words in the review text)
    lemmatized_tokens = []

    # set of stop words (commonly used english words)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # returns a string where each character is mapped to its corresponding character in the translation table
    text = text.translate(table)

    # iterate through each word in review text
    for word in text.split(" "):
        if word not in stop_words: 
            # then, the word should be converted to its base form
            lemmatized_tokens.append(lemmatizer.lemmatize(word.lower()))

        # append the bigrams of that base word to filtered_tokens
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens

    return filtered_tokens

In [55]:
# create feature vectors (to be used with 'processsor' inside 'train_test_split')
# feature_vector numerically quantifies the contents of review_text so that ML models can use it to make predictions

# a global dictionary of features
global_feature_dict = dict()

def feature_vector(tokens):
    # a local dictionary of features
    local_feature_dict = dict()

    # iterate through each token
    for token in tokens:
        if token not in global_feature_dict:
            global_feature_dict[token] = 1
        else:
            global_feature_dict[token] =+ 1

        if token not in local_feature_dict:
            local_feature_dict[token] = 1
        else:
            local_feature_dict[token] =+ 1
    
    return local_feature_dict

In [56]:
nltk.download('wordnet')
nltk.download('omw-1.4')

def train_test_split(raw_data, p):
    # stores training set
    training_set = []

    # stores testing set
    testing_set = []

    # number of rows in raw_data
    all_raw_data = len(raw_data)

    # number of rows in half of raw_data
    half_raw_data = int(len(raw_data)/2)

    # extra rows to split by
    randomized_index = int((p * all_raw_data)/2)

    # suppose you have 100 data values and p=0.8
    # this for-loop selects 0-39 and 50-89 as training data
    for (review_text, rating) in raw_data[:randomized_index] + raw_data[half_raw_data:half_raw_data + randomized_index]:
        training_set.append((feature_vector(processor(review_text)), rating))

    # this for-loop selects 40-49 and 90-100 as testing data
    for (review_text, rating) in raw_data[randomized_index:half_raw_data] + raw_data[half_raw_data + randomized_index:]:
        testing_set.append((feature_vector(processor(review_text)), rating))
    
    return training_set, testing_set

[nltk_data] Downloading package wordnet to /Users/mzheng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mzheng/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [51]:
# split raw data (0.8 is training, 0.2 is testing)
training_set, testing_set = train_test_split(raw_data, 0.8)

4. Model building

In [52]:
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline

In [53]:
# the classifier
# training set is a list of (feature vector, label) where each 'feature vector' is a dict mapping strings to a number
def classifier(training_set):
    # pipeline containing the Linear Support Vector Classifier (SVM) from sklearn
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(training_set) # trains the SVM on training_set

In [54]:
# predicts labels using a trained classifier
def predict(testing_set, classifier):
    # for each 'review_text' in testing_set, map the corresponding prediction made by the classifer to it
    return classifier.classify_many(map(lambda x: x[0], testing_set))

In [57]:
# train the classifier
classifier = classifier(training_set)

# make predictions using the trained classifier
predictions = predict(testing_set, classifier)

# get true labels of test data
# for each 'rating' in testing_set, map it to a list
true_labels = list(map(lambda x: x[1], testing_set))



In [61]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# accuracy of model on test_data
accuracy = accuracy_score(true_labels, predictions)

# precision, recall, and fscore on test_data
precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-score: ", fscore)

Accuracy:  0.8618739903069467
Precision:  0.8620866149957174
Recall:  0.8618739903069467
F-score:  0.8618537097229935
