Step 1: Import libraries: spaCy (NLP), pandas (to handle the Amazon CSV file) and TextBlob (for polarity)

In [71]:
import spacy
import pandas as pd
from textblob import TextBlob

Step 2: Load spaCy's English model (small)

In [72]:
# The small model loads quickly at the cost of some accuracy
nlp = spacy.load('en_core_web_sm')

Step 3: Preprocess data function

In [73]:
def preprocess(text):
    """Normalise a review into a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are dropped; every
    remaining token is replaced by its lowercased lemma.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filters.
    """
    doc = nlp(text)  # tokenizes the text fed to it

    # NOTE(review): spaCy's English stop list appears to include negations
    # such as "not"/"no", so this filter can remove words that carry
    # sentiment -- confirm that is acceptable for the polarity step.
    lemmas = [
        token.lemma_.lower()
        for token in doc
        # is_space filters the whitespace tokens spaCy emits for newlines and
        # repeated spaces; they are neither stop words nor punctuation and
        # would otherwise leak stray whitespace into the joined string.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

Step 4: Load dataset and clean data

In [74]:
# Read the Amazon product reviews CSV and preview the first 5 rows
df = pd.read_csv('amazon_product_reviews.csv')
df.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.didPurchase,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht..."
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht..."
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht..."
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht..."
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht..."


In [75]:
# Dimensions of the dataset as (rows, columns)
df.shape

(28332, 24)

In [76]:
# Concise summary: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28332 entries, 0 to 28331
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   28332 non-null  object 
 1   dateAdded            28332 non-null  object 
 2   dateUpdated          28332 non-null  object 
 3   name                 28332 non-null  object 
 4   asins                28332 non-null  object 
 5   brand                28332 non-null  object 
 6   categories           28332 non-null  object 
 7   primaryCategories    28332 non-null  object 
 8   imageURLs            28332 non-null  object 
 9   keys                 28332 non-null  object 
 10  manufacturer         28332 non-null  object 
 11  manufacturerNumber   28332 non-null  object 
 12  reviews.date         28332 non-null  object 
 13  reviews.dateSeen     28332 non-null  object 
 14  reviews.didPurchase  9 non-null      object 
 15  reviews.doRecommend  16086 non-null 

In [77]:
# Number of missing values in each column
df.isnull().sum()

id                         0
dateAdded                  0
dateUpdated                0
name                       0
asins                      0
brand                      0
categories                 0
primaryCategories          0
imageURLs                  0
keys                       0
manufacturer               0
manufacturerNumber         0
reviews.date               0
reviews.dateSeen           0
reviews.didPurchase    28323
reviews.doRecommend    12246
reviews.id             28291
reviews.numHelpful     12217
reviews.rating             0
reviews.sourceURLs         0
reviews.text               0
reviews.title              0
reviews.username           5
sourceURLs                 0
dtype: int64

In [78]:
# reviews.rating will be used as the reference for judging the accuracy of the NLP polarity predictions
df['reviews.rating'].unique()

array([3, 4, 5, 1, 2], dtype=int64)

In [79]:
# Only reviews.text and reviews.rating are needed for the sentiment analysis and for measuring its accuracy,
# so work on a copy of df restricted to these two columns.
# Rows with missing values are dropped as instructed, although neither column actually has any missing values.
df_copy = df[['reviews.text', 'reviews.rating']].dropna()
df_copy.head()

Unnamed: 0,reviews.text,reviews.rating
0,I order 3 of them and one of the item is bad q...,3
1,Bulk is always the less expensive way to go fo...,4
2,Well they are not Duracell but for the price i...,5
3,Seem to work as well as name brand batteries a...,5
4,These batteries are very long lasting the pric...,5


Step 5: Create functions for sentiment analysis

In [80]:
# Work on a random subset of up to 5000 reviews for speed (this reduces accuracy);
# the fixed seed keeps the subset identical across runs.
# min() avoids the ValueError that sample() raises when asked for more rows
# than the frame holds (sampling here is without replacement).
df_copy = df_copy.sample(n=min(5000, len(df_copy)), random_state=42)

# Creating a new column of processed reviews.text
df_copy['processed_text'] = df_copy['reviews.text'].apply(preprocess)

# Polarity scoring helper built on TextBlob
def analyze_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

# Creating a function that converts the polarity score to a sentiment label
def polarity_to_sentiment(polarity_score, threshold=0.0):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    Scores strictly above ``threshold`` are 'positive', scores strictly below
    ``-threshold`` are 'negative', and everything else is 'neutral'.

    The default of 0 is deliberate. An earlier attempt split the -1..1
    polarity range into thirds (cut-offs at +/-0.33) in the expectation of
    higher accuracy, but it only produced "Accuracy: 0.5472". That variant can
    be approximated with ``threshold=0.33``; note the bounds are strict here,
    whereas the experiment used inclusive ones (>= 0.33 / <= -0.33).

    Args:
        polarity_score: Polarity value, expected to lie between -1 and 1.
        threshold: Non-negative half-width of the neutral band around 0.

    Returns:
        'positive', 'negative' or 'neutral'.

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if polarity_score > threshold:
        return 'positive'
    elif polarity_score < -threshold:
        return 'negative'
    else:
        return 'neutral'

In [81]:
# Score each preprocessed review for polarity, then map every score to a sentiment label
df_copy['polarity_score'] = df_copy['processed_text'].apply(analyze_polarity)
df_copy['predicted_sentiment'] = df_copy['polarity_score'].apply(polarity_to_sentiment)

In [82]:
# Turn star ratings into sentiment categories; these serve as the reference
# labels against which the polarity-based predictions are scored
def rating_to_sentiment(rating):
    """Map a review rating to 'positive', 'negative' or 'neutral'.

    Ratings of 4 and above are 'positive', ratings of 2 and below are
    'negative', and anything in between is 'neutral'.
    """
    if rating <= 2:
        return 'negative'
    if rating >= 4:
        return 'positive'
    return 'neutral'

In [86]:
# Add the rating-derived sentiment to df_copy as the reference label and preview the result
df_copy['actual_sentiment'] = df_copy['reviews.rating'].apply(rating_to_sentiment)
df_copy.head()

Unnamed: 0,reviews.text,reviews.rating,processed_text,polarity_score,predicted_sentiment,actual_sentiment
19947,Awesome tablet. I was amazed how fast it is. A...,5,awesome tablet amazed fast software user friendly,0.525,positive,positive
7445,They don't last. USed in electronics (like com...,1,electronic like computer mouse computer keyboa...,0.025,positive,negative
3525,Thx.,5,thx,0.0,neutral,positive
24122,kids love it EZ to use great Quality bought th...,5,kid love ez use great quality buy grand kid su...,0.538889,positive,positive
25572,The kids feature is great. My 18 month old tak...,4,kid feature great 18 month old take love block...,0.44375,positive,positive


Step 6: Testing accuracy

In [87]:
# Share of reviews whose predicted label agrees with the rating-derived label
total_samples = len(df_copy)
correct_predictions = df_copy['predicted_sentiment'].eq(df_copy['actual_sentiment']).sum()
accuracy = correct_predictions / total_samples
# Report the accuracy
print(f"Accuracy: {accuracy}")