In [None]:
import numpy
import spacy
import pandas as pd

In [None]:
# Load the product reviews CSV into a DataFrame and preview the first rows.
csv_path = "amazon_product_reviews.csv"
df = pd.read_csv(csv_path)
df.head()

Cleaning

In [None]:
# Select only the necessary columns for sentiment analysis.
# Take an explicit copy: a column is added to `cleaned` further down, and
# assigning to a slice of `df` would raise pandas' SettingWithCopyWarning.
cleaned = df[['reviews.text', 'reviews.title']].copy()
cleaned.head()

In [None]:
# Count the missing values in each selected column.
null_counts = cleaned.isna().sum()
null_counts

In [None]:
# Load the English spaCy pipeline used by every later cell.
model_name = 'en_core_web_md'
nlp = spacy.load(model_name)

In [None]:
# Lemmatize text and filter out punctuation and stop words.
def _lemmatize(sample):
    """Return the space-joined lemmas of `sample`, without punctuation or stop words.

    The text is lower-cased and stripped before being passed to `nlp`.
    Non-string values (e.g. NaN for a missing review) yield an empty string
    instead of raising AttributeError on `.lower()`.
    """
    if not isinstance(sample, str):
        return ''
    doc = nlp(sample.lower().strip())
    return ' '.join(
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_stop
    )


# `assign` builds a new frame rather than writing into a slice of `df`, so no
# SettingWithCopyWarning is raised and the cell can be re-run safely.
# The column name contains a dot, hence the dict-unpacking form.
cleaned = cleaned.assign(**{'processed.text': cleaned['reviews.text'].apply(_lemmatize)})
cleaned.head()

Polarity

In [None]:
# Import necessary libraries for further analysis and add textblob.
# Extension to spaCy pipeline.
from spacytextblob.spacytextblob import SpacyTextBlob
from collections import defaultdict
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
nlp.add_pipe('spacytextblob')

In [None]:
# Function to calculate polarity and subjectivity of a text.
def polarity(sample):
    """Score a text with the TextBlob extension of the spaCy pipeline.

    Parameters
    ----------
    sample
        The text to analyse; it is passed to `nlp` as-is.

    Returns
    -------
    tuple
        `(sample, polarity, subjectivity)`: the input unchanged, followed by
        the `polarity` and `subjectivity` attributes of `doc._.blob`.
    """
    blob = nlp(sample)._.blob
    return sample, blob.polarity, blob.subjectivity

In [None]:
# Initialise dictionaries to count positive and negative words.
positive_words = defaultdict(int)
negative_words = defaultdict(int)

# Iterate over processed text samples to determine sentiment.
for item in cleaned['processed.text'].values:

    sample, polarity_score, subjectivity_score = polarity(item)

    # Pick the label and the counter (if any) that this review feeds.
    if polarity_score > 0:
        sentiment = "Positive"
        word_counts = positive_words

    elif polarity_score < 0:
        sentiment = "Negative"
        word_counts = negative_words

    else:
        sentiment = "Neutral"
        word_counts = None

    # Count individual tokens, not the whole review string: the counters are
    # used as word frequencies for the word clouds, so each key must be a word.
    if word_counts is not None:
        for word in item.split():
            word_counts[word] += 1

    print(f"Review: {sample}\nPolarity score: {polarity_score}\nSentiment: {sentiment}\nSubjectivity: {subjectivity_score}\n")

    

In [None]:
# Word clouds for positive and negative words.
def _build_cloud(frequencies):
    """Return a WordCloud built from `frequencies`, or None if it is empty.

    WordCloud.generate_from_frequencies raises ValueError when given no words,
    so an empty counter is reported as None rather than crashing the cell.
    """
    if not frequencies:
        return None
    return WordCloud(width=400, height=200, background_color='white').generate_from_frequencies(frequencies)


pos_wordcloud = _build_cloud(positive_words)
neg_wordcloud = _build_cloud(negative_words)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Draw both panels with the same code path instead of two copy-pasted blocks.
panels = zip(ax, (pos_wordcloud, neg_wordcloud), ('Positive Words', 'Negative Words'))
for axis, cloud, title in panels:
    if cloud is None:
        # Nothing to render for this sentiment; keep the panel but say so.
        axis.text(0.5, 0.5, 'No words to display', ha='center', va='center')
    else:
        axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(title)
    axis.axis('off')

# Show the figure explicitly so no stray return value is echoed by the cell.
plt.show()

In [None]:
# Function to calculate similarity between two reviews.
def review_similarity(review_1, review_2):
    """Return the spaCy similarity score between two review texts.

    Both texts are run through `nlp`; the result is whatever the first
    document's `similarity` method returns for the second document.
    """
    first_doc = nlp(review_1)
    second_doc = nlp(review_2)
    return first_doc.similarity(second_doc)

# Select two reviews from the cleaned DataFrame.
# Use positional access (.iloc): plain `series[0]` is a label lookup and
# breaks as soon as the index is no longer the default 0..n-1 range
# (for example after rows have been filtered out).
processed_reviews = cleaned["processed.text"]
review_1 = processed_reviews.iloc[0]
review_2 = processed_reviews.iloc[1]

# Calculate the similarity between the selected reviews.
similarity_score = review_similarity(review_1, review_2)

print(f"The similarity between the two reviews is: {similarity_score}")