**Amazon Review Dataset Preprocessing and Sentiment Analysis**

In [None]:
# Libraries used throughout the notebook: standard library first, then
# third-party packages, followed by the one-off setup calls.
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# NLTK resources required by the stop-word and VADER sentiment cells below.
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Apply the ggplot look to every matplotlib figure in the notebook.
plt.style.use('ggplot')

In [53]:
# Load the review data; rows that pandas cannot parse are dropped
# (on_bad_lines="skip") instead of aborting the read.
REVIEWS_CSV_PATH = "/content/amazon_product_reviews.csv"
df = pd.read_csv(REVIEWS_CSV_PATH, on_bad_lines="skip")

In [54]:
#Changing all the reviews to lower case
# Missing reviews are replaced with '' and every value is cast to str first,
# so the string-based cleaning steps applied to df['reviews'] afterwards never
# receive a float NaN.
df['reviews'] = df['Reviews'].fillna('').astype(str).str.lower()

In [None]:
#Removing all the punctuations in the reviews
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the punctuation untouched. The raw string keeps
# \w and \s from being parsed as (invalid) Python escape sequences.
df['reviews'] = df['reviews'].str.replace(r'[^\w\s]', '', regex=True)

In [33]:
#Removing emojis in the reviews
# Character class built from emoji / pictograph / symbol code-point ranges.
# NOTE(review): the \U000024C2-\U0001F251 and \U00010000-\U0010ffff ranges are
# very broad and also match many non-emoji characters (e.g. CJK text) --
# confirm that is acceptable for this dataset.
emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"
                           "\U0001F300-\U0001F5FF"
                           "\U0001F680-\U0001F6FF"
                           "\U0001F1E0-\U0001F1FF"
                           "\U00002500-\U00002BEF"
                           "\U00002702-\U000027B0"  # previously listed twice
                           "\U000024C2-\U0001F251"
                           "\U0001f926-\U0001f937"
                           "\U00010000-\U0010ffff"
                           "\u2640-\u2642"
                           "\u2600-\u2B55"
                           "\u200d"
                           "\u23cf"
                           "\u23e9"
                           "\u231a"
                           "\ufe0f"
                           "\u3030"
                           "]+", flags=re.UNICODE)
# The vectorized .str accessor substitutes on every string and passes missing
# values through unchanged, where a plain .apply(emoji_pattern.sub) would raise.
df['reviews'] = df['reviews'].str.replace(emoji_pattern, '', regex=True)

In [56]:
#Removing stop words in the reviews
stop = stopwords.words('english')
# Set copy of the stop-word list so each token is checked in constant time
# instead of scanning the whole list; `stop` itself is left as the original list.
stop_words = frozenset(stop)
# Split each review on whitespace, drop the stop words, and re-join with single spaces.
df['reviews'] = df['reviews'].apply(
    lambda review: " ".join(word for word in review.split() if word not in stop_words)
)

In [57]:
# Lemmatization with spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')


def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))


# Replace every cleaned review with its lemmatized form.
df['reviews'] = df['reviews'].apply(lemmatize_text)

In [None]:
#Making a bar graph of ratings
# Count how often each rating occurs and order the bars by rating value.
rating_counts = df['Ratings'].value_counts().sort_index()
# Explicit figure/axes pair: `fig` is the Figure and `ax` the Axes being drawn on.
fig, ax = plt.subplots()
rating_counts.plot.bar(ax=ax, title="User's Ratings on Amazon Alexa", color="blue")
ax.set_xlabel("Ratings")
ax.set_ylabel("Number of reviews")
plt.show()

In [None]:
# Word cloud built from all cleaned reviews concatenated into one string
text = ' '.join(df['reviews'])
cloud_options = dict(width=800, height=400, background_color='white', max_font_size=130)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image on a 10x5 inch figure with the axes hidden.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='sinc')
cloud_ax.axis('off')
plt.show()

In [60]:
#Generating polarity of the reviews as positive, negative, or neutral
sentiments = SentimentIntensityAnalyzer()
# Score every review once and reuse the result for all three columns
# (previously polarity_scores was called three times per review).
# NOTE(review): the scores are computed on the cleaned text in df["reviews"];
# VADER also uses capitalization, punctuation and negation words as cues, so
# scoring the untouched df["Reviews"] may be more accurate -- confirm.
review_scores = [sentiments.polarity_scores(review) for review in df["reviews"]]
df["Positive"] = [scores["pos"] for scores in review_scores]
df["Negative"] = [scores["neg"] for scores in review_scores]
df["Neutral"] = [scores["neu"] for scores in review_scores]

In [76]:
# Helper that turns a row's polarity scores into a single sentiment label
def get_overall_sentiment(row):
    """Label a row by comparing its "Positive" and "Negative" scores.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            "Positive" and "Negative" keys.

    Returns:
        "positive" if the positive score is larger, "negative" if the
        negative score is larger, otherwise "neutral".
    """
    positive_score, negative_score = row["Positive"], row["Negative"]
    if positive_score > negative_score:
        return "positive"
    if positive_score < negative_score:
        return "negative"
    # Reached when the scores are equal or cannot be ordered (e.g. NaN).
    return "neutral"

# Label every row by calling get_overall_sentiment once per row (axis=1).
overall_sentiment = df.apply(get_overall_sentiment, axis=1)
df["OverallSentiment"] = overall_sentiment

In [None]:
# Preview only the first rows instead of rendering the whole DataFrame.
df.head()

In [None]:
#Creating pie chart to graphically represent the overall sentiment
sentiment_counts = df['OverallSentiment'].value_counts()
# `colors` takes a sequence with one entry per wedge, not a single colour name,
# so each sentiment label is mapped to its own colour (in wedge order).
sentiment_colors = {"positive": "green", "negative": "red", "neutral": "gray"}
wedge_colors = [sentiment_colors.get(label, "lightgray") for label in sentiment_counts.index]
pie_chart = sentiment_counts.plot.pie(title="Sentiment Pie Chart", colors=wedge_colors)
# Hide the default y-axis label that pandas adds to pie charts.
pie_chart.set_ylabel('')
plt.show()