In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn import metrics
from sklearn.metrics import roc_curve,auc

In [3]:
import re
import nltk
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import tqdm

In [4]:
from gensim.models import word2vec
from gensim.models import keyedvectors
import pickle,os,json,warnings

<div style="background-color:#202123; color:#ffffff; padding:8px 14px; border-radius:6px;">
  <h2 style="margin:6px 0; color:#ffffff;">
    1 INTRODUCTION
  </h2>
</div>

The Amazon Product Reviews dataset contains user-generated reviews for a wide range of products sold on Amazon. Each record represents a single review and includes both unstructured text data and structured metadata, making it suitable for Natural Language Processing (NLP) and sentiment analysis tasks.

The dataset captures customer opinions in the form of review text and ratings, along with contextual information such as purchase verification, review time, and product identifiers. Due to the large number of product categories, analysis is typically performed on a subset of categories, while the insights and methodologies can be generalized to other categories.
Reference: <a href="https://nijianmo.github.io/amazon/">https://nijianmo.github.io/amazon/</a>

<div style="background-color:#202123; color:#ffffff; padding:8px 14px; border-radius:6px;">
  <h2 style="margin:6px 0; color:#ffffff;">
    2 Data Loading
  </h2>
</div>

In [5]:
import pandas as pd
import gzip

def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each non-blank line.

  The file is opened in a ``with`` block so the handle is released when
  the generator is exhausted or closed, instead of being left open.
  """
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A blank line (e.g. a trailing newline) is not a JSON document; skip it.
      if l.strip():
        yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream; those
  positions become the index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

In [6]:
# Read three review files with getDF, one DataFrame per file.
# NOTE(review): despite its name, df_fashion is loaded from the All_Beauty file.
df_fashion = getDF('Dataset/Dataset/All_Beauty_5.json.gz')
amazon_fashion = getDF("Dataset/Dataset/AMAZON_FASHION_5.json.gz")
appliances_5 = getDF('Dataset/Dataset/Appliances_5.json.gz')


In [7]:
# Stack the three frames row-wise into a single DataFrame and preview it.
# Each input keeps the 0-based index assigned by getDF, so index labels
# repeat in the combined frame.
df = pd.concat([df_fashion, amazon_fashion, appliances_5])
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [8]:
# Persist the combined frame as CSV without the index column.
# NOTE(review): the target path has no file extension — confirm that readers expect "Dataset/Dataset/data".
df.to_csv("Dataset/Dataset/data",index=False)

<h2>Dropping rows where overall = 3</h2>

In [9]:
# Remove the rows whose rating is exactly 3.
# .copy() makes df an independent frame, so the column assignments made in
# later cells (unixTime, Score) do not trigger SettingWithCopyWarning.
df = df.loc[df["overall"] != 3, :].copy()

In [10]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


<h2>Converting Unix time to a pandas datetime</h2>

In [11]:
# Interpret unixReviewTime as a count of seconds (unit='s') and store the resulting datetimes in a new column.
df["unixTime"]=pd.to_datetime(df["unixReviewTime"],unit='s')

In [12]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,,2016-09-01
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,,2013-11-14
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,,2013-08-18
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,,2011-05-03
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,,2011-05-06


<h2>Adding a Score column: 1 where overall is greater than 3, 0 where it is less than 3</h2>

In [13]:
# Binary label: 1 when the rating is above 3, otherwise 0.
df["Score"] = (df["overall"] > 3).astype(int)

In [14]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime,Score
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,,2016-09-01,1
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,,2013-11-14,1
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,,2013-08-18,0
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,,2011-05-03,1
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,,2011-05-06,1


In [15]:
df.shape

(9855, 14)

<div style="background-color:#202123; color:#ffffff; padding:8px 14px; border-radius:6px;">
  <h2 style="margin:6px 0; color:#ffffff;">
    3 Exploratory Data Analysis
  </h2>
</div>

In [16]:
df.head(1)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime,Score
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,,2016-09-01,1


In [17]:
# Sort ascending by reviewer, then review time, then summary and text,
# which places identical reviews on adjacent rows.
sorted_df = df.sort_values(
    by=["reviewerName", "reviewerID", "unixTime", "summary", "reviewText"]
)

In [18]:
sorted_df.head(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime,Score
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...,"Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,2017-08-26,1
4187,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B001LNODUS,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,2017-09-04,1
5034,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B019FWRG3C,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,2017-09-04,1
4856,5.0,False,"09 21, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Sea Salt'}",Lynne E.,"I love Pre de Provence soaps, but was intensel...",Luxurious French Soap Has Fresh Sea Breeze Fra...,1505952000,7.0,,2017-09-21,1
4849,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Sandalwood'}",Lynne E.,This PRE DE PROVENCE Sandalwood SHEA BUTTER EN...,"Luxurious French Soap With Mild, Masculine Fra...",1506470400,,,2017-09-27,1
4848,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Patchouli'}",Lynne E.,This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENR...,Luxurious French Soap With Musky Masculine Fra...,1506470400,,,2017-09-27,1
4160,4.0,False,"08 2, 2016",ALNFHVS3SC4FV,B001ET7FZE,{'Style Name:': ' Bubble Fruit'},Seas the Day,My kids love using the Colgate pumps. I think ...,Kids Like the Flavor and the Pump,1470096000,,,2016-08-02,1
5056,4.0,False,"08 2, 2016",ALNFHVS3SC4FV,B01BNEYGQU,{'Style Name:': ' Bubble Fruit'},Seas the Day,My kids love using the Colgate pumps. I think ...,Kids Like the Flavor and the Pump,1470096000,,,2016-08-02,1
4908,5.0,False,"08 29, 2017",ALNFHVS3SC4FV,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Starflower'}",Seas the Day,Review for Starflower:\n\nThis is a nice bar o...,"Large Bar, Floral but Nice and Clean Smelling",1503964800,,,2017-08-29,1
2190,5.0,False,"09 10, 2017",ALNFHVS3SC4FV,B0010ZBORW,{'Color:': ' Loofah'},Seas the Day,Review for Loofah:\n\nI have tried loofahs in ...,Nice Loofah with Attached Suction Cup,1505001600,,,2017-09-10,1


In [19]:
sorted_df.shape

(9855, 14)

In [20]:
# Keep the first row of every group sharing reviewer, time, summary and text.
sorted_df = sorted_df.drop_duplicates(
    subset=["reviewerName", "reviewerID", "unixTime", "summary", "reviewText"],
    keep="first",
)

In [21]:
sorted_df.shape

(1874, 14)

In [22]:
sorted_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime,Score
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...,"Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,2017-08-26,1
4187,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B001LNODUS,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,2017-09-04,1
4856,5.0,False,"09 21, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Sea Salt'}",Lynne E.,"I love Pre de Provence soaps, but was intensel...",Luxurious French Soap Has Fresh Sea Breeze Fra...,1505952000,7.0,,2017-09-21,1
4849,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Sandalwood'}",Lynne E.,This PRE DE PROVENCE Sandalwood SHEA BUTTER EN...,"Luxurious French Soap With Mild, Masculine Fra...",1506470400,,,2017-09-27,1
4848,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Patchouli'}",Lynne E.,This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENR...,Luxurious French Soap With Musky Masculine Fra...,1506470400,,,2017-09-27,1


<h2>Display full text</h2>

In [23]:
# Allow up to 100 columns and 100 rows to be rendered before pandas truncates the display.
# NOTE(review): these two options do not control truncation of long cell text
# (that is display.max_colwidth) — confirm this matches the "full text" intent.
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

In [24]:
# First ten review texts as a plain Python list.
# (The former `text=[]` initialiser was overwritten immediately, so it is gone.)
text = sorted_df['reviewText'].head(10).tolist()

In [25]:
# Dump the sampled reviews to text/txt.
# - The file is opened once, not once per review.
# - Mode 'w' replaces the old 'a+' so re-running the cell rewrites the file
#   instead of appending a second copy of the same reviews.
# - A blank line is written after each review so consecutive reviews no
#   longer run into each other.
os.makedirs("text", exist_ok=True)  # the target directory may not exist yet
with open(r"text/txt", 'w') as f:
  for txt in text:
    # Missing reviews are not strings and cannot be passed to f.write().
    if isinstance(txt, str):
      f.write(txt)
      f.write("\n\n")

In [26]:
# Read the saved reviews back and print them.
# Plain 'r' is enough: the file is only read here, so the write access
# requested by 'r+' was unnecessary.
with open(r"text/txt", 'r') as f:
    content = f.read()
print(content)

This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTISANAL FRENCH SOAP BAR (Olive Oil (250 g)) lathers beautifully, and feels luxurious.  It doesn't dry out or irritate my sensitive skin, so I can use it every day.  This is a large 250 gram olive oil bar that gives good value for the money (about $7 on Amazon).Lavender is my favorite soap fragrance, so it's no surprise that I love this PRE DE PROVENCE FRENCH LAVENDER BATH & SHOWER GEL.  For the shower, a pump or two is enough for a nice lather, and the lather washes off easily.

The lavender fragrance is delicate, and it lingers for only a short time.  It doesn't overwhelm perfume, aftershave lotion, shampoo, or other scented products you may like to use.  The clear gel arrives in an attractive square pump dispenser.

The shower gel leaves my skin silky smooth, and it doesn't irritate my sensitive skin.I love Pre de Provence soaps, but was intensely curious about the Sea Salt bar.  What would it smell like?  Why would I want to wash my hands 

<div style="background-color:#202123; color:#ffffff; padding:8px 14px; border-radius:6px;">
  <h2 style="margin:6px 0; color:#ffffff;">
    4 Preprocessing the data
  </h2>
</div>

__Remove URL__

In [27]:
# print() returns None, so `s1 = print(...)` always stored None.
# Keep the first review text in s1, then print it.
s1 = sorted_df['reviewText'].values[0]
print(s1)

This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTISANAL FRENCH SOAP BAR (Olive Oil (250 g)) lathers beautifully, and feels luxurious.  It doesn't dry out or irritate my sensitive skin, so I can use it every day.  This is a large 250 gram olive oil bar that gives good value for the money (about $7 on Amazon).


In [28]:
# Strip every "http..." token (up to the next whitespace) from the review text.
# The vectorised .str.replace leaves missing values untouched, so no
# per-row `is not pd.NA` identity check is needed.
sorted_df['reviewText'] = (
    sorted_df['reviewText']
    .astype('string')
    .str.replace(r"http\S+", "", regex=True)
)

In [29]:
sorted_df['reviewText']

4924    This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...
4187    Lavender is my favorite soap fragrance, so it'...
4856    I love Pre de Provence soaps, but was intensel...
4849    This PRE DE PROVENCE Sandalwood SHEA BUTTER EN...
4848    This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENR...
                              ...                        
3130    Size, colour and print all above average but d...
816     Great product - kids love it and it smells goo...
405     extremely pleased, very pleasant scent, very l...
643                                           My favorite
606               Very good product----I use it every day
Name: reviewText, Length: 1874, dtype: str

<h2>Using BeautifulSoup to extract plain text</h2>

In [30]:
from bs4 import BeautifulSoup

In [31]:
# Parse each review with BeautifulSoup's lxml parser and keep only its text;
# missing values are passed through unchanged.
bs = [
    raw if pd.isna(raw) else BeautifulSoup(raw, 'lxml').get_text()
    for raw in sorted_df['reviewText']
]

In [32]:
# Store the extracted strings in a temporary 'Text' column.
sorted_df['Text'] = bs

In [33]:
# Overwrite reviewText with the contents of the 'Text' column.
sorted_df = sorted_df.assign(reviewText=sorted_df['Text'])

In [34]:
# Remove the 'Text' column from sorted_df.
sorted_df = sorted_df.drop(columns=['Text'])

In [35]:
sorted_df.head(1)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,unixTime,Score
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...,"Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,2017-08-26,1


<h2>Expand contractions</h2>

In [36]:
def expand(phrase):
    """Expand common English contractions in ``phrase``.

    Args:
        phrase: Text that may contain contractions written with a straight
            apostrophe (``'``).

    Returns:
        The text with "won't"/"can't" and the suffixes n't, 're, 's, 'd,
        'll, 't, 've and 'm replaced by their expanded forms.

    Notes:
        * Capitalised "Won't" / "Can't" are expanded to "Will not" /
          "Can not"; previously they fell through to the generic ``n't``
          rule and became "Wo not" / "Ca not".
        * ``'s`` is always rewritten to " is", so possessives are rewritten
          as well.
    """
    # Irregular forms go first. The captured first letter is re-emitted so
    # the original capitalisation survives.
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)

    # Generic suffix rules, applied in this fixed order (n't must run
    # before the bare 't rule).
    suffix_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in suffix_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [43]:
# Run expand() over every non-missing review; missing values are left as they are.
sorted_df["reviewText"] = sorted_df["reviewText"].map(expand, na_action="ignore")

In [46]:
sorted_df["reviewText"].values[1]

'Lavender is my favorite soap fragrance, so it is no surprise that I love this PRE DE PROVENCE FRENCH LAVENDER BATH & SHOWER GEL.  For the shower, a pump or two is enough for a nice lather, and the lather washes off easily.\n\nThe lavender fragrance is delicate, and it lingers for only a short time.  It does not overwhelm perfume, aftershave lotion, shampoo, or other scented products you may like to use.  The clear gel arrives in an attractive square pump dispenser.\n\nThe shower gel leaves my skin silky smooth, and it does not irritate my sensitive skin.'