## Problem Statement No. 16
Consider the Amazon Alexa Reviews Dataset. This dataset consists of a nearly 3000 Amazon customer reviews (input
text), star ratings, date of review, variant and feedback of various amazon Alexa products like Alexa Echo, Echo dots,
Alexa Firesticks etc. Perform following operations on this dataset.
1) Remove all punctuations from review text.
2) Tokenize the review text into words.
3) Remove the Stopwords from the tokenized text.
4) Perform stemming & lemmatization on the review text.
5) Perform the word vectorization on review text using Bag of Words technique.
6) Create representation of Review Text by calculating Term Frequency and Inverse Document Frequency (TF-IDF)

In [29]:
import pandas as pd
import numpy as np
import nltk
import string

In [30]:
df=pd.read_csv(r"D:\OneDrive\Desktop\Desktop\New folder\Datasets\Alexa-Dataset.csv")
df.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [31]:
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [32]:
df.dropna(inplace=True)

In [33]:
df["verified_reviews"]=df["verified_reviews"].str.lower()

In [34]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,love my echo!,1
1,5,31-Jul-18,Charcoal Fabric,loved it!,1
2,4,31-Jul-18,Walnut Finish,"sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,i have had a lot of fun with this thing. my 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,music,1


In [36]:
def rem_pun(s):
    res=""
    for i in s:
        if i not in string.punctuation:
            res+=i
    return res
df["punctuation"]=df["verified_reviews"].apply(rem_pun)
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,punctuation
0,5,31-Jul-18,Charcoal Fabric,love my echo!,1,love my echo
1,5,31-Jul-18,Charcoal Fabric,loved it!,1,loved it
2,4,31-Jul-18,Walnut Finish,"sometimes while playing a game, you can answer...",1,sometimes while playing a game you can answer ...
3,5,31-Jul-18,Charcoal Fabric,i have had a lot of fun with this thing. my 4 ...,1,i have had a lot of fun with this thing my 4 y...
4,5,31-Jul-18,Charcoal Fabric,music,1,music


In [37]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize
df["token"]=df["punctuation"].apply(word_tokenize)
df.head()

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,rating,date,variation,verified_reviews,feedback,punctuation,token
0,5,31-Jul-18,Charcoal Fabric,love my echo!,1,love my echo,"[love, my, echo]"
1,5,31-Jul-18,Charcoal Fabric,loved it!,1,loved it,"[loved, it]"
2,4,31-Jul-18,Walnut Finish,"sometimes while playing a game, you can answer...",1,sometimes while playing a game you can answer ...,"[sometimes, while, playing, a, game, you, can,..."
3,5,31-Jul-18,Charcoal Fabric,i have had a lot of fun with this thing. my 4 ...,1,i have had a lot of fun with this thing my 4 y...,"[i, have, had, a, lot, of, fun, with, this, th..."
4,5,31-Jul-18,Charcoal Fabric,music,1,music,[music]


In [39]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
df["Stopwords"]=df["token"].apply(lambda x : [word for word in x if word not in stop_words])
df.head()

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,rating,date,variation,verified_reviews,feedback,punctuation,token,Stopwords
0,5,31-Jul-18,Charcoal Fabric,love my echo!,1,love my echo,"[love, my, echo]","[love, echo]"
1,5,31-Jul-18,Charcoal Fabric,loved it!,1,loved it,"[loved, it]",[loved]
2,4,31-Jul-18,Walnut Finish,"sometimes while playing a game, you can answer...",1,sometimes while playing a game you can answer ...,"[sometimes, while, playing, a, game, you, can,...","[sometimes, playing, game, answer, question, c..."
3,5,31-Jul-18,Charcoal Fabric,i have had a lot of fun with this thing. my 4 ...,1,i have had a lot of fun with this thing my 4 y...,"[i, have, had, a, lot, of, fun, with, this, th...","[lot, fun, thing, 4, yr, old, learns, dinosaur..."
4,5,31-Jul-18,Charcoal Fabric,music,1,music,[music],[music]


In [42]:
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer,WordNetLemmatizer
stemmer=PorterStemmer()
lemm=WordNetLemmatizer()
df['stemmed']=df["Stopwords"].apply(lambda x : [stemmer.stem(word) for word in x])
df['lemmatizer']=df["Stopwords"].apply(lambda x : [lemm.lemmatize(word) for word in x])
df.head(10)

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,rating,date,variation,verified_reviews,feedback,punctuation,token,Stopwords,stemmed,lemmatizer
0,5,31-Jul-18,Charcoal Fabric,love my echo!,1,love my echo,"[love, my, echo]","[love, echo]","[love, echo]","[love, echo]"
1,5,31-Jul-18,Charcoal Fabric,loved it!,1,loved it,"[loved, it]",[loved],[love],[loved]
2,4,31-Jul-18,Walnut Finish,"sometimes while playing a game, you can answer...",1,sometimes while playing a game you can answer ...,"[sometimes, while, playing, a, game, you, can,...","[sometimes, playing, game, answer, question, c...","[sometim, play, game, answer, question, correc...","[sometimes, playing, game, answer, question, c..."
3,5,31-Jul-18,Charcoal Fabric,i have had a lot of fun with this thing. my 4 ...,1,i have had a lot of fun with this thing my 4 y...,"[i, have, had, a, lot, of, fun, with, this, th...","[lot, fun, thing, 4, yr, old, learns, dinosaur...","[lot, fun, thing, 4, yr, old, learn, dinosaur,...","[lot, fun, thing, 4, yr, old, learns, dinosaur..."
4,5,31-Jul-18,Charcoal Fabric,music,1,music,[music],[music],[music],[music]
5,5,31-Jul-18,Heather Gray Fabric,i received the echo as a gift. i needed anothe...,1,i received the echo as a gift i needed another...,"[i, received, the, echo, as, a, gift, i, neede...","[received, echo, gift, needed, another, blueto...","[receiv, echo, gift, need, anoth, bluetooth, s...","[received, echo, gift, needed, another, blueto..."
6,3,31-Jul-18,Sandstone Fabric,"without having a cellphone, i cannot use many ...",1,without having a cellphone i cannot use many o...,"[without, having, a, cellphone, i, can, not, u...","[without, cellphone, use, many, features, ipad...","[without, cellphon, use, mani, featur, ipad, s...","[without, cellphone, use, many, feature, ipad,..."
7,5,31-Jul-18,Charcoal Fabric,i think this is the 5th one i've purchased. i'...,1,i think this is the 5th one ive purchased im w...,"[i, think, this, is, the, 5th, one, ive, purch...","[think, 5th, one, ive, purchased, im, working,...","[think, 5th, one, ive, purchas, im, work, get,...","[think, 5th, one, ive, purchased, im, working,..."
8,5,30-Jul-18,Heather Gray Fabric,looks great,1,looks great,"[looks, great]","[looks, great]","[look, great]","[look, great]"
9,5,30-Jul-18,Heather Gray Fabric,love it! i’ve listened to songs i haven’t hear...,1,love it i’ve listened to songs i haven’t heard...,"[love, it, i, ’, ve, listened, to, songs, i, h...","[love, ’, listened, songs, ’, heard, since, ch...","[love, ’, listen, song, ’, heard, sinc, childh...","[love, ’, listened, song, ’, heard, since, chi..."


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()
x_bow=vectorizer.fit_transform(df["punctuation"])
print(x_bow)

  (0, 2394)	1
  (0, 2582)	1
  (0, 1313)	1
  (1, 2395)	1
  (1, 2152)	1
  (2, 2152)	1
  (2, 3632)	1
  (2, 4374)	2
  (2, 2933)	1
  (2, 1687)	1
  (2, 4462)	3
  (2, 699)	1
  (2, 342)	1
  (2, 3107)	1
  (2, 980)	1
  (2, 664)	1
  (2, 271)	1
  (2, 3388)	1
  (2, 1756)	1
  (2, 4442)	1
  (2, 336)	2
  (2, 345)	1
  (2, 3953)	1
  (2, 3372)	1
  (2, 407)	1
  :	:
  (3147, 3684)	1
  (3147, 3102)	2
  (3147, 3989)	1
  (3147, 1739)	1
  (3147, 679)	1
  (3147, 1256)	1
  (3147, 3626)	1
  (3147, 1649)	1
  (3147, 1655)	1
  (3147, 1263)	2
  (3147, 1937)	1
  (3147, 3875)	1
  (3147, 2218)	1
  (3147, 2143)	1
  (3147, 2552)	1
  (3147, 1977)	1
  (3147, 443)	1
  (3147, 2261)	1
  (3147, 4470)	1
  (3147, 3770)	1
  (3147, 885)	1
  (3147, 2509)	1
  (3147, 1936)	1
  (3147, 608)	1
  (3148, 1747)	1


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
x_tfidf=tfidf.fit_transform(df["punctuation"])
print(x_tfidf)

  (0, 1313)	0.6349778976652679
  (0, 2582)	0.5372018732619929
  (0, 2394)	0.5551731413175551
  (1, 2152)	0.31505564472914555
  (1, 2395)	0.9490732009304144
  (2, 1927)	0.1344896534953018
  (2, 1650)	0.13296932969761496
  (2, 463)	0.19777758123963496
  (2, 2703)	0.15142026501281125
  (2, 2721)	0.10212708014901867
  (2, 2313)	0.15997877153013768
  (2, 4109)	0.16382251895611621
  (2, 4030)	0.06410544991219809
  (2, 144)	0.15337060052353965
  (2, 534)	0.16473762091772856
  (2, 2317)	0.10369364541667575
  (2, 407)	0.11137273610038191
  (2, 3372)	0.18025585698246943
  (2, 3953)	0.06301913540649552
  (2, 345)	0.21069754004600616
  (2, 336)	0.13393151076857324
  (2, 4442)	0.21845665807837658
  (2, 1756)	0.14534503009427166
  (2, 3388)	0.20157464456688556
  (2, 271)	0.10189806592284767
  :	:
  (3147, 2429)	0.12207517615635116
  (3147, 1981)	0.09356351745740586
  (3147, 1778)	0.06367603488343027
  (3147, 2138)	0.05835347578839601
  (3147, 3948)	0.21664821485211497
  (3147, 4208)	0.07516045748129