# Scraping Reviews

In [1]:
#Definitions and imports

from lxml import html  
import requests
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from textblob import TextBlob as tb

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords

set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ronak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ronak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ronak\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [2]:
#Creating an empty dataframe for reviews of Amazon Echo Dot
reviews_df = pd.DataFrame()

In [3]:
#Defining local browser's user agent string to avoid requests being blocked
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

In [4]:
#Creating a list of URL's that point to the first 1000 review pages for Apple Airpods

url_list = []

for i in range(1,1001):
    url_list.append("https://www.amazon.com/Apple-MD827LL-EarPods-Remote-Mic/product-reviews/B0097BEG1C/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber={0}".format(i))


In [5]:
#Total number of URL's
len(url_list)

1000

In [6]:
# Loop to fetch reviews from Amazon

# Looping through all 1000 URL's
for u in url_list:
    amazon_url = u
    
    # Setting header to user agent string
    headers = {'User-Agent': user_agent}
    page = requests.get(amazon_url, headers = headers)
    # Define parser
    parser = html.fromstring(page.content)
    
    # All reviews are located in a div tag with class "review"
    xpath_reviews = '//div[@data-hook="review"]'
    reviews = parser.xpath(xpath_reviews)
    
    # Within the review div, the following 2 items can be located
    
    # The rating is located in an i tag with class "review-star-rating"
    #xpath_rating  = './/i[@data-hook="review-star-rating"]//text()' 
    
    # The body text of the review is located in a span tag with class "review-body"
    xpath_body    = './/span[@data-hook="review-body"]//text()'
    
    # Looping through each outer div tag and appending results to the dataframe
    for review in reviews:
        body    = review.xpath(xpath_body)

        review_dict = {'body' : body}
        
        reviews_df = reviews_df.append(review_dict, ignore_index=True)

In [7]:
#Dropping any null cells
reviews_df.replace('', np.nan, inplace=True)
reviews_df.dropna(inplace=True,axis = 0)

reviews_df.head()

Unnamed: 0,body
0,[Both daughters say their headphones still wor...
1,"[False advertisement, not an apple product. T..."
2,"[I had apple earbuds to compare, and there was..."
3,[I don’t recommend buying these. Literally it ...
4,[they stop working after a month of use]


In [8]:
#Verifying review count
reviews_df.count()

body    10000
dtype: int64

In [9]:
#Each row is currently a list object due to some reviews having responses and thus creating a list of review body texts
#We go around this by considering the main review and discarding the rest and the exporting to a csv
import re
reviews_df_text = pd.DataFrame()

for i in reviews_df['body']:
    if i:
        text = i[0].strip()
        
        #Many reviews have emojis and special characters. This section defines hex code ranges and removes them.
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F926"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U0001FE0F-\U0001Fe0F"  # hearts
            u"\U00002764-\U0001Fe0F"  # hearts
            u"\U00002753"             # question marks
                               "]+", flags=re.UNICODE)
        text2 = emoji_pattern.sub(r'', text) # no emoji
            
        text_dict = {'body' : text2}
        
        reviews_df_text = reviews_df_text.append(text_dict, ignore_index=True)

In [13]:
#Exporting to csv
reviews_df_text.to_csv("Final_Amazon_Reviews.csv",encoding='utf8',index=False)

# Classifying sentiments for the reviews

In [14]:
#Extracting data from csv since scraping changes dataset on every run.

review_data = pd.read_csv("Final_Amazon_Reviews.csv")

In [15]:
review_data.head()

Unnamed: 0,body
0,Both daughters say their headphones still work...
1,"False advertisement, not an apple product. Th..."
2,"I had apple earbuds to compare, and there was ..."
3,I don’t recommend buying these. Literally it h...
4,they stop working after a month of use


In [16]:
#Extracting one review as an example
blob_ex = review_data['body'][2].strip()
blob_ex

'I had apple earbuds to compare, and there was a noticable sound difference. These did not produce as clear a sound, and had problems dealing with bass. Weather or not these are knock-offs I don’t know. They could just be older. Either way, disappointed with the sound quality.'

In [17]:
#Creating a text blob object from the review
test_blob = tb(blob_ex)

In [18]:
#Listing the words in the review
test_blob.words

WordList(['I', 'had', 'apple', 'earbuds', 'to', 'compare', 'and', 'there', 'was', 'a', 'noticable', 'sound', 'difference', 'These', 'did', 'not', 'produce', 'as', 'clear', 'a', 'sound', 'and', 'had', 'problems', 'dealing', 'with', 'bass', 'Weather', 'or', 'not', 'these', 'are', 'knock-offs', 'I', 'don', '’', 't', 'know', 'They', 'could', 'just', 'be', 'older', 'Either', 'way', 'disappointed', 'with', 'the', 'sound', 'quality'])

In [19]:
#Listing tags for each of the words in the review
#For example, 'quality' is marked as a noun
test_blob.tags

[('I', 'PRP'),
 ('had', 'VBD'),
 ('apple', 'NN'),
 ('earbuds', 'NN'),
 ('to', 'TO'),
 ('compare', 'VB'),
 ('and', 'CC'),
 ('there', 'RB'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('noticable', 'JJ'),
 ('sound', 'NN'),
 ('difference', 'NN'),
 ('These', 'DT'),
 ('did', 'VBD'),
 ('not', 'RB'),
 ('produce', 'VB'),
 ('as', 'IN'),
 ('clear', 'JJ'),
 ('a', 'DT'),
 ('sound', 'NN'),
 ('and', 'CC'),
 ('had', 'VBD'),
 ('problems', 'NNS'),
 ('dealing', 'VBG'),
 ('with', 'IN'),
 ('bass', 'NN'),
 ('Weather', 'PRP$'),
 ('or', 'CC'),
 ('not', 'RB'),
 ('these', 'DT'),
 ('are', 'VBP'),
 ('knock-offs', 'NNS'),
 ('I', 'PRP'),
 ('don', 'VBP'),
 ('’', 'JJ'),
 ('t', 'NN'),
 ('know', 'VBP'),
 ('They', 'PRP'),
 ('could', 'MD'),
 ('just', 'RB'),
 ('be', 'VB'),
 ('older', 'JJR'),
 ('Either', 'DT'),
 ('way', 'NN'),
 ('disappointed', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('sound', 'NN'),
 ('quality', 'NN')]

In [20]:
#Extracting the sentiment for the review. This returns 2 values
# 1. Polarity: A range from -1 to 1 indicating sentiment where -1 is negative, 0 is neutral and 1 is positive
# 2. Subjectivity: This returns a float between 0 and 1 where 0 is very objective and 1 is very subjective
test_blob.sentiment

Sentiment(polarity=0.08095238095238096, subjectivity=0.4523809523809524)

In [21]:
#Extracting the sentiment polarity into a variable
sent_val = test_blob.sentiment.polarity

In [22]:
#Verify sentiment value
sent_val

0.08095238095238096

### For the purposes of the analysis here, I have considered neutral values  top be positive and defined the following rule. 

##### if sent_val >= 0:
#####    print("Positive")
##### else:
#####    print("Negative")

In [23]:
#Creating an empty dataframe for the review text and its respective sentiment
sent_df = pd.DataFrame()

for i in review_data['body']:
    #Extract review text
    text = i.strip()
    
    #Creating text blob
    body_text = tb(text)
    
    #Extract sentiment polarity
    sent_pol = body_text.sentiment.polarity
    
    if sent_pol >= 0:
        sent_val = 1
    else:
        sent_val = 0
        
    comb_dict = {'review': text,
                   'sentiment': sent_val}
    #print(sent_val)
    
    sent_df = sent_df.append(comb_dict, ignore_index=True)

In [24]:
#A combined dataframe with the sentiment values for each review is created
sent_df.head()

Unnamed: 0,review,sentiment
0,Both daughters say their headphones still work...,1.0
1,"False advertisement, not an apple product. Th...",0.0
2,"I had apple earbuds to compare, and there was ...",1.0
3,I don’t recommend buying these. Literally it h...,1.0
4,they stop working after a month of use,1.0


In [25]:
#Exporting to csv to keep record
sent_df.to_csv(header=True,path_or_buf="sentiment_one.csv")

# Naive Bayesian Classifier

In [26]:
#Defining the stopset of english words
#This defines a list of words that are inconsequential to sentiment analysis and can be removed from the data
stopset = set(stopwords.words('english'))

#Defining a TfidVectorizer to convert the raw text into a sparse matrix
#This matrix essentially holds a frequency for each word and associates the words with either a positive or negative sentiment
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [27]:
#Defining the predictor variables, i.e. the sentiment
y = sent_df.sentiment

In [28]:
#Vectorize the reviews from the dataframe
x = vectorizer.fit_transform(sent_df.review)

In [29]:
#This is the number of records 
print(y.shape)

(9990,)


In [30]:
# This returns 2 values.
# 1. Number of input records
# 2. Number of unique words in the dataset
print(x.shape)

(9990, 6216)


In [31]:
# Using sklearn to perform a train test split
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

In [32]:
#Defining a Naice Bayes classifier and fitting it to our train data
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
# Using roc_auc_score to determin the accuracy of the classifier
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.89980625927258018

### Creating sample sentences to test the model

In [34]:
#Define and vectorise a positive sentence
sample_array_pos = np.array(["I enjoyed this product"])
sample_vector_pos = vectorizer.transform(sample_array_pos)

#Define and vectorize a negatie sentence
sample_array_neg = np.array(["I hate it. It's useless and can't do anything"])
sample_vector_neg = vectorizer.transform(sample_array_neg)


In [35]:
#This means that we are analysing this one sample based on evidence collected for over 7000 words.
sample_vector_pos.shape

(1, 6216)

In [36]:
#Predict sentiment for the positive vector
sentiment = clf.predict(sample_vector_pos)

if sentiment == 1:
    print("Positive")
else:
    print("Negative")

Positive


In [37]:
#Predict sentiment for the negative vector
sentiment = clf.predict(sample_vector_neg)

if sentiment == 1:
    print("Positive")
else:
    print("Negative")

Negative


# Conclusions:

1. The dataset constantly fluctuates in sentiment based on what product reviews were scraped and when they were scraped.
2. Most reviews on amazon seem to be positive. This makes our model biased and results in a few misclassifications.
3. For the naive bayes classifier, the accuracy varies between 85% to 98% depending on the nature of the input data.
4. In general, adding more data tends to increase the accuracy since the model is exposed to a larger amount of words.