In [1]:
# import Python packages
import pandas as pd
import numpy as np
#from google.colab import files
import matplotlib.pyplot as plt

# regression package
import statsmodels.api as sm

# sentiment analysis packages
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from textblob import TextBlob

# topic modeling packages
import gensim
from gensim import corpora

[nltk_data] Downloading package punkt to /Users/Stefanie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Stefanie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Stefanie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
reviews = pd.read_csv('product_reviews.csv') 
reviews.head(2)

Unnamed: 0,Review_ID,Item_ID,Base_item_ID,Review_date,Reviewer_ID,Real_name,Verified_purchase,Rating,Title,Content,...,Helpful_votes_week15,Helpful_votes_week16,Helpful_votes_week17,Helpful_votes_week18,Helpful_votes_week19,Helpful_votes_week20,Helpful_votes_week21,Helpful_votes_week22,Helpful_votes_week23,Helpful_votes_week24
0,R100E6MT94PK6L,B0051VVOB2,,1/8/2012,A1HGATCAMGXTGF,False,True,5,Love My Kindle Fire!,I love my fire and highly recommend it to anyo...,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5
1,R100HU42LKLLD0,B0057O9O6K,,4/10/2012,A3GGO95QT2PP47,False,True,2,Not the best Tablet or a good buy,The operating system is is an early android. Y...,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [10]:
reviews_sample = reviews[['Rating','Content']]
reviews_sample.head(2)

Unnamed: 0,Rating,Content
0,5,I love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...


In [11]:
#clean data
# remove punctuation and numbers, lower case the text
def clean(text):
    # replace any non-letters with a space
    text = re.sub('[^A-Za-z]+', ' ', text)
    # lower case the text
    text = text.lower()
    return text

# apply the function clean to each review
reviews_sample['Cleaned Reviews'] = reviews_sample['Content'].apply(clean)
reviews_sample.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample['Cleaned Reviews'] = reviews_sample['Content'].apply(clean)


Unnamed: 0,Rating,Content,Cleaned Reviews
0,5,I love my fire and highly recommend it to anyo...,i love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...


In [12]:
#filter for kindle reviews
reviews_sample = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('kindle')]

In [15]:
revews_sample_2 = reviews_sample

In [26]:
reviews_sample.shape

(16381, 4)

In [14]:
#remove stop words and stem 
ps = PorterStemmer() 

def token_stop_stem(text):
    tokens = word_tokenize(text) #tokenize the text
    newlist = []
    new_review = ""
    for word in tokens:
        if word not in set(stopwords.words('english')): # remove stop words
          word_stem = ps.stem(word) # stem each word
          newlist.append(word_stem)
          new_review = new_review + " " + word_stem
    return new_review

reviews_sample['Final Reviews'] = reviews_sample['Cleaned Reviews'].apply(token_stop_stem)
reviews_sample.head(2)

Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,oper system earli android cant use download k...
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,read kindl sinc kindl releas pretti heavili i...


In [36]:
def getPolarityScore(review):
    return TextBlob(review).sentiment.polarity

# function to analyze the reviews
def getPolarity(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

## Checking sentiment polarities for Kindle attributes
 ## screen, customer service, weight
 


In [35]:
#Kindle reviews that contain the word "screen"
reviews_sample_screen = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('screen')]

In [40]:
reviews_sample_screen['Score'] = reviews_sample_screen['Final Reviews'].apply(getPolarityScore) 
reviews_sample_screen['Polarity'] = reviews_sample_screen['Score'].apply(getPolarity)
reviews_sample_screen.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_screen['Score'] = reviews_sample_screen['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_screen['Polarity'] = reviews_sample_screen['Score'].apply(getPolarity)


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
9,4,"This is a nice little mini computer, but I am ...",this is a nice little mini computer but i am a...,nice littl mini comput littl disappoint bough...,0.166667,Positive
22,4,"Kindle Fire, Full Color 7"" Multi-touch Display...",kindle fire full color multi touch display wi ...,kindl fire full color multi touch display wi ...,0.444444,Positive


In [39]:
reviews_sample_screen['Polarity'].value_counts()

Positive    5417
Negative     343
Neutral       46
Name: Polarity, dtype: int64

In [28]:
reviews_sample_custserv = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('customer service')]

In [41]:
reviews_sample_custserv['Score'] = reviews_sample_custserv['Final Reviews'].apply(getPolarityScore) 
reviews_sample_custserv['Polarity'] = reviews_sample_custserv['Score'].apply(getPolarity)
reviews_sample_custserv.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_custserv['Score'] = reviews_sample_custserv['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_custserv['Polarity'] = reviews_sample_custserv['Score'].apply(getPolarity)


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
242,4,I pre-ordered a Kindle Fire even before the pr...,i pre ordered a kindle fire even before the pr...,pre order kindl fire even product st appear m...,0.023512,Positive
328,5,My first two kindles (pre-fire) both died spon...,my first two kindles pre fire both died sponta...,first two kindl pre fire die spontan without ...,0.05,Positive


In [42]:
reviews_sample_custserv['Polarity'].value_counts()

Positive    526
Negative     69
Neutral       8
Name: Polarity, dtype: int64

In [30]:
reviews_sample_weight = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('weight')]

In [43]:
reviews_sample_weight['Score'] = reviews_sample_weight['Final Reviews'].apply(getPolarityScore) 
reviews_sample_weight['Polarity'] = reviews_sample_weight['Score'].apply(getPolarity)
reviews_sample_weight.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_weight['Score'] = reviews_sample_weight['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_weight['Polarity'] = reviews_sample_weight['Score'].apply(getPolarity)


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
30,5,The Kindle Fire does everything it promised it...,the kindle fire does everything it promised it...,kindl fire everyth promis would opinion much ...,0.191304,Positive
60,5,Everything the average person could want in a ...,everything the average person could want in a ...,everyth averag person could want tablet devic...,0.316667,Positive


In [44]:
reviews_sample_weight['Polarity'].value_counts()

Positive    893
Negative     35
Neutral       3
Name: Polarity, dtype: int64

In [32]:
reviews_sample_price = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('price')]

In [45]:
reviews_sample_price['Score'] = reviews_sample_price['Final Reviews'].apply(getPolarityScore) 
reviews_sample_price['Polarity'] = reviews_sample_price['Score'].apply(getPolarity)
reviews_sample_price.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_price['Score'] = reviews_sample_price['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample_price['Polarity'] = reviews_sample_price['Score'].apply(getPolarity)


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
22,4,"Kindle Fire, Full Color 7"" Multi-touch Display...",kindle fire full color multi touch display wi ...,kindl fire full color multi touch display wi ...,0.444444,Positive
27,5,"I bought the Fire for my wife for Christmas, k...",i bought the fire for my wife for christmas kn...,bought fire wife christma know match ipad use...,0.25,Positive


In [46]:
reviews_sample_price['Polarity'].value_counts()

Positive    3828
Negative     143
Neutral       27
Name: Polarity, dtype: int64

In [47]:
def token_stop(text):
    tokens = word_tokenize(text) # tokenize the text
    newlist = []
    for word in tokens:
        if word not in set(stopwords.words('english')): # remove stop words
          newlist.append(word)
    return newlist

reviews_sample['LDA Reviews'] = reviews_sample['Cleaned Reviews'].apply(token_stop)
reviews_sample.head(2)

Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,LDA Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,oper system earli android cant use download k...,"[operating, system, early, android, cant, use,..."
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,read kindl sinc kindl releas pretti heavili i...,"[reading, kindle, since, kindle, released, pre..."


In [49]:
dict_ = corpora.Dictionary(reviews_sample['LDA Reviews'])
print(dict_)

Dictionary<28518 unique tokens: ['android', 'barns', 'cant', 'com', 'download']...>


In [50]:
# convert list of documents (corpus) into Document Term Matrix using the dictionary 
doc_term_matrix = [dict_.doc2bow(i) for i in reviews_sample['LDA Reviews']]

## LDA 

In [51]:
topic_3 = 3
topic_5 = 5
topic_7 = 7
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

### 3 topics

In [52]:
lda_model_3 = lda(doc_term_matrix,
    num_topics=topic_3,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)

In [53]:
lda_model_3.print_topics(num_words=10)

[(0,
  '0.040*"kindle" + 0.039*"fire" + 0.014*"books" + 0.012*"love" + 0.012*"great" + 0.012*"ipad" + 0.011*"read" + 0.011*"reading" + 0.010*"use" + 0.010*"like"'),
 (1,
  '0.038*"kindle" + 0.027*"fire" + 0.017*"amazon" + 0.012*"one" + 0.010*"would" + 0.009*"get" + 0.006*"christmas" + 0.006*"got" + 0.006*"bought" + 0.005*"time"'),
 (2,
  '0.013*"tablet" + 0.010*"device" + 0.010*"android" + 0.009*"app" + 0.009*"apps" + 0.009*"screen" + 0.008*"amazon" + 0.007*"ipad" + 0.007*"like" + 0.006*"use"')]

In [57]:
lda_model_3.log_perplexity(doc_term_matrix)

-7.4003680145915345

### 5 topics

In [58]:
lda_model_5 = lda(doc_term_matrix,
    num_topics=topic_5,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)

In [59]:
lda_model_5.print_topics(num_words=10)

[(0,
  '0.026*"screen" + 0.015*"battery" + 0.012*"use" + 0.011*"reading" + 0.010*"touch" + 0.010*"life" + 0.009*"kindle" + 0.009*"g" + 0.009*"wifi" + 0.008*"e"'),
 (1,
  '0.033*"fire" + 0.031*"kindle" + 0.020*"amazon" + 0.012*"books" + 0.010*"device" + 0.010*"like" + 0.009*"one" + 0.009*"would" + 0.008*"read" + 0.007*"book"'),
 (2,
  '0.017*"tablet" + 0.013*"android" + 0.012*"ipad" + 0.012*"apps" + 0.011*"app" + 0.011*"device" + 0.008*"screen" + 0.006*"like" + 0.006*"good" + 0.006*"use"'),
 (3,
  '0.033*"kindle" + 0.018*"fire" + 0.015*"amazon" + 0.012*"one" + 0.011*"get" + 0.010*"would" + 0.008*"problem" + 0.008*"back" + 0.007*"service" + 0.007*"new"'),
 (4,
  '0.057*"kindle" + 0.048*"fire" + 0.026*"love" + 0.021*"great" + 0.018*"ipad" + 0.013*"use" + 0.012*"books" + 0.011*"easy" + 0.009*"read" + 0.009*"would"')]

In [60]:
lda_model_5.log_perplexity(doc_term_matrix)

-7.42172094844327

### 7 topics

In [61]:
lda_model_7 = lda(doc_term_matrix,
    num_topics=topic_7,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)

In [62]:
lda_model_7.print_topics(num_words=10)

[(0,
  '0.046*"tablet" + 0.024*"android" + 0.014*"apps" + 0.013*"app" + 0.012*"screen" + 0.011*"market" + 0.010*"good" + 0.010*"great" + 0.010*"battery" + 0.008*"also"'),
 (1,
  '0.040*"kindle" + 0.027*"fire" + 0.020*"amazon" + 0.017*"one" + 0.011*"get" + 0.010*"would" + 0.009*"buy" + 0.008*"books" + 0.008*"bought" + 0.006*"got"'),
 (2,
  '0.012*"usb" + 0.011*"use" + 0.009*"computer" + 0.008*"files" + 0.008*"laptop" + 0.007*"keyboard" + 0.007*"need" + 0.007*"pc" + 0.007*"app" + 0.007*"file"'),
 (3,
  '0.028*"kindle" + 0.016*"fire" + 0.015*"service" + 0.014*"amazon" + 0.014*"problem" + 0.013*"customer" + 0.010*"support" + 0.010*"back" + 0.010*"would" + 0.010*"device"'),
 (4,
  '0.061*"kindle" + 0.051*"fire" + 0.026*"love" + 0.021*"great" + 0.020*"books" + 0.015*"use" + 0.014*"read" + 0.013*"easy" + 0.010*"ipad" + 0.010*"reading"'),
 (5,
  '0.020*"ipad" + 0.018*"device" + 0.018*"amazon" + 0.013*"fire" + 0.011*"apps" + 0.009*"app" + 0.008*"kindle" + 0.008*"tablet" + 0.007*"like" + 0.007*"

In [63]:
lda_model_5.log_perplexity(doc_term_matrix)

-7.421720808702167