In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import helper_functions
from helper_functions import split_into_sentences, unstack_listcol, count_alpha
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# deprecated the old names, which newer releases no longer accept. Prefer the
# new name when this Matplotlib ships it; otherwise fall back to the old one.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Use a serif font family for all figure text.
mpl.rcParams['font.family'] = 'serif'

### reading in the review data from feather

In [2]:
PATH = "../../../Dropbox/RA/stefano_coffeemachines/amazon_reviews/"
# let's take 100 reviews
N_REVIEWS = 100
cols = ['review_id', 'reviewText', 'overall']

# .copy() turns the positional slice into an independent frame, so adding the
# id column below does not raise SettingWithCopyWarning.
df = pd.read_feather(f'{PATH}reviews_Home_and_Kitchen_5-raw').iloc[:N_REVIEWS].copy()

# 1-based running id for each review
df['review_id'] = np.arange(df.shape[0]) + 1

# Keep only the columns used downstream; copy again so that later cells can
# add columns to `df` without SettingWithCopyWarning.
df = df[cols].copy()
print(df.head())

   review_id                                         reviewText  overall
0          1  My daughter wanted this book and the price on ...      5.0
1          2  I bought this zoku quick pop for my daughterr ...      5.0
2          3  There is no shortage of pop recipes available ...      4.0
3          4  This book is a must have if you get a Zoku (wh...      5.0
4          5  This cookbook is great.  I have really enjoyed...      4.0


## 1) Sentiment analysis on the full reviews

In [3]:
analyzer = SentimentIntensityAnalyzer()

# Score every review once; each element is the dict returned by
# analyzer.polarity_scores for that review's text.
review_scores = df['reviewText'].apply(analyzer.polarity_scores)

# Spread the dict entries into one column each (new column name -> dict key).
# `key=key` binds the current key to the lambda on each iteration.
for column, key in [('pos', 'pos'), ('neu', 'neu'), ('neg', 'neg'), ('comp', 'compound')]:
    df[column] = review_scores.map(lambda scores, key=key: scores[key])

print(df.head())

   review_id                                         reviewText  overall  \
0          1  My daughter wanted this book and the price on ...      5.0   
1          2  I bought this zoku quick pop for my daughterr ...      5.0   
2          3  There is no shortage of pop recipes available ...      4.0   
3          4  This book is a must have if you get a Zoku (wh...      5.0   
4          5  This cookbook is great.  I have really enjoyed...      4.0   

     pos    neu    neg    comp  
0  0.270  0.730  0.000  0.8625  
1  0.233  0.767  0.000  0.7906  
2  0.162  0.782  0.056  0.9949  
3  0.178  0.822  0.000  0.9022  
4  0.249  0.712  0.039  0.9750  


## 2) Sentiment analysis on sentence-level data

In [4]:
# Drop the review-level scores before moving to sentence level.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=['pos', 'neg', 'neu', 'comp'], errors='ignore')

# splitting the text column into sentences
# (split_into_sentences comes from helper_functions; presumably it returns a
# list of sentence strings per review -- confirm against the helper)
df['sentence'] = df['reviewText'].apply(split_into_sentences)
# unstacking the list column
# (presumably one output row per sentence -- confirm against the helper)
a = unstack_listcol(df, 'sentence')

# removing non-ascii characters
# Assign the result back instead of calling replace(..., inplace=True) on
# `a.sentence`: that chained in-place form does not update `a` under pandas
# Copy-on-Write.
a['sentence'] = a['sentence'].replace({r'[^\x00-\x7F]+': ''}, regex=True)

# keeping only sentences with at least one alphabetic character
has_alpha = a['sentence'].apply(count_alpha) >= 1
# keeping only sentences with length >= 2
long_enough = a['sentence'].str.len() >= 2

# Filter with boolean masks (no temporary columns on a filtered slice, so no
# SettingWithCopyWarning) and drop the full review text.
a = a.loc[has_alpha & long_enough].drop(columns=['reviewText'])

# adding a sentence id column: renumber the rows from 0, then insert a 1-based
# id just before the last column
a = a.reset_index(drop=True)
a.insert(len(a.columns) - 1, 'sentence_id', a.index + 1)

In [5]:
analyzer = SentimentIntensityAnalyzer()

# Score every sentence once; each element is the dict returned by
# analyzer.polarity_scores for that sentence.
sentence_scores = a['sentence'].apply(analyzer.polarity_scores)

# Spread the dict entries into one column each (new column name -> dict key).
# `key=key` binds the current key to the lambda on each iteration.
for column, key in [('pos', 'pos'), ('neu', 'neu'), ('neg', 'neg'), ('comp', 'compound')]:
    a[column] = sentence_scores.map(lambda scores, key=key: scores[key])

print(a.head())
print(a.shape)

   review_id  overall  sentence_id  \
0          1      5.0            1   
1          1      5.0            2   
2          1      5.0            3   
3          2      5.0            4   
4          2      5.0            5   

                                            sentence    pos    neu  neg  \
0  My daughter wanted this book and the price on ...  0.349  0.651  0.0   
1  She has already tried one recipe a day after r...  0.000  1.000  0.0   
2                           She seems happy with it.  0.481  0.519  0.0   
3  I bought this zoku quick pop for my daughterr ...  0.000  1.000  0.0   
4  She loves it and have fun to make her own ice ...  0.412  0.588  0.0   

     comp  
0  0.7096  
1  0.0000  
2  0.5719  
3  0.0000  
4  0.7906  
(417, 8)


In [6]:
# Average the sentence-level scores within each review. The result is indexed
# by review_id and keeps only the four score columns, in this order.
a = a.groupby('review_id')[['pos', 'neu', 'neg', 'comp']].mean()
print(a.head())
print(a.shape)

                pos       neu       neg      comp
review_id                                        
1          0.276667  0.723333  0.000000  0.427167
2          0.206000  0.794000  0.000000  0.395300
3          0.147000  0.808923  0.044115  0.217535
4          0.165000  0.835000  0.000000  0.337680
5          0.341000  0.632125  0.026875  0.424013
(100, 4)
