In [2]:
import numpy as np
import pandas as pd

import boto3
import json

import nltk
from nltk import data
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from vaderSentiment import SentimentIntensityAnalyzer

In [5]:
# Show full cell contents so long review texts are not truncated.
# `None` is the supported "no limit" value; the legacy `-1` was deprecated in
# pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Fetch the NLTK resources used below (sentence tokenizer, stop-word list,
# WordNet lemmatizer data). `quiet=True` keeps the download log out of the
# notebook; the trailing `;` hides the boolean return value of the last call.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True);

In [11]:
# Shared, notebook-wide resources used by the cells below.

# Pre-trained English Punkt model used to split a review into sentences.
sentence_tokenizer = data.load('tokenizers/punkt/english.pickle')

# NLTK's English stop-word list, consulted by `preprocess`.
english_stop_words = stopwords.words('english')

# Amazon Comprehend client; credentials are resolved by boto3's default chain.
comprehend = boto3.client(service_name='comprehend', region_name='us-west-2')

# A local VADER analyzer (SentimentIntensityAnalyzer) was used here at some
# point as an alternative to Comprehend; it is currently disabled.

In [8]:
# Load the raw reviews, discarding rows that have no review text at all.
reviews_raw = (
    pd.read_csv('review_details.csv', engine='python')
    .dropna(subset=['review_content_text'])
)

# Keep a single row per review id, then only reviews flagged as verified.
reviews_deduped = reviews_raw.drop_duplicates(subset=['review_id'])
reviews_verified = reviews_deduped.loc[reviews_deduped['is_verified'] == 1]

# Subset for two specific ASINs (presumably the bluetooth products, judging by
# the variable name). Not referenced by the other cells visible here.
reviews_bluetooth = reviews_verified.loc[
    reviews_verified['review_asin'].isin(['B074QLB1Y7', 'B00P24XKS8'])
]

# Working copy: later cells add per-topic columns to this frame.
reviews = reviews_verified.copy()

# Topic phrases searched for inside each review sentence.
topics = [
    'material quality',
    'battery life',
    'sound quality',
    'volume control',
    'tech support',
]

# Sentiment labels offered in the interactive filter.
sentiments = ['POSITIVE', 'NEGATIVE', 'NEUTRAL', 'MIXED']

# Distinct product ASINs present in the working set, in order of appearance.
asins = list(reviews['review_asin'].unique())


In [28]:
# Inspect the first review in the working set. `.ix` was removed in pandas 1.0;
# `.iloc[0]` selects the first row by position, so it also works when the row
# labelled 0 has been filtered out by the cleaning steps.
reviews.iloc[0]

id                                  2.90604e+08                                                                                                                                                                                                
page_id                             232302871                                                                                                                                                                                                  
data_id                             43374262                                                                                                                                                                                                   
asin                                B074QLB1Y7                                                                                                                                                                                                 
review_asin                         B074

In [9]:
def preprocess(corpus):
    """Normalize each document: lower-case, drop stop words, lemmatize.

    Parameters
    ----------
    corpus : iterable of str
        Documents (here: sentences) to normalize. Tokens are obtained with a
        plain whitespace split, so punctuation stays attached to its word.

    Returns
    -------
    list of str
        One space-joined string of lemmatized, lower-cased tokens per input
        document, in the same order as `corpus`.
    """
    lemmatizer = WordNetLemmatizer()
    # Set membership is O(1); the module-level stop-word collection is a list.
    stop_words = set(english_stop_words)

    processed = []
    for doc in corpus:
        # Lower-case BEFORE the stop-word test: the stop-word list is
        # lower-case, so checking the raw token let capitalized stop words
        # such as "The" or "And" slip through.
        tokens = (word.lower() for word in doc.split())
        processed.append(' '.join(lemmatizer.lemmatize(token)
                                  for token in tokens
                                  if token not in stop_words))
    return processed

def sentiment(score):
    """Map a polarity score to a coarse sentiment label.

    `score` is expected to lie in [-1, 1]. Values above 0.05 are 'Positive',
    values below -0.05 are 'Negative', and everything else (including the
    boundaries themselves) is 'Mixed'.
    """
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Mixed'

In [13]:
def sentiment(score):
    """Translate a polarity score (expected in [-1, 1]) into a label.

    Returns 'Positive' above 0.05, 'Negative' below -0.05, otherwise 'Mixed'.
    NOTE(review): this duplicates the definition in the earlier cell; the
    Comprehend-based loop below does not call it.
    """
    label = 'Mixed'
    if score > 0.05:
        label = 'Positive'
    elif score < -0.05:
        label = 'Negative'
    return label

# For every review, collect the sentences that mention each topic, store the
# excerpt in `topic_<topic>` and the Comprehend label in
# `topic_<topic>_sentiment`.

# Create both columns for every topic up front. Otherwise a topic that no
# review mentions has no columns at all and later lookups raise KeyError.
# Re-running the cell resets the columns, which keeps it idempotent.
for topic in topics:
    reviews['topic_{}'.format(topic)] = None
    reviews['topic_{}_sentiment'.format(topic)] = None

for index, review in reviews['review_content_text'].items():
    sentences = sentence_tokenizer.tokenize(review)

    for topic in topics:
        # Topics are lower-case phrases, so compare against the lower-cased
        # sentence; the raw sentence is still what gets stored and analysed.
        sentences_topic = [sentence for sentence in sentences
                           if topic in sentence.lower()]
        if not sentences_topic:
            continue

        sentence_topic = ' '.join(sentences_topic)
        reviews.at[index, 'topic_{}'.format(topic)] = sentence_topic
        try:
            aws_sentiment = comprehend.detect_sentiment(Text=sentence_topic,
                                                        LanguageCode='en')
            reviews.at[index, 'topic_{}_sentiment'.format(topic)] = aws_sentiment['Sentiment']
        except Exception as ex:
            # Best effort: one failed API call (throttling, oversize text, ...)
            # must not abort the whole run; mark the row and report the error.
            reviews.at[index, 'topic_{}_sentiment'.format(topic)] = 'N/A'
            print(ex)
                

# 亚马逊评论话题的情绪分析

In [36]:
@interact
def filter_by_sentiment(商品 = asins,
                        话题 = topics,
                        情绪 = sentiments):
    """Plot the sentiment distribution for a product/topic and list matches.

    Parameters (the names double as the widget labels shown by `interact`)
    ----------
    商品 : product ASIN to inspect.
    话题 : topic phrase; selects the `topic_<topic>` columns.
    情绪 : sentiment label to filter the returned reviews by.

    Returns
    -------
    pandas.DataFrame
        The topic excerpt ('话题节选') and full review text ('评论全文') of every
        review of the product whose topic sentiment equals `情绪`, indexed
        from 1. Empty when nothing matches.
    """
    topic_col = 'topic_{}'.format(话题)
    sentiment_col = 'topic_{}_sentiment'.format(话题)
    renamed = {topic_col: '话题节选', 'review_content_text': '评论全文'}
    not_mentioned = '商品{}的评论没有提及关于{}的话题'.format(商品, 话题)

    reviews_asin = reviews[reviews.review_asin == 商品]

    # The topic columns may be missing if no review ever mentioned the topic;
    # report that instead of failing with a KeyError.
    if topic_col not in reviews_asin.columns or sentiment_col not in reviews_asin.columns:
        print(not_mentioned)
        return pd.DataFrame(columns=list(renamed.values()))

    reviews_topic = reviews_asin[reviews_asin[topic_col].notnull()]
    sentiment_counts = reviews_topic[sentiment_col].value_counts().sort_index()

    if sentiment_counts.empty:
        # Nothing to plot for this product/topic combination.
        print(not_mentioned)
    else:
        # Tie each colour to its label so a missing class cannot shift the
        # colours of the remaining bars (the mapping matches the original
        # palette when all four classes are present).
        color_by_sentiment = {'MIXED': 'y', 'NEGATIVE': 'g',
                              'NEUTRAL': 'b', 'POSITIVE': 'r'}
        bar_colors = [color_by_sentiment.get(label, 'gray')
                      for label in sentiment_counts.index]
        ax = sentiment_counts.plot(kind='bar',
                                   title='{}'.format(话题),
                                   color=bar_colors)
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Count')

    matches = (reviews_asin
               .loc[reviews_asin[sentiment_col] == 情绪, [topic_col, 'review_content_text']]
               .rename(columns=renamed)
               .reset_index(drop=True))
    # 1-based row numbers for display. The previous `.shift()[1:]` trick also
    # produced a 1-based index but silently discarded the last matching row.
    matches.index = matches.index + 1
    return matches
