## HW9: Sentiment Analysis using AWS Comprehend and GCP Natural Language API

## Importing libraries and defining functions for sentiment analysis

In [47]:
import argparse
import sys
import pandas as pd
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six
import boto3
import json
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/prakhar/Downloads/My First Project-1ffdea6c6512.json"


def gcp_sentiment_analysis(text):
    """Detects sentiment in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START migration_document_text]
    # [START migration_analyze_sentiment]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    # [END migration_document_text]

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    if(sentiment.score > 0.3):
        return "POSITIVE"
    elif(sentiment.score < -0.3):
        return "NEGATIVE"
    else:
        return "NEUTRAL"
    
def aws_sentiment_analysis(text):
    comprehend = boto3.client(service_name='comprehend', region_name='us-east-1')
    result = eval(json.dumps(comprehend.detect_sentiment(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
    return result['Sentiment']
    
    #return(comprehend.detect_sentiment(Text=text, LanguageCode='en'), sort_keys=True, indent=4)

## Data loading and cleaning

In [56]:
def parse(path):
  #g = gzip.open(path, 'r')
  g = open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')



df = getDF('amazonreviews') 
df = df.drop('reviewerName', 1)
df = df.drop('helpful', 1)
df = df.drop('summary', 1)
df = df.drop('unixReviewTime', 1)
df = df.drop('reviewerID', 1)
df = df.drop('asin', 1)
df = df.drop('reviewTime', 1)
df = df.drop('overall', 1)
df.head()

Unnamed: 0,reviewText
0,Spiritually and mentally inspiring! A book tha...
1,This is one my must have books. It is a master...
2,This book provides a reflection that you can a...
3,I first read THE PROPHET in college back in th...
4,A timeless classic. It is a very demanding an...


## Removing reviews > 5000 bytes in size as AWS has that size limit

In [67]:
df['size'] = df['reviewText'].apply(sys.getsizeof)
df = df[df['size'] < 5000]

## GCP Sentiment Analysis

In [68]:
df['gcp_sentiment_analysis'] = df['reviewText'].apply(gcp_sentiment_analysis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## AWS Sentiment Analysis

In [70]:
df['aws_sentiment_analysis'] = df['reviewText'].apply(aws_sentiment_analysis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [71]:
df

Unnamed: 0,reviewText,gcp_sentiment_analysis,size,aws_sentiment_analysis
0,Spiritually and mentally inspiring! A book tha...,POSITIVE,178,POSITIVE
1,This is one my must have books. It is a master...,NEUTRAL,317,POSITIVE
2,This book provides a reflection that you can a...,POSITIVE,251,POSITIVE
3,I first read THE PROPHET in college back in th...,POSITIVE,880,POSITIVE
4,A timeless classic. It is a very demanding an...,POSITIVE,862,POSITIVE
5,Reading this made my mind feel like a still po...,POSITIVE,343,POSITIVE
6,"As you read, Gibran's poetry brings spiritual ...",POSITIVE,287,POSITIVE
7,"Deep, moving dramatic verses of the heart and ...",POSITIVE,182,POSITIVE
8,This is a timeless classic. Over the years I'...,POSITIVE,336,POSITIVE
9,An amazing work. Realizing extensive use of Bi...,POSITIVE,309,POSITIVE


## GCP Sentiment Analysis Summary

In [75]:
df.groupby(['gcp_sentiment_analysis']).count()

Unnamed: 0_level_0,reviewText,size,aws_sentiment_analysis
gcp_sentiment_analysis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NEGATIVE,60,60,60
NEUTRAL,374,374,374
POSITIVE,562,562,562


## AWS Sentiment Analysis Summary

In [76]:
df.groupby(['aws_sentiment_analysis']).count()

Unnamed: 0_level_0,reviewText,gcp_sentiment_analysis,size
aws_sentiment_analysis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIXED,53,53,53
NEGATIVE,56,56,56
NEUTRAL,42,42,42
POSITIVE,845,845,845
