# Sentiment Analysis

#### Ryan Bales (@ryanbales)<br>ryan@balesofdata.com

***

### Import Packages

In [1]:
import boto3
import json

### Setup NLTK

We're using the Vader Algorithm from NLTK (Natural Language Toolkit) to analyze the sentiment of every sentence in our corpus. <br/>
The Vader Algorithm is a rules-based sentinment analysis alogrithm.  More details:  http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf

In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/rbales/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rbales/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Load Transcription

In [3]:
with open("data/2016_debates/transcripts/debate_1.mp3.json", 'r') as transcript_file:  
    transcription = json.load(transcript_file)
    text = transcription["results"]["transcripts"][0]["transcript"]

### Tokenize Sentences

In [4]:
sentences = tokenize.sent_tokenize(text)

### Analyze the Sentiment of each Sentences

In [5]:
sentinment_analyzer = SentimentIntensityAnalyzer()

sentence_scores = {
    "pos": 0.000,
    "neu": 0.000,
    "neg": 0.000
}

for sentence in sentences:
    sentiment_score = sentinment_analyzer.polarity_scores(text)
    for score_type in sorted(sentiment_score):
        if score_type != "compound":
            sentence_scores[score_type] += sentiment_score[score_type]

### Display Sentiment Results

In [6]:
print("Sentence Count: {}".format(len(sentences)))
print("\n")
print("Average Sentinment Scores")
print("Positive {}".format(sentence_scores["pos"]/len(sentences)))
print("Neutral {}".format(sentence_scores["neu"]/len(sentences)))
print("Negative {}".format(sentence_scores["neg"]/len(sentences)))

Sentence Count: 1181


Average Sentinment Scores
Positive 0.13300000000000042
Neutral 0.769000000000005
Negative 0.09799999999999917


***

### Let's Try AWS Comprehend (Max 5k bytes)

In [7]:
with open("data/2016_debates/transcripts/debate_1.mp3.json", 'r') as transcript_file:  
    transcription = json.load(transcript_file)
    text = transcription["results"]["transcripts"][0]["transcript"]

In [8]:
boto3.setup_default_session(region_name="us-east-1", profile_name="personal")
comprehend_client = boto3.client(service_name="comprehend")

### Let's check the Sentiment at the Start of the Debate

In [9]:
comprehend_client.detect_sentiment(Text=text[:5000], LanguageCode="en")

{'Sentiment': 'NEUTRAL',
 'SentimentScore': {'Positive': 0.18922650814056396,
  'Negative': 0.021369684487581253,
  'Neutral': 0.75711989402771,
  'Mixed': 0.032283976674079895},
 'ResponseMetadata': {'RequestId': 'cc387434-1231-11e9-8d7f-41e58abee49e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 07 Jan 2019 04:07:50 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '161',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'cc387434-1231-11e9-8d7f-41e58abee49e'},
  'RetryAttempts': 0}}

### What's the Sentiment at the End of the Debate

In [10]:
comprehend_client.detect_sentiment(Text=text[-5000:], LanguageCode="en")

{'Sentiment': 'NEGATIVE',
 'SentimentScore': {'Positive': 0.18204408884048462,
  'Negative': 0.46523579955101013,
  'Neutral': 0.2150057703256607,
  'Mixed': 0.13771434128284454},
 'ResponseMetadata': {'RequestId': 'cc5f36a4-1231-11e9-9473-7393b998d8f1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 07 Jan 2019 04:07:50 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '162',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'cc5f36a4-1231-11e9-9473-7393b998d8f1'},
  'RetryAttempts': 0}}