### Large Scale Sentiment Analysis 
### by: Pratyush Singh & Mike Volpe

This notebook shows how Mike and I approached and solved the problem of running sentiment analysis with Watson NLU on large amounts of data.

In [1]:
import json
import sys
import requests
import urllib.parse
import string

import pandas as pd
import numpy as np

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

- #### If you want to read in your own data, simply replace 'Twitter_Broadcast Raw Data...' with the name of your own file. <br>
- Pandas can read many other file formats, such as CSV. For the full documentation, visit the pandas website:

- https://pandas.pydata.org/pandas-docs/stable/

In [2]:
# Placeholders for the four input DataFrames. Rebind each name to your own
# data before running the cells below, e.g. q1 = pd.read_csv('my_file.csv').
# See the pandas documentation for readers of other file formats.
q1 = q2 = q3 = q4 = None

In [None]:
# Display the first rows of q1 as a quick look at the loaded data.
# NOTE: q1 must have been rebound to a DataFrame first; while it is still the
# None placeholder this call raises AttributeError.
q1.head()

In [262]:
# Module-level accumulators filled by make_request / process_request.
# Entries at the same position in the four lists belong together; save()
# zips them into one output line.
sentiment_score = []
sentiment_label = []
emotion_score = []
emotion_label = []

# Base name (without the '.txt' suffix) of the file that save() writes to.
file_name = None

def make_request(row):
    """Retrieve targeted sentiment and emotion for the word 'twitter' in one row.

    The row's text is normalised (stripped, lower-cased, punctuation removed).
    If it mentions 'twitter', Watson NLU is asked for sentiment and emotion
    targeted at that word and the JSON response is handed to
    ``process_request``. Otherwise placeholder entries are recorded. Either
    way the module-level result lists (``sentiment_label``,
    ``sentiment_score``, ``emotion_label``, ``emotion_score``) end up with one
    new entry each.

    Progress is checkpointed with ``save(file_name)`` whenever the row index
    is a multiple of 10, and again when a request fails or is interrupted.

    Args:
        - row: the row of data; must expose the text as ``row.Text`` and its
          index as ``row.name``

    Returns: None

    """
    results = (sentiment_label, sentiment_score, emotion_label, emotion_score)

    index = row.name

    # checkpoint the results gathered so far on every 10th row
    if not index % 10:
        save(file_name)

    # replace "Text" with the name of the column you wish to analyze.
    # Coerce to str *before* calling string methods so a non-string cell
    # (e.g. a float NaN for a missing value) cannot raise here.
    text = str(row.Text)

    # some pre-processing of the text such as lower-casing and removing punctuation
    text = text.strip().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remember the list sizes so that a failure halfway through
    # process_request can be rolled back to a consistent state.
    sizes = [len(values) for values in results]

    try:
        if 'twitter' in text:

            # make the actual request to the api with the proper headers
            headers = {'Content-Type': 'application/json',}
            params = (
                    ('text', text),
                    ('features','emotion,sentiment'),
                    ('entities.emotion', 'true'),
                    ('entities.sentiment', 'true'),
                    ('emotion.targets', 'twitter'),
                    ('sentiment.targets', 'twitter'),
                )

            # API_KEY is not defined in this cell; it must be set elsewhere
            # in the notebook before this function is called.
            response = requests.get("https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze?version=2019-07-12", 
                                     headers=headers, 
                                     params=params,
                                     auth=('apikey', API_KEY))

            response = json.loads(response.text)

            process_request(response)

        else:
            # if twitter is not in the text, then add this placeholder text to the list
            sentiment_label.append('Twitter not in text')
            sentiment_score.append(np.inf) 

            emotion_label.append('Twitter not in text')
            emotion_score.append(np.inf)

    except KeyboardInterrupt:
        # in the case of a keyboard interrupt, save the file and exit the program
        print("Saving your progress...")
        save(file_name)

        sys.exit()

    except Exception as e:
        # Drop anything this row appended before failing, so every list
        # receives exactly one entry for it and the lists stay aligned.
        for values, size in zip(results, sizes):
            del values[size:]

        # if any exception has occurred then we add in a filler message
        sentiment_label.append(e)
        emotion_label.append(e)

        sentiment_score.append(np.inf)
        emotion_score.append(np.inf)

        save(file_name)

        
def process_request(response):
    """Extract the targeted sentiment and emotion from one API response.

    Appends exactly one entry to each of the module-level lists
    ``sentiment_score``, ``sentiment_label``, ``emotion_score`` and
    ``emotion_label``. The values are read from the first element of
    ``response['sentiment']['targets']`` and
    ``response['emotion']['targets']``.

    For emotion, the label is the emotion names joined by spaces and the
    score is the list of matching values in the same order.

    If either part of the response is missing or malformed (absent key, empty
    ``targets`` list, unexpected value type), a filler label and ``np.inf``
    are recorded for that part instead. Nothing is appended until extraction
    has finished, so the lists always grow by one entry per call.

    Args:
        response (JSON): the full JSON response from the API containing
            information on the emotion and sentiment of the broadcast data

    Returns:
        None
    """
    # Everything that can go wrong while walking an unexpected response shape.
    malformed = (KeyError, IndexError, TypeError, AttributeError)

    try:
        target = response['sentiment']['targets'][0]
        score, label = target['score'], target['label']
    except malformed:
        # filler entry so this row still lines up with the others
        score, label = np.inf, 'Sentiment not retrieved for this text'

    sentiment_score.append(score)
    sentiment_label.append(label)

    try:
        emotions = response['emotion']['targets'][0]['emotion']
        scores = list(emotions.values())
        labels = ' '.join(emotions.keys())  # join the emotion labels together in a string
    except malformed:
        scores, labels = np.inf, 'Emotion not retrieved for this text'

    emotion_score.append(scores)
    emotion_label.append(labels)

def _key_exists(data, key):
    """ This function checks if key exists in data
    
    Args:
        data: dict that we are checking if key exists 
        key: variable to check if it exists in data
    
    Returns: True or False if the key exists or not
    """
    
    if key in data.keys():
        return True
    
    return False

def save(filename=None):
    """Write the collected results to ``<filename>.txt``.

    The file is overwritten on every call. It starts with a header line,
    followed by one comma-separated line per position in the module-level
    lists ``emotion_label``, ``emotion_score``, ``sentiment_label`` and
    ``sentiment_score``. The lists are zipped, so output stops at the
    shortest one.

    Args:
        filename: base name of the output file; '.txt' is appended

    Returns:
        None
    """
    header = "Emotion Label, Emotion Score, Sentence Label, Sentence Score"

    with open(f'{filename}.txt', 'w') as out:
        out.write(header + "\n")

        records = zip(emotion_label, emotion_score, sentiment_label, sentiment_score)
        for record in records:
            out.write(", ".join(map(str, record)) + "\n")

In [282]:
# Register tqdm's progress_apply on pandas objects (inner, per-file progress bar).
tqdm.pandas(desc="Processesing Each File", position=1, leave=False)

# Only q1..q4 are defined in this notebook; referencing an undefined q5 here
# raised NameError. Add further DataFrames to this list as you load them.
queries = [q1, q2, q3, q4]
for index, q in enumerate(tqdm(queries, position=0, leave=True, desc='Files Completed')):
    # make_request reads this global to decide where checkpoints are written
    file_name = index

    # Start every file with empty result lists; otherwise '<index>.txt' would
    # also contain the rows collected for all previous files. clear() keeps
    # the same list objects that make_request/process_request/save refer to.
    for results in (sentiment_score, sentiment_label, emotion_score, emotion_label):
        results.clear()

    # NOTE(review): .head() restricts the run to the first rows of each file.
    # Drop it to analyse the complete data set.
    q.head().progress_apply(make_request, axis=1)
    save(file_name)