In [4]:
import multiprocessing as mp
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup
import sys
sys.setrecursionlimit(10000)
from stanfordcorenlp import StanfordCoreNLP
from pycorenlp import StanfordCoreNLP
import requests
import json
import numpy as np

## EC2 Syntax

Set up EC2 instance successfully using: 

aws ec2 run-instances --image-id ami-0ac019f4fcb7cb7e6 --count 1 --instance-type t2.xlarge --key-name aws-key1 --security-group-ids sg-0fea89a4bb6f8a4b5 --user-data file://CoreNLP.sh

then verified that corenlp was properly installed in the ec2 instance with this in browser: 
http://ec2-35-173-202-124.compute-1.amazonaws.com:9000/

In [2]:
import re
import urllib.request
from bs4 import BeautifulSoup
#referenced: https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages, https://github.com/smilli/py-corenlp

html1 = urllib.request.urlopen('https://www.nytimes.com/2018/09/05/opinion/trump-white-house-anonymous-resistance.html?module=inline')
html2 = urllib.request.urlopen('https://www.nytimes.com/2018/10/20/opinion/sunday/nafta-mexico-trump-ambassador.html?rref=collection%2Fsectioncollection%2Fopinion&action=click&contentCollection=opinion&region=rank&module=package&version=highlights&contentPlacement=7&pgtype=sectionfront')

'''
Gets all text from articles in text stream.
'''
soup = BeautifulSoup(html1, "lxml")
soup2 = BeautifulSoup(html2, "lxml")

data1 = soup.findAll(text=True)
data2 = soup2.findAll(text=True)

'''
Function that returns true for relevant text from scraped articles.
'''
def find_relevant_text(data_input):
    return (not data_input.parent.name in ['style', 'script', '[document]', 'head', 'title'] and not re.match('<!--.*-->', str(data_input.encode('utf-8'))))

'''
Function that filters all text to relevant text, converts filtered object to a list, and puts it in the queue.
'''
def list_from_text(data, q):
    full_text = filter(find_relevant_text, data)
    text = list(full_text)
    q.put(text)
        
'''
Function that retrieves items from queue, converts to string, and produces 
sentiment analysis results using StanfordCoreNLP and the running EC2 instance.
'''    
def nlp_consumer(q):
    nlp = StanfordCoreNLP('http://ec2-52-90-212-123.compute-1.amazonaws.com:9000/')
    
    final_list = []
    while True: 
        text1 = q.get()
        tmp_list = list(text1)
        str_1 = ''.join(tmp_list)
        ssplit = str_1.split()
        
        for word in ssplit:        
            output = nlp.annotate(word, properties={'annotators': 'sentiment','outputFormat': 'json','timeout': 1000,})
            final_list.append((output['sentences'][0]['sentiment']))
            
        np_a = np.array(final_list)
        for each in np_a:
            unique_elements, counts_elements = np.unique(a, return_counts=True)
        print("Frequency of unique values in array:")
        print(np.asarray((unique_elements, counts_elements)))
   
        if len(tmp_list) == 273:  #break condition
            q.close()
            break

In [3]:
start_time = time.time()

'''
Multiprocessing and queue steps below.
'''
q = mp.Queue()
p1 = mp.Process(name='scrape_text1', target=list_from_text, args=(data1, q))
p2 = mp.Process(name='scrape_text2', target=list_from_text, args=(data2, q))
p3 = mp.Process(name='nlp_consumer', target=nlp_consumer, args=(q,))

p1.start()
p2.start()
p3.start()

p1.join()
p2.join()
p3.join()

print("run time:" + str(time.time() - start_time))

Frequency of unique values of the said array:
[['Negative' 'Neutral' 'Positive' 'Verynegative' 'Verypositive']
 ['36' '1218' '50' '6' '7']]
Frequency of unique values of the said array:
[['Negative' 'Neutral' 'Positive' 'Verynegative' 'Verypositive']
 ['65' '2716' '100' '10' '11']]
run time:623.9797768592834


## Write-up

For this assignment, I chose to take New York Times op-ed articles written by members of the Trump administration as my text stream -- and ultimately do sentiment analysis on the scraped text.  Doing so involved spinning up an Ubunutu ec2 instance with corenlp deployed (code for this is included at the top of this notebook), ingesting my chosen text stream, creating two producer processes that made use of my functions that scrape relevant article text, creating a consumer process that took items from the queue, parellelzing the processes, and using nlp functions to analyze my stream.  

The first article in my stream is the now notorious and anonymous 'I Am Part of the Resistance Inside the Trump Administration' -- while the second article in my stream is titled 'My Year as a Trump Ambassador' written publicly by former US Ambassador to Mexico Roberta Jacobson. From the start, I was curious about how positive or negative the sentiments of these articles would be when discussing the environment of working for the Trump administration -- and if the anonymous first article was meaningfully different in sentiment than the second public article in the text stream.

Without considering neutral words, the results of my sentiment analysis for the first anonymous article show that ~42% (42/99) of the words analyzed were negative or very negative, while ~58% (57/99) were positive or very positive.  In the second article, again without considering neutral words, the results of my analysis show that ~40% (75/186) of the words analyzed were negative or very negative, while ~60% (111/186) were positive or very positive.  The results on my chosen text stream are interesting to me because of how remarkably similar to each other across the two articles analyzed -- despite one being anonymous and the other being public.