### Load the data from DynamoDB

In [1]:
import boto3
from boto3.dynamodb.conditions import Key

In [2]:
table_name = 'test_data'

In [3]:
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(table_name)

In [4]:
# target_analysis_window = '2020-07-14_Hour=14'

In [5]:
# response = table.query(
#     KeyConditionExpression=Key('analysis_window').eq(target_analysis_window)
# )

In [6]:
response = table.scan()

# A single Scan call only returns one page of results (at most 1 MB of data).
# While the latest page reports a LastEvaluatedKey there is more data to read,
# so keep scanning from that key and append the extra items to
# response['Items'] so that it ends up holding every item in the table.
page = response
while 'LastEvaluatedKey' in page:
    page = table.scan(ExclusiveStartKey=page['LastEvaluatedKey'])
    response['Items'].extend(page['Items'])

In [7]:
# `Items` is a list of dictionaries, one per table item; the keys of each
# dictionary are the keys/columns of the DynamoDB table.
news_data = response['Items']

### Start spark and parallelize the data for processing

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Local Spark session: app name "SparkTest", running locally with 4 cores.
spark = SparkSession.builder \
    .appName("SparkTest") \
    .master("local[4]") \
    .getOrCreate()

In [10]:
# Build the DataFrame by first converting the list of dicts to an RDD.
# NOTE: this approach emits a "deprecated function" warning.
news_rdd = spark.sparkContext.parallelize(news_data)
news_df = spark.createDataFrame(news_rdd)

# Alternative which doesn't emit the deprecation warning:
# https://kontext.tech/column/spark/366/convert-python-dictionary-list-to-pyspark-dataframe
#     from pyspark.sql import Row
#     news_df = spark.createDataFrame([Row(**i) for i in news_data])
# Caveat: with that alternative, .rdd.flatMap(lambda x: x).collect() fails with
#     java.lang.IllegalStateException: Input row doesn't have expected number
#     of values required by the schema. 11 fields are required while 10 values
#     are provided.
# (presumably some items lack one of the 11 attributes — TODO confirm).
# However, we don't usually collect until the end of the pipeline.

news_df.printSchema()



root
 |-- analysis_window: string (nullable = true)
 |-- api_success_utc_str: string (nullable = true)
 |-- api_success_utc_ts: decimal(38,18) (nullable = true)
 |-- news_content: string (nullable = true)
 |-- news_link: string (nullable = true)
 |-- news_publisher: string (nullable = true)
 |-- news_timestamp: decimal(38,18) (nullable = true)
 |-- news_title: string (nullable = true)
 |-- source_api: string (nullable = true)
 |-- symb_id_source: string (nullable = true)
 |-- t_symb: string (nullable = true)



In [11]:
news_data[0]['news_title']

'Apple News adds new audio features, including a daily briefing, alongside expanded local coverage'

In [12]:
news_df = news_df.na.drop()

In [13]:
# Bring the single "news_content" column back to the driver as a flat Python
# list (one entry per row).
# https://stackoverflow.com/questions/38610559/
content_list = [row[0] for row in news_df.select("news_content").collect()]

### Cleaning and processing pipeline

In [15]:
from bs4 import BeautifulSoup
from transformers import pipeline
from multiprocessing import cpu_count
from multiprocessing import Pool

In [21]:
def _strip_html_text(input_string):
    """
    Strips any HTML tags from a string.

    Lives at module level (not on the class) so that worker processes can be
    handed this function without pickling a whole SentimentAnalysisPipeline
    instance, which carries the summarization model.
    """
    return BeautifulSoup(input_string).text


class SentimentAnalysisPipeline:
    """
    Cleans raw news text (HTML removal) and summarizes it.

    Attributes:
        summarizer_pipeline: transformers "summarization" pipeline.
        pool_cores: default number of worker processes used by
            strip_html_multi (cpu_count() - 1, but never less than 1).
    """

    def __init__(self):
        # Note that the transformers pipelines can take list of strings as input
        # instead of just one string.
        self.summarizer_pipeline = pipeline("summarization")
        # TODO: the sentiment step is still disabled:
        # self.sentiment_pipeline = pipeline("sentiment-analysis")

        # multiprocessing core count heuristic from
        # comment in https://stackoverflow.com/questions/20886565/
        self.pool_cores = cpu_count()-1 or 1

    def strip_html(self, input_string):
        """
        Strips any HTML tags from a string
        """
        return _strip_html_text(input_string)

    def strip_html_multi(self, input_string_list, processes=None, chunksize=3):
        """
        Strips HTML tags from one string or from an iterable of strings.

        Parameters:
            input_string_list: a single string, or a list/iterable of strings.
            processes: number of worker processes; defaults to self.pool_cores.
            chunksize: chunk size forwarded to Pool.map.

        Returns a list with one cleaned string per input, in input order.
        """
        # A bare string must be wrapped, not passed to list(): list("abc")
        # would split it into single characters.
        if isinstance(input_string_list, str):
            texts = [input_string_list]
        else:
            texts = list(input_string_list)

        if not texts:
            return []

        if processes is None:
            processes = self.pool_cores
        # Never start more workers than there are texts to clean.
        processes = max(1, min(processes, len(texts)))

        # With a single worker a pool only adds start-up cost.
        if processes == 1:
            return [_strip_html_text(text) for text in texts]

        # TODO may want to initialize and destroy pool somewhere else
        # Or it might not matter too much given this function is only
        # called once per spark job / AWS EMR startup
        # NOTE(review): worker processes must be able to import the mapped
        # function; confirm this works when the class is defined in a notebook.
        with Pool(processes=processes) as pool:
            return pool.map(_strip_html_text, texts, chunksize)

    def raw_text_to_sentiment(self,input_string):
        """
        Takes a list of strings or just one regular string and runs it
        through the cleaning and summarization steps.

        Pipeline consists of:
        (1) Removing HTML if present
        (2) Summarizing the news articles
        (3) TODO: calculating a sentiment score and label (Positive or
            Negative) -- this step is currently disabled.

        Returns the summary text (a string) of the first input.

        Raises ValueError when no text is supplied.
        """
        no_html_text = self.strip_html_multi(input_string)
        if not no_html_text:
            raise ValueError("raw_text_to_sentiment needs at least one text")

        summaries = self.summarizer_pipeline(no_html_text,
                                             max_length=300,
                                             min_length=30)
        # NOTE(review): only the first summary is returned, as before; decide
        # whether list input should yield one summary per text.
        news_summary = summaries[0]['summary_text']
        # TODO: once the sentiment step is enabled, return its scores instead:
        # sentiment_scores = self.sentiment_pipeline(news_summary)

        return news_summary

In [16]:
s_pipe = SentimentAnalysisPipeline()

In [17]:
test_result = s_pipe.raw_text_to_sentiment(content_list)

In [16]:
test_summarizer_pipeline = pipeline("summarization")

In [None]:
test_summarizer_pipeline(content_list[0:5])

In [16]:
def strip_html_test(input_string):
    """
    Strips any HTML tags from a single string and returns the remaining text.

    Defined at top level so it can be handed to Pool.map, which applies it to
    each string of a list.
    """
    soup = BeautifulSoup(input_string)
    return soup.text

In [17]:
# Clean every article in parallel: 2 worker processes, 3 strings per task.
p = 2
chunksize = 3
with Pool(processes=p) as pool:
    no_html_text = pool.map(strip_html_test, content_list, chunksize=chunksize)

In [18]:
from bs4 import BeautifulSoup

In [30]:
def strip_html(input_string):
    """Return the text of `input_string` with any HTML tags removed."""
    soup = BeautifulSoup(input_string)
    return soup.text

# Define Spark UDF for parallel processing.
# `udf` is imported here because the notebook-level import only appears in a
# later cell; without it this cell raises NameError on a top-to-bottom run.
from pyspark.sql.functions import udf

strip_html_udf = udf(strip_html)

In [132]:
# `drop_na_df` is only defined in a later cell, so derive the null-free frame
# from `news_df` directly; this keeps the cell runnable top to bottom.
clean_df = news_df.na.drop().select('news_content', strip_html_udf('news_content').alias('clean_content'))

In [13]:
from transformers import pipeline

summarizer = pipeline("summarization")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1578.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1222317369.0, style=ProgressStyle(descr…




In [29]:
ARTICLE = """

WhatsApp was briefly down Tuesday, with users unable to send or receive messages on the Facebook-owned end-to-end encrypted messaging app.

Affected users may have seen that WhatsApp was "connecting" to the service when trying to send a message. The outage started at about 4 p.m. ET but appeared to be working again by 4:30 p.m. ET.

WhatsApp failing to connect to its servers. (Image Credits: TechCrunch)

When reached, a Facebook spokesperson confirmed the outage but didn't divulge details.

"We're aware that some people are currently having trouble sending messages and we're working to restore WhatsApp for everyone as quickly as possible," the spokesperson said.

WhatsApp hit the 2 billion user mark earlier this year. Facebook bought WhatsApp for $19 billion in 2014 in what became one of the social media giant's biggest purchases.

Updated with comment from Facebook.

"""



In [31]:
clean_article = strip_html(ARTICLE)

In [36]:
clean_article

'\n\nWhatsApp was briefly down Tuesday, with users unable to send or receive messages on the Facebook-owned end-to-end encrypted messaging app.\n\nAffected users may have seen that WhatsApp was "connecting" to the service when trying to send a message. The outage started at about 4 p.m. ET but appeared to be working again by 4:30 p.m. ET.\n\nWhatsApp failing to connect to its servers. (Image Credits: TechCrunch)\n\nWhen reached, a Facebook spokesperson confirmed the outage but didn\'t divulge details.\n\n"We\'re aware that some people are currently having trouble sending messages and we\'re working to restore WhatsApp for everyone as quickly as possible," the spokesperson said.\n\nWhatsApp hit the 2 billion user mark earlier this year. Facebook bought WhatsApp for $19 billion in 2014 in what became one of the social media giant\'s biggest purchases.\n\nUpdated with comment from Facebook.\n\n'

In [33]:
summary_news = summarizer(clean_article, max_length=300, min_length=30)[0]['summary_text']

Your max_length is set to 300, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


In [34]:
summary_news

" Facebook bought WhatsApp for $19 billion in 2014 . The end-to-end encrypted messaging app was briefly down at about 4 p.m. ET Tuesday . Facebook confirmed the outage but didn't divulge details ."

In [27]:
from transformers import pipeline

nlp = pipeline("sentiment-analysis")


In [37]:
nlp([summary_news,summary_news,summary_news])

[{'label': 'NEGATIVE', 'score': 0.991114616394043},
 {'label': 'NEGATIVE', 'score': 0.991114616394043},
 {'label': 'NEGATIVE', 'score': 0.991114616394043}]

In [11]:
import sparknlp
# Rebinds `spark` to the session returned by Spark NLP.
# NOTE(review): a Spark session was already created earlier in this notebook;
# confirm that sparknlp.start() yields a session with the Spark NLP jars
# loaded rather than the pre-existing one — TODO confirm.
spark = sparknlp.start()

In [21]:
# Convert list to RDD
news_rdd = spark.sparkContext.parallelize(news_data)

# Create data frame
news_df = spark.createDataFrame(news_rdd)

In [28]:
# The annotator classes live in `sparknlp.annotator` (singular), the same
# module the later import cell uses; `sparknlp.annotators` does not resolve.
deep_sentence_detector = sparknlp.annotator.DeepSentenceDetector() \
    .setInputCols(["document", "token", "ner_con"]) \
    .setOutputCol("sentence") \
    .setIncludePragmaticSegmenter(True) \
    .setEndPunctuation([".", "?"])

In [40]:
content_df = news_df.select('news_content')

In [56]:
drop_na_df = news_df.na.drop()

In [112]:
test1 = content_df.toPandas()

Clear HTML tags

In [18]:
from bs4 import BeautifulSoup

In [124]:
from pyspark.sql.functions import udf

In [125]:
def strip_html(input_string):
    """Return the text of `input_string` with any HTML tags removed."""
    soup = BeautifulSoup(input_string)
    return soup.text

strip_html_udf = udf(strip_html)

In [132]:
clean_df = drop_na_df.select('news_content', strip_html_udf('news_content').alias('clean_content'))

In [140]:
clean_df.count()

153

In [37]:
# https://towardsdatascience.com/introduction-to-spark-nlp-foundations-and-basic-components-part-i-c83b7629ed59

In [151]:
from sparknlp import DocumentAssembler

from sparknlp.annotator import SentenceDetector, DeepSentenceDetector

In [156]:
from pyspark.ml import Pipeline

# Stage 1: read the "clean_content" column and emit a "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("clean_content")
    .setOutputCol("document")
)

# Stage 2: read "document" and emit the detected "sentences".
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)



In [152]:
from pyspark.ml import Pipeline

document_assembler = DocumentAssembler()\
                        .setInputCol("clean_content")\
                        .setOutputCol("document")

# Experimental DeepSentenceDetector variant. It is bound to its own name so
# that, when the notebook runs top to bottom, it does not replace the
# `sentenceDetector` (SentenceDetector) stage that the pipeline cell uses.
# NOTE(review): the earlier DeepSentenceDetector cell feeds it
# ["document", "token", "ner_con"]; confirm "document" alone is sufficient.
deep_sentence_detector = DeepSentenceDetector()\
                     .setInputCols(["document"])\
                     .setOutputCol("sentences")\
                     .setIncludePragmaticSegmenter(True)


In [157]:
nlpPipeline = Pipeline(stages=[
 document_assembler, 
 sentenceDetector,
])

In [158]:
content_sentences = nlpPipeline.fit(clean_df).transform(clean_df)

In [159]:
content_sentences.printSchema()

root
 |-- news_content: string (nullable = true)
 |-- clean_content: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentences: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |   

In [160]:
sentences_collect = content_sentences.select('sentences').toPandas()

In [161]:
sentences_collect

Unnamed: 0,sentences
0,"[(document, 0, 44, Apple News is getting a si..."
1,"[(document, 0, 172, More than 2,500 mobile gam..."
2,"[(document, 0, 250, (Bloomberg) -- The Twitter..."
3,"[(document, 0, 173, (Bloomberg) -- It wasn’t m..."
4,"[(document, 0, 135, (Bloomberg) -- TikTok has ..."
...,...
148,"[(document, 0, 481, Netflix NFLX is set to rep..."
149,"[(document, 0, 150, Netflix NFLX has landed th..."
150,"[(document, 0, 291, The Walt Disney Company (N..."
151,"[(document, 1, 36, Click here to read the full..."


In [141]:
# First row, first column ("sentences"), addressed purely by position.
# The previous `.iloc[0, :][0]` applied an integer key to a Series indexed by
# column labels, which relies on deprecated positional fallback.
first_row_list = sentences_collect.iloc[0, 0]

In [142]:
len(first_row_list)

43

In [146]:
[row['result'] for row in first_row_list]

['Apple  News is getting a significant upgrade.',
 'The news aggregation app, which ships preinstalled on Apple devices, is introducing several new features for readers and premium subscribers, including audio stories, a daily audio briefing called "Apple News Today" and expanded local coverage.',
 "The audio briefing is somewhat of a competitor to Alexa's Flash Briefing, which has become a popular way to catch up on the top news headlines.",
 "But in Apple's case, the briefing is hosted by people, not a virtual assistant.",
 "Apple News editors and co-hosts, Shumita Basu and Duarte Geraldino, will guide listeners through the day's headlines.",
 'They will then spend the remainder of the briefing discussing around three or four articles in a more in-depth fashion.',
 'Image Credits: Apple\n\n\nIn total, the briefing will run for roughly seven to eight minutes in length and will be accessible to all Apple News readers in the U.S.',
 'A new briefing will arrive every Monday through Frida

Unpack the list of sentence detector results

In [166]:
found_sentences_df = content_sentences.select('sentences')

In [169]:
found_sentences_df.select(found_sentences_df.sentences.getField("result")).toPandas()

Unnamed: 0,sentences.result
0,[Apple News is getting a significant upgrade....
1,"[More than 2,500 mobile games have been remove..."
2,[(Bloomberg) -- The Twitter accounts of some o...
3,[(Bloomberg) -- It wasn’t meant to be like thi...
4,[(Bloomberg) -- TikTok has become one of the w...
...,...
148,[Netflix NFLX is set to report second-quarter ...
149,[Netflix NFLX has landed the new romantic come...
150,"[The Walt Disney Company (NYSE:DIS), a global ..."
151,"[Click here to read the full article., The “wa..."


In [None]:
from pyspark.sql.functions import explode

# Unpack the detected sentences: `sentences.result` is an array with one
# string per sentence, and explode() emits one output row per array element.
# (The previous line referenced an undefined `df` and the columns `name` /
# `knownLanguages`, none of which exist in this notebook.)
df2 = found_sentences_df.select(
    explode(found_sentences_df.sentences.getField("result")).alias("sentence")
)

## Try adding a UDF for transformers

In [None]:
def strip_html(input_string):
    """Return the text of `input_string` with any HTML tags removed."""
    soup = BeautifulSoup(input_string)
    return soup.text

strip_html_udf = udf(strip_html)

In [155]:
from transformers import pipeline

sentimentAnalysis = pipeline("sentiment-analysis")



print(sentimentAnalysis("Transformers piplines are easy to use"))

[{'label': 'POSITIVE', 'score': 0.9305248856544495}]


In [None]:
print(sentimentAnalysis("Big Tech Stocks Top Trillion-Dollar Each: ETFs to Bet On"))

In [None]:
import pyspark
import sparknlp

In [None]:
from pyspark.sql import SparkSession

In [None]:
sparknlp.version()

In [None]:
spark = sparknlp.start()

In [None]:
spark = (SparkSession.builder
    .appName("Spark NLP") # Set app name
    .master("local[4]") # Run locally with 4 cores
    # Spark NLP package coordinate. The version was previously written as
    # "2.5.3s"; the stray trailing "s" is removed.
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.3") # configure spark nlp
#     .config("spark.driver.memory","16G")
#     .config("spark.driver.maxResultSize", "0") 
#     .config("spark.kryoserializer.buffer.max", "1000M")
    .getOrCreate())