### Load the data

In [1]:
import boto3
from boto3.dynamodb.conditions import Key

In [2]:
# Name of the DynamoDB table that the news items are read from
table_name = 'test_data'

In [3]:
# Resource-level DynamoDB handle; no region or credentials are passed here,
# so boto3 presumably resolves them from the environment — TODO confirm
dynamodb = boto3.resource('dynamodb')
# Table object for `table_name`; the cells below read from it
table = dynamodb.Table(table_name)


In [4]:
# target_analysis_window = '2020-07-14_Hour=14'

In [5]:
# response = table.query(
#     KeyConditionExpression=Key('analysis_window').eq(target_analysis_window)
# )

In [6]:
# A single Scan call returns only one page of results (DynamoDB caps a page at
# 1 MB), so keep requesting pages until no LastEvaluatedKey is returned.
# Every page is merged into response['Items'], so code that reads the items
# from `response` sees the complete table rather than just the first page.
response = table.scan()
last_page = response
while 'LastEvaluatedKey' in last_page:
    last_page = table.scan(ExclusiveStartKey=last_page['LastEvaluatedKey'])
    response['Items'].extend(last_page['Items'])

In [7]:
# This is a list of dictionaries, one per scanned item, where
# the keys of each dictionary are the keys/columns of the DynamoDB table
news_data = response['Items']

Start Spark and parallelize the data for processing

In [13]:
from pyspark.sql import SparkSession, Row

In [10]:
# Local Spark session used to build the first DataFrame from the scanned items.
# NOTE(review): a later cell rebinds `spark` via sparknlp.start(); if that call
# reuses this already-running session, the Spark NLP jars may be missing from
# it — confirm the intended start-up order.
spark = (SparkSession.builder
            .appName("SparkTest") # Set app name
            .master("local[4]") # Run locally with 4 cores
            .getOrCreate())

In [17]:
# Turn each scanned item (a dict) into a Row so Spark can infer the schema.
# Approach taken from:
# https://kontext.tech/column/spark/366/convert-python-dictionary-list-to-pyspark-dataframe
news_df = spark.createDataFrame(
    [Row(**item) for item in news_data]
)
news_df.printSchema()

root
 |-- analysis_window: string (nullable = true)
 |-- api_success_utc_str: string (nullable = true)
 |-- api_success_utc_ts: decimal(38,18) (nullable = true)
 |-- news_content: string (nullable = true)
 |-- news_link: string (nullable = true)
 |-- news_publisher: string (nullable = true)
 |-- news_timestamp: decimal(38,18) (nullable = true)
 |-- news_title: string (nullable = true)
 |-- source_api: string (nullable = true)
 |-- symb_id_source: string (nullable = true)
 |-- t_symb: string (nullable = true)



In [180]:
# Number of items held in news_data
len(news_data)

251

In [181]:
# Title of the first item, as a quick look at the data
news_data[0]['news_title']

'Apple News adds new audio features, including a daily briefing, alongside expanded local coverage'

In [11]:
import sparknlp
# Rebind `spark` to the session returned by Spark NLP's start() helper;
# every later cell that uses `spark` goes through this session
spark = sparknlp.start()

In [21]:
# Convert list to RDD
news_rdd = spark.sparkContext.parallelize(news_data)

# Create data frame (this rebinds news_df, replacing the Row-based frame built earlier).
# NOTE(review): the RDD elements are plain dicts here; PySpark reportedly
# deprecates inferring a schema from dicts in favour of Row — confirm whether
# this cell should build Rows like the earlier createDataFrame call does.
news_df = spark.createDataFrame(news_rdd)

In [28]:
# Configure a DeepSentenceDetector that reads the "document", "token" and
# "ner_con" columns and writes "sentence", treating "." and "?" as end punctuation.
# NOTE(review): no visible cell produces "token" or "ner_con", and
# `deep_sentence_detector` is not referenced again in the visible cells —
# this looks like an unused experiment; confirm before relying on it.
deep_sentence_detector = sparknlp.annotators.DeepSentenceDetector() \
    .setInputCols(["document", "token", "ner_con"]) \
    .setOutputCol("sentence") \
    .setIncludePragmaticSegmenter(True) \
    .setEndPunctuation([".", "?"])

In [40]:
# Single-column view holding only the article body text
content_df = news_df.select('news_content')

In [56]:
# Drop rows containing nulls, using na.drop() with its default arguments
# (i.e. considering every column of news_df).
# NOTE(review): only news_content is used by the later cleaning step; confirm
# that discarding rows with a null in any other column is intended.
drop_na_df = news_df.na.drop()

In [112]:
# Collect the content column into a pandas DataFrame on the driver;
# `test1` is not referenced again in the visible cells
test1 = content_df.toPandas()

Clear HTML tags

In [115]:
from bs4 import BeautifulSoup

In [124]:
from pyspark.sql.functions import udf

In [125]:
def strip_html(input_string):
    """Return the plain text of an HTML fragment with all markup removed.

    Args:
        input_string: HTML (or plain) text, or None.

    Returns:
        The text content of the parsed input, or None when the input is None,
        so a null column value stays null instead of failing the UDF.
    """
    if input_string is None:
        return None
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output can vary by machine.
    return BeautifulSoup(input_string, "html.parser").text

# Wrap as a Spark UDF; no return type is passed, so udf's default applies
strip_html_udf = udf(strip_html)

In [132]:
# Keep the raw article body next to its HTML-stripped version
clean_df = drop_na_df.select(
    'news_content',
    strip_html_udf('news_content').alias('clean_content'),
)

In [140]:
# Row count after null rows were dropped and the HTML was stripped
clean_df.count()

153

In [37]:
# https://towardsdatascience.com/introduction-to-spark-nlp-foundations-and-basic-components-part-i-c83b7629ed59

In [151]:
from sparknlp import DocumentAssembler

from sparknlp.annotator import SentenceDetector, DeepSentenceDetector

In [156]:
from pyspark.ml import Pipeline

# Stage 1: wrap the cleaned article text as Spark NLP "document" annotations
document_assembler = (
    DocumentAssembler()
    .setInputCol("clean_content")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences, written to the "sentences" column
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)



In [152]:
from pyspark.ml import Pipeline

document_assembler = DocumentAssembler()\
                        .setInputCol("clean_content")\
                        .setOutputCol("document")

# Experimental alternative to the rule-based SentenceDetector. It is bound to
# its own name so that running the notebook top to bottom does not overwrite
# `sentenceDetector`, the stage the pipeline cell is built from (the recorded
# outputs were produced with SentenceDetector, which was executed after this cell).
# NOTE(review): the other DeepSentenceDetector cell in this notebook passes three
# input columns ("document", "token", "ner_con"); only "document" is given
# here — confirm the required inputs before adding this stage to a pipeline.
deepSentenceDetector = DeepSentenceDetector()\
                     .setInputCols(["document"])\
                     .setOutputCol("sentences")\
                     .setIncludePragmaticSegmenter(True)


In [157]:
# Two-stage pipeline: assemble documents, then detect sentences in them
nlpPipeline = Pipeline(stages=[document_assembler, sentenceDetector])

In [158]:
# Fit the pipeline on the cleaned frame and apply it to that same frame,
# adding the "document" and "sentences" annotation columns
content_sentences = nlpPipeline.fit(clean_df).transform(clean_df)

In [159]:
# Inspect the annotation columns added by the pipeline
content_sentences.printSchema()

root
 |-- news_content: string (nullable = true)
 |-- clean_content: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentences: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |   

In [160]:
# Collect only the sentence annotations to the driver as a pandas DataFrame
sentences_collect = content_sentences.select('sentences').toPandas()

In [161]:
# Display the collected annotations
sentences_collect

Unnamed: 0,sentences
0,"[(document, 0, 44, Apple News is getting a si..."
1,"[(document, 0, 172, More than 2,500 mobile gam..."
2,"[(document, 0, 250, (Bloomberg) -- The Twitter..."
3,"[(document, 0, 173, (Bloomberg) -- It wasn’t m..."
4,"[(document, 0, 135, (Bloomberg) -- TikTok has ..."
...,...
148,"[(document, 0, 481, Netflix NFLX is set to rep..."
149,"[(document, 0, 150, Netflix NFLX has landed th..."
150,"[(document, 0, 291, The Walt Disney Company (N..."
151,"[(document, 1, 36, Click here to read the full..."


In [141]:
# Sentence annotations of the first article (row 0, column 0 of the frame).
# Purely positional .iloc[0, 0] is used instead of .iloc[0, :][0]: the latter
# indexes a string-labelled Series with an integer, which pandas only resolves
# through its deprecated positional fallback.
first_row_list = sentences_collect.iloc[0, 0]

In [142]:
# Number of sentence annotations found in the first article
len(first_row_list)

43

In [146]:
# Text of each detected sentence (the `result` field of every annotation)
[row['result'] for row in first_row_list]

['Apple  News is getting a significant upgrade.',
 'The news aggregation app, which ships preinstalled on Apple devices, is introducing several new features for readers and premium subscribers, including audio stories, a daily audio briefing called "Apple News Today" and expanded local coverage.',
 "The audio briefing is somewhat of a competitor to Alexa's Flash Briefing, which has become a popular way to catch up on the top news headlines.",
 "But in Apple's case, the briefing is hosted by people, not a virtual assistant.",
 "Apple News editors and co-hosts, Shumita Basu and Duarte Geraldino, will guide listeners through the day's headlines.",
 'They will then spend the remainder of the briefing discussing around three or four articles in a more in-depth fashion.',
 'Image Credits: Apple\n\n\nIn total, the briefing will run for roughly seven to eight minutes in length and will be accessible to all Apple News readers in the U.S.',
 'A new briefing will arrive every Monday through Frida

Unpack the list of sentence detector results

In [166]:
# Spark frame holding only the array of sentence annotations per article
found_sentences_df = content_sentences.select('sentences')

In [169]:
# Pull the `result` field out of every annotation in the array
# and collect the result to pandas for display
found_sentences_df.select(found_sentences_df.sentences.getField("result")).toPandas()

Unnamed: 0,sentences.result
0,[Apple News is getting a significant upgrade....
1,"[More than 2,500 mobile games have been remove..."
2,[(Bloomberg) -- The Twitter accounts of some o...
3,[(Bloomberg) -- It wasn’t meant to be like thi...
4,[(Bloomberg) -- TikTok has become one of the w...
...,...
148,[Netflix NFLX is set to report second-quarter ...
149,[Netflix NFLX has landed the new romantic come...
150,"[The Walt Disney Company (NYSE:DIS), a global ..."
151,"[Click here to read the full article., The “wa..."


In [None]:
from pyspark.sql.functions import explode

# One output row per detected sentence: take the `result` text of every
# annotation in the `sentences` array and explode that array into rows.
# (The previous version referenced an undefined `df` and columns that do not
# exist in found_sentences_df.)
df2 = found_sentences_df.select(
    explode(found_sentences_df.sentences.getField("result")).alias("sentence")
)

## Try adding a UDF for transformers

In [None]:
# NOTE(review): this repeats the strip_html definition from the HTML-cleaning
# section and shadows it when the notebook runs top to bottom; it appears to be
# a placeholder for the transformers UDF this section intends to add.
def strip_html(input_string):
    """Return the plain text of an HTML fragment with all markup removed.

    Args:
        input_string: HTML (or plain) text, or None.

    Returns:
        The text content of the parsed input, or None when the input is None,
        so a null column value stays null instead of failing the UDF.
    """
    if input_string is None:
        return None
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output can vary by machine.
    return BeautifulSoup(input_string, "html.parser").text

# Wrap as a Spark UDF; no return type is passed, so udf's default applies
strip_html_udf = udf(strip_html)

In [155]:
from transformers import pipeline

# Build a sentiment-analysis pipeline; no model is named, so the library's default is used
sentimentAnalysis = pipeline("sentiment-analysis")



# Smoke test on a single sentence
print(sentimentAnalysis("Transformers piplines are easy to use"))

[{'label': 'POSITIVE', 'score': 0.9305248856544495}]


In [None]:
# Try the classifier on a finance-style headline
print(sentimentAnalysis("Big Tech Stocks Top Trillion-Dollar Each: ETFs to Bet On"))

In [None]:
import pyspark
import sparknlp

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Show the installed Spark NLP Python package version
sparknlp.version()

In [None]:
# Bind `spark` to the session returned by Spark NLP's start() helper
spark = sparknlp.start()

In [None]:
# Manual alternative to sparknlp.start(): build the session directly and pull
# the Spark NLP package from Maven.
# The package version was previously written as "2.5.3s"; the stray "s" made
# the coordinate unresolvable, so it is corrected to "2.5.3".
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists (e.g. one created by sparknlp.start()), in which case the
# spark.jars.packages setting here may not take effect — confirm which of the
# two start-up cells is meant to be used.
spark = (SparkSession.builder
    .appName("Spark NLP") # Set app name
    .master("local[4]") # Run locally with 4 cores
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.3") # configure spark nlp
#     .config("spark.driver.memory","16G")
#     .config("spark.driver.maxResultSize", "0") 
#     .config("spark.kryoserializer.buffer.max", "1000M")
    .getOrCreate())