In [13]:
import sys
sys.path.append('../../')
import os

import findspark
findspark.init()

#from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from pyspark.sql import Row


In [14]:
# os.getcwd()
# '/home/ubuntu'

In [15]:
# Input locations, all under the s3a://twitter-data-dump bucket.
# Tar archive (only referenced by a commented-out read in this notebook).
tar_data = "s3a://twitter-data-dump/test.tar"
# Folder read as JSON into `df`.
data_folder = "s3a://twitter-data-dump/smallportion/"
# Single JSON file read into `df_trump`.
trump_file = "s3a://twitter-data-dump/celebrities/trump.json"

In [16]:
# Settings for the Spark session shared by the cells below.
appname = "large_read_tar"
# NOTE(review): `master` is declared but never passed to the builder, so the
# session runs on whatever master the environment provides — confirm intent.
master = "local"

# Build the SparkSession, or reuse one that already exists in this kernel.
spark = (
    SparkSession.builder
    .appName(appname)
    .getOrCreate()
)

In [17]:
# Earlier experiment, kept for reference: raw_rdd = sc.textFile(tar_data)
# Load the JSON inputs into DataFrames: the whole sample folder, then the
# single celebrity file used by the pipeline cells below.
df = spark.read.json(path=data_folder)
df_trump = spark.read.json(path=trump_file)

In [18]:
# Section for declaring Spark-NLP parameters --> should at least move to a config file
# Lemma dictionary handed to the Lemmatizer stage.
LEMMA_SRC = '/home/ubuntu/spark_test/resources/nlp/lemma-corpus/AntBNC_lemmas_ver_001.txt'
# Sentiment dictionary; may be swapped for other corpora later.
SENTIMENT_SRC = '/home/ubuntu/spark_test/resources/nlp/sentiment-corpus/default-sentiment-dict.txt'
# Custom sentence boundaries passed to the sentence detector.
# The backslash patterns are raw strings: "\?" in a normal literal is an
# invalid escape sequence that Python warns about, while r"\?" yields the
# same two characters without the warning. "\n" is a real newline on purpose.
sentence_bound_set = ["\n", r"\?", r"\*", r"\!", r"\."]

In [19]:
def process_task(data, *args):
    """Fit a pipeline made of ``args`` on ``data`` and apply it to ``data``.

    Parameters
    ----------
    data
        Object handed to both ``Pipeline.fit`` and the fitted model's
        ``transform`` (the same input is used for both steps).
    *args
        Pipeline stages, in the order they should run.

    Returns
    -------
    Whatever the fitted model's ``transform(data)`` returns. Call
    ``printSchema()`` on it to inspect the columns the stages added.
    """
    # *args arrives as a tuple; Pipeline documents `stages` as a list,
    # so convert explicitly instead of relying on tuple tolerance.
    pipeline = Pipeline(stages=list(args))
    model = pipeline.fit(data)
    return model.transform(data)

In [20]:
# Text-cleaning stages; each one consumes the output column of the previous.

# "text" column -> "document" annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence", split on the custom bounds in sentence_bound_set
sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setCustomBounds(sentence_bound_set)
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# "token" -> "normalized_token" (intended to lower-case the tokens — TODO confirm)
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized_token")
)

# "normalized_token" -> "lemma", using the dictionary file at LEMMA_SRC
lemmatizer = (
    Lemmatizer()
    .setInputCols(["normalized_token"])
    .setOutputCol("lemma")
    .setDictionary(LEMMA_SRC)
)
    
# sentiment detector: reads "lemma" and "sentence", writes "sentiment_score".
# The dictionary path comes from SENTIMENT_SRC, declared with the other
# Spark-NLP settings, instead of a path relative to the working directory;
# this mirrors how the lemmatizer takes LEMMA_SRC.
sentiment_detector = SentimentDetectorModel() \
    .setInputCols(["lemma", "sentence"]) \
    .setOutputCol("sentiment_score") \
    .setDictPath(SENTIMENT_SRC)

In [21]:
# Finisher region (planned: one finisher per pipeline; only the sentence one
# is defined here). Exposes "sentence" as "sentence_output" and keeps the
# annotation columns (clean-annotations is switched off).
finisher_sentence = (
    Finisher()
    .setInputCols(["sentence"])
    .setOutputCols(["sentence_output"])
    .setCleanAnnotations(False)
)

In [22]:
# Run the sentence-level pipeline on the single-file frame.
result = process_task(
    df_trump,
    document_assembler,
    sentence_detector,
    finisher_sentence,
)

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)

In [24]:
# Print the input schema first, then the schema of the pipeline output.
for frame in (df_trump, result):
    frame.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = t

In [25]:
# Preview the input rows first, then the rows of the pipeline output.
for frame in (df_trump, result):
    frame.show()

+--------------------+--------------+------------------+----------+-------------+------------------+--------------------+
|          created_at|favorite_count|            id_str|is_retweet|retweet_count|            source|                text|
+--------------------+--------------+------------------+----------+-------------+------------------+--------------------+
|Tue Jan 23 14:16:...|         43524|955806333667807232|     false|        10053|Twitter for iPhone|Thank you to Gene...|
|Tue Jan 23 13:34:...|         56437|955795912374267907|     false|        13019|Twitter for iPhone|Nobody knows for ...|
|Tue Jan 23 11:55:...|         78243|955771016319590400|     false|        23409|Twitter for iPhone|In one of the big...|
|Tue Jan 23 11:51:...|          3377|955769850022055936|     false|         1115|Twitter for iPhone|In one of the big...|
|Tue Jan 23 11:31:...|         75650|955764970590961665|     false|        17246|Twitter for iPhone|Even Crazy Jim Ac...|
|Tue Jan 23 04:30:...|  