In [11]:
#Imports
import os
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher

In [12]:
# Settings for the Spark application used by the rest of this notebook.
appname = "large_read_tar"
# NOTE(review): `master` is defined here but is never handed to the builder
# below, so the master URL comes from the environment - confirm the intent.
master = "local"

# Obtain the Spark session (getOrCreate), registered under `appname`.
spark = (
    SparkSession.builder
    .appName(appname)
    .getOrCreate()
)

In [13]:
# S3 locations of the Twitter dumps; only `trump_file` is loaded in this cell.
tar_data = 's3a://twitter-data-dump/test.tar'
data_folder = 's3a://twitter-data-dump/smallportion/'
trump_file = 's3a://twitter-data-dump/celebrities/trump.json'


# df = spark.read.json(data_folder)
# Read the JSON file into a Spark DataFrame.
df_trump = spark.read.json(trump_file)

# Root directory of the spark-nlp test resources (the lemma and sentiment
# dictionaries referenced by later cells are built as `resource_path + "..."`).
# Set the SPARK_NLP_RESOURCE_PATH environment variable to point at another
# checkout; when it is unset or empty the original location is used.
# os.path.join(..., '') guarantees the trailing separator that the later
# string concatenations depend on.
resource_path = os.path.join(
    os.environ.get('SPARK_NLP_RESOURCE_PATH')
    or '/home/ubuntu/Desktop/spark-nlp/src/test/resources/',
    '',
)

# To look at individual rows and their fields:
# rez = df_trump.take(10)
# rez_0 = rez[9]
# for item in rez_0:
#     print item

In [14]:
# spark-nlp pipeline stages. Every stage is configured with the column(s) it
# reads and the column it writes, so each result lands in the DataFrame.

# Raw "text" column -> "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

finisher_text = (
    Finisher()
    .setInputCols(["document"])
    .setOutputCols(["document_output"])
)

# "document" -> "sentence".
sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

finisher_sentence = (
    Finisher()
    .setInputCols(["sentence"])
    .setOutputCols(["sentence_output"])
)

# "sentence" -> "token".
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

finisher_tokenizer = (
    Finisher()
    .setInputCols(["token"])
    .setOutputCols(["token_output"])
)

# "token" -> "lemma", using the AntBNC lemma dictionary.
# Note: the name `lemmatizer` is rebound in a later cell of this notebook.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary(resource_path+"lemma-corpus/AntBNC_lemmas_ver_001.txt")
)

# Sentiment analysis takes two input columns: the lemmas and the sentences
# (the sentence provides the context the lemmas are scored in).
sentiment_detector = (
    SentimentDetectorModel()
    .setInputCols(["lemma", "sentence"])
    .setOutputCol("sentiment_score")
    .setDictPath(resource_path+"sentiment-corpus/default-sentiment-dict.txt")
)
    

In [52]:
# "token" -> "normalized_token".
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized_token")
)

# Rebinds `lemmatizer` from the earlier cell: this version reads the
# normalized tokens instead of the raw tokens.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["normalized_token"])
    .setOutputCol("lemma")
    .setDictionary(resource_path+"lemma-corpus/AntBNC_lemmas_ver_001.txt")
)

# Both finishers below write "lemma_output" and pass
# setCleanAnnotations(False) so that the intermediate annotation columns
# remain in the result. They differ only in the output-as-array and
# include-keys flags.
finisher_lemmatizer = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["lemma_output"])
    .setCleanAnnotations(False)
    .setOutputAsArray(True)
    .setIncludeKeys(True)
)

finisher_lemmatizer_noary = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["lemma_output"])
    .setCleanAnnotations(False)
    .setOutputAsArray(False)
    .setIncludeKeys(False)
)
    

In [53]:
# Building 2 pipelines
pipeline_lemmatizer = Pipeline(stages=[document_assembler, sentence_detector,tokenizer,
                            normalizer, lemmatizer,
                            finisher_lemmatizer])

model = pipeline_lemmatizer.fit(df_trump)
result = model.transform(df_trump)
result.printSchema()
#result.show()

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)

In [54]:
# Same stages as `pipeline_lemmatizer`, but finished with the
# non-array finisher.
pipeline_lemmatizer_noary = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        lemmatizer,
        finisher_lemmatizer_noary,
    ]
)

# Note: this rebinds `model`, which the previous pipeline cell also assigns.
model = pipeline_lemmatizer_noary.fit(df_trump)
result_noary = model.transform(df_trump)
result_noary.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)

In [55]:
# First row of the 'document' column from the non-array pipeline result.
result_noary.select('document').first()

Row(document=[Row(annotatorType=u'document', begin=0, end=257, result=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!', metadata={})])

In [56]:
# Select the 'document' column of the array pipeline result, keep the
# selection in a variable, and display its first row.
result_document = result.select('document')
result_document.first()

Row(document=[Row(annotatorType=u'document', begin=0, end=257, result=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!', metadata={})])

In [57]:
# First row of the 'sentence' column from the non-array pipeline result.
result_noary.select('sentence').first()

Row(sentence=[Row(annotatorType=u'document', begin=0, end=134, result=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done.', metadata={}), Row(annotatorType=u'document', begin=136, end=257, result=u'Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!', metadata={})])

In [58]:
# Select the 'sentence' column of the array pipeline result, keep the
# selection in a variable, and display its first row.
result_sentence = result.select('sentence')
result_sentence.first()

Row(sentence=[Row(annotatorType=u'document', begin=0, end=134, result=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done.', metadata={}), Row(annotatorType=u'document', begin=136, end=257, result=u'Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!', metadata={})])

In [59]:
# First row of the 'token' column from the non-array pipeline result.
result_noary.select('token').first()

Row(token=[Row(annotatorType=u'token', begin=0, end=4, result=u'Thank', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=6, end=8, result=u'you', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=10, end=11, result=u'to', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=13, end=19, result=u'General', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=21, end=24, result=u'John', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=26, end=31, result=u'Kelly,', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=33, end=35, result=u'who', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=37, end=38, result=u'is', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=40, end=44, result=u'doing', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=46, end=46, result=u'a', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=48, end=56, result=u'fantastic'

In [60]:
# Select the 'token' column of the array pipeline result, keep the
# selection in a variable, and display its first row.
result_token = result.select('token')
result_token.first()

Row(token=[Row(annotatorType=u'token', begin=0, end=4, result=u'Thank', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=6, end=8, result=u'you', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=10, end=11, result=u'to', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=13, end=19, result=u'General', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=21, end=24, result=u'John', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=26, end=31, result=u'Kelly,', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=33, end=35, result=u'who', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=37, end=38, result=u'is', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=40, end=44, result=u'doing', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=46, end=46, result=u'a', metadata={u'sentence': u'1'}), Row(annotatorType=u'token', begin=48, end=56, result=u'fantastic'

In [61]:
# First row of 'lemma_output' from the pipeline finished with
# setOutputAsArray(False).
result_noary.select('lemma_output').first()

Row(lemma_output=u'thank@you@to@general@john@kelly@who@be@do@a@fantastic@job@and@all@of@the@staff@and@other@in@the@white@house@for@a@job@well@do@long@hour@and@fake@report@make@you@job@more@difficult@but@it@be@always@great@to@win@and@few@have@win@more@than@we')

In [62]:
# Select 'lemma_output' from the pipeline finished with
# setOutputAsArray(True), keep the selection in a variable, and display its
# first row.
result_lemma_output = result.select('lemma_output')
result_lemma_output.first()

Row(lemma_output=[u'thank', u'you', u'to', u'general', u'john', u'kelly', u'who', u'be', u'do', u'a', u'fantastic', u'job', u'and', u'all', u'of', u'the', u'staff', u'and', u'other', u'in', u'the', u'white', u'house', u'for', u'a', u'job', u'well', u'do', u'long', u'hour', u'and', u'fake', u'report', u'make', u'you', u'job', u'more', u'difficult', u'but', u'it', u'be', u'always', u'great', u'to', u'win', u'and', u'few', u'have', u'win', u'more', u'than', u'we'])