In [1]:
#Imports
import sys
import datetime
sys.path.append('../../')

from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher
from pyspark.sql.functions import explode

from dateutil.parser import parse
# for tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# for schema
from pyspark.sql.types import *


In [2]:
# instantiate a spark context object
appname= "large_read_tar"
master="local"

# Create Spark Session
spark = SparkSession.builder.appName(appname)\
                .config("spark.cassandra.connection.host", "localhost")\
                .config("spark.cassandra.connection.port", "9042")\
                .getOrCreate()

In [3]:
zip_path = 's3a://twitter-data-dump/test.tar'
trump_json = 's3a://twitter-data-dump/celebrities/trump.json'
#large_tar = 's3a://twitter-data-dump/zip_dump/archiveteam-twitter-stream-2013-09.tar'
small_portion = 's3a://twitter-data-dump/smallportion/'


df = spark.read.json(small_portion)
df_trump = spark.read.json(trump_json)

resource_path ='/home/ubuntu/Desktop/spark-nlp/src/test/resources/'
#type(data) --> data frame


In [4]:
df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- delete: struct (nullable = true)
 |    |-- status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |    |    |-- user_id_str: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)


In [28]:
# Select interested col attributes
main_df = df.selectExpr('id AS tid',\
                        'user.id AS uid',\
                        'text AS tweet',\
                        'user.created_at AS creation_time',\
                        'user.time_zone AS time_zone',\
                        'user.followers_count AS followers_count',\
                        'user.friends_count AS friends_count',\
                        'place.name AS city_name',\
                        'place.country AS country_name',\
                        'entities.media.media_url AS media_ary',\
                        'entities.hashtags.text AS hashtag_ary',\
                        'retweet_count',\
                        'favorite_count'                        
                       ).where('tid is NOT NULL AND uid is NOT NULL')

main_df.printSchema()

root
 |-- tid: long (nullable = true)
 |-- uid: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- creation_time: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- media_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashtag_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- retweet_count: long (nullable = true)
 |-- favorite_count: long (nullable = true)



In [30]:
# Main DF column name set
col_exp_set = ['tid','uid','tweet','creation_time',
               'time_zone','followers_count',
               'friends_count','city_name','country_name',
               'media_ary','hashtag_ary','retweet_count',
               'favorite_count',
              ]


root
 |-- tid: long (nullable = true)
 |-- uid: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- creation_time: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- media_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashtag_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- retweet_count: long (nullable = true)
 |-- favorite_count: long (nullable = true)



In [31]:
base_rdd = main_df.rdd
base_rdd.take(1)

[Row(tid=362814954803433472, uid=194531988, tweet=u'Psalm 119:133', creation_time=u'Fri Sep 24 11:04:40 +0000 2010', time_zone=u'London', followers_count=140, friends_count=204, city_name=None, country_name=None, media_ary=None, hashtag_ary=[], retweet_count=0, favorite_count=0)]

In [51]:
# return 2 items: sentence_count,  <word_tuple>
def process_tweet(description):
    # base case
    if description is None or description == "":
        return (0,[])

    stemmer = SnowballStemmer("english")
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = []
 
    words = tokenizer.tokenize(description)
    stopWords = set(stopwords.words('english'))
    for w in words:
        if w not in stopWords:
            word_list.append(stemmer.stem(w.lower()))

    return (len(sent_tokenize(description)),word_list)


In [52]:
# parse twitter time string to (date_str, timestamp_str, hour int)
# note later only timestamp_str is changed to asTYpe
def parse_time(creation_time):  
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        hour = dt.strftime("%H")
        return (str(dt.date()), str(dt),int(hour))
    
    dt = None
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# Mapping: 1. 13 cols to 15 cols (word_list, date, 
#          timestamp, hour)
#          2. (media_ary-> media_attached; tag_ary -> tag_count)
def map_row(row):
    # return if image attached
    def check_image(media_ary):
        if media_ary is None or\
           len(media_ary) == 0:
            return False
        return True
    
    # count # of hashtags
    def count_tag(tag_ary):
        if tag_ary is None:
            return 0
        return len(tag_ary)
    
    # create a list of tuples: [(str1, like_num),] or [(str1, ret_num)] 
    def create_tuple_list(vector, num):
        result = [num]
        for item in vector:
            result.append(item)
        if len(result) == 0:
            return []
        return result
        
    try:       
        tid = row.tid
        uid = row.uid
        tweet = row.tweet
        retweet_count = row.retweet_count
        favorite_count = row.favorite_count
        # Map token: sentence_count, word_list
        sentence_count, word_list\
            = process_tweet(tweet)
        # Map Time
        creation_time = row.creation_time
        date, timestamp, hour\
            = parse_time(creation_time)
        time_zone = row.time_zone
        followers_count = row.followers_count
        friends_count = row.friends_count
        city_name = row.city_name
        country_name = row.country_name
        media_ary = row.media_ary
        # boolean
        media_attached = check_image(media_ary)        
        hashtag_ary = row.hashtag_ary
        # count tags
        tag_count = count_tag(hashtag_ary)
                
    except Exception as e:
        # 1. log e and 2.return default        
        return Row(tid=-1, uid=-1,
            followers_count=-1,
            friends_count=-1,
            tweet='n/a',retweet_count=-1,
            favorite_count=-1,sentence_count=-1,
            word_list=[],
            phrase_list=['n/a'],
            date='2000-1-1',timestamp='00:00:00',hour=0,
            time_zone='n/a',city_name='n/a',
            country_name='n/a',
            media_attached=False,tag_count=tag_count
            )
    
    r = Row(tid=tid, uid=uid,
            followers_count=followers_count,
            friends_count=friends_count,
            tweet=tweet,retweet_count=retweet_count,
            favorite_count=favorite_count,sentence_count=sentence_count,
            word_list=word_list,
            phrase_list=['n/a'],
            date=date,timestamp=timestamp,hour=hour,
            time_zone=time_zone,city_name=city_name,
            country_name=country_name,
            media_attached=media_attached,tag_count=tag_count
            )
    return r 
    
base_map_rdd = base_rdd.map(map_row)    

In [59]:
# Main DF column name set
col_exp_set = ['tid','uid','tweet','followers_count',
               'friends_count','retweet_count',
               'favorite_count','sentence_count',
               'word_list',
               'phrase_rt',
               'date','timestamp','hour',
               'time_zone','city_name','country_name',
               'media_attached','tag_count'
              ]
# Sanity Check
r = base_map_rdd.take(1)[0]
r
# len(col_exp_set) --> 20

Row(city_name=None, country_name=None, date='2010-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133'])

In [58]:
len(r)

18

In [None]:

# Stage 1

# Result type: (arg1={(k1,c1),(k2, c2)}, arg2=total_tweets)
# mapping function for bm_rdd, 
def seqOp(result, row):
    word_list = row[17]
    fav_count = row[3]
    retweet_count = row[9]
    
    # sanity check for word_list_fav could be empty
    if len(word_list_fav) > 0:
        

    
    # increment tweet count
    result[1] + 1
    return result

# aggregate each partition, p1 refers to partition result
def comOp(p1, p2):
    pass



In [None]:
# Stage 2
# Mapping function that sorts the results: easier to do final sort 
# instead of local sort

In [46]:
# compute when group by uid with bm_rdd 
def compute_uid(bm_rdd):
    
    pass

In [219]:
def ary_type(col_name, col_type):
    return StructField(col_name, ArrayType(StringType(), containsNull=True), True)
# build the table schema
def build_schema():
    schema = \
    StructType([
        StructField("city_name",StringType(), True),
        StructField("country_name",StringType(), True),
        StructField("creation_date",StringType(), True),
        StructField("creation_hour",IntegerType(), True),         
        StructField("creation_timestamp",StringType(), True),       
        StructField("followers_count",IntegerType(), True),
        StructField("friends_count",IntegerType(), True),
        StructField("hashtag_count",IntegerType(), True),
        StructField("media_attached",BooleanType(), False),
        StructField("phrase_token",ArrayType(StringType(), containsNull=True),
                    True),        
        StructField("sentence_count",IntegerType(), True),         
        StructField("tid",LongType(), False),  
        StructField("time_zone",StringType(), True),         
        StructField("tweet",StringType(), True),
        StructField("uid",LongType(), False),
        StructField("word_token_set",ArrayType(StringType(), containsNull=True),
                    True),                     
    ])
    return schema

In [220]:
# fit rdd with schema
schema = build_schema()
df_uncasted = spark.createDataFrame(base_map_rdd, schema)
df_uncasted.printSchema()

root
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [221]:
df_uncasted.first()

Row(city_name=None, country_name=None, creation_date=u'2010-09-24', creation_hour=11, creation_timestamp=u'2010-09-24 11:04:40+00:00', followers_count=140, friends_count=204, hashtag_count=0, media_attached=False, phrase_token=[u'n/a'], sentence_count=1, tid=362814954803433472, time_zone=u'London', tweet=u'Psalm 119:133', uid=194531988, word_token_set=[u'psalm', u'119', u'133'])

In [224]:
df_final = df_uncasted.select('creation_date',
                              'creation_hour',
                              df_uncasted.creation_timestamp.astype(TimestampType()),
                              'city_name',
                              'country_name',
                              'followers_count',
                              'friends_count',
                              'hashtag_count',
                              'media_attached',
                              'phrase_token',
                              'sentence_count',
                              'tid',
                              'time_zone',
                              'tweet',
                              'uid',
                              'word_token_set'
                             )
df_final.printSchema()

root
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [229]:
df_final.take(1)

part = df_final.limit(100)
type(part)

pyspark.sql.dataframe.DataFrame

In [230]:
part.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table='b0',keyspace='twitter').save() 
    

In [231]:
def save_data_frame(df, table_name):
    df.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table=table_name,keyspace='twitter').save()    

In [232]:
save_data_frame(df_final,'b0')

In [234]:
# Region to parse and count words! ==> start from base_map_rdd
# Region to parse and count words! ==> start from base_map_rdd
# Region to parse and count words! ==> start from base_map_rdd
# Region to parse and count words! ==> start from base_map_rdd
# Region to parse and count words! ==> start from base_map_rdd
base_map_rdd.take(2)[1]

Row(city_name=None, country_name=None, creation_date='2012-11-02', creation_hour=21, creation_timestamp='2012-11-02 21:07:21+00:00', followers_count=199, friends_count=147, hashtag_count=0, media_attached=False, phrase_token=['n/a'], sentence_count=1, tid=362814954803429378, time_zone=u'Pacific Time (US & Canada)', tweet=u"so i want to pee but my dog fell asleep on my lap and she looks so cute and i don't want to wake her, ugh the struggle", uid=921796843, word_token_set=[u'want', u'pee', u'dog', u'fell', u'asleep', u'lap', u'look', u'cute', u'want', u'wake', u'ugh', u'struggl'])

In [240]:
type(base_map_rdd)
base_map_rdd.take(1)

[Row(city_name=None, country_name=None, creation_date='2010-09-24', creation_hour=11, creation_timestamp='2010-09-24 11:04:40+00:00', followers_count=140, friends_count=204, hashtag_count=0, media_attached=False, phrase_token=['n/a'], sentence_count=1, tid=362814954803433472, time_zone=u'London', tweet=u'Psalm 119:133', uid=194531988, word_token_set=[u'psalm', u'119', u'133'])]

In [249]:
# fcn that get uid
def get_uid(bmrdd_row):
        pass
# pipedlined rdd -> df -> rdd (work around for now)  
bm_rdd = base_map_rdd.toDF().rdd
base_map_rdd.toDF().printSchema()

root
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- creation_hour: long (nullable = true)
 |-- creation_timestamp: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- hashtag_count: long (nullable = true)
 |-- media_attached: boolean (nullable = true)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: long (nullable = true)
 |-- tid: long (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = true)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [248]:
# map each token as a tuple 
def map_token_count()

AttributeError: 'RDD' object has no attribute 'printSchema'

In [None]:
# Scrach Space: Previous Tests and Etc

In [111]:

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# filter and return list of lower cased words 
def clean_stem_tweet(description):
    # base case
    if description is None or description == "":
        return []

    stemmer = SnowballStemmer("english")
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = []
    words = tokenizer.tokenize(description)
    stopWords = set(stopwords.words('english'))
    for w in words:
        if w not in stopWords:
            word_list.append(stemmer.stem(w.lower()))
            
    return word_list


In [115]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."

r = clean_stem_tweet(data)
print(r)
phrases = sent_tokenize('data')
print(len(phrases))
words = word_tokenize(data)
 
print('phrases', phrases)

print("words:" , words)

[u'all', u'work', u'play', u'make', u'jack', u'dull', u'boy', u'all', u'work', u'play', u'make', u'jack', u'dull', u'boy']
1
('phrases', ['data'])
('words:', ['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.'])


In [39]:
# data = [1, 2, 3, 4, 5]
# distData = sc.parallelize(data)
# type(distData)
# distData.take(3)

[1, 2, 3]

In [50]:
from dateutil.parser import parse
import datetime

# parse twitter time string to (y-m-d,h-m-s)
def parse_time(creation_time):    
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        time = dt.strftime("%H:%M:%S")
        return (str(dt.date()), time)
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# rez = None
# creation_time = u'Fri Sep 24 11:04:40 +0000 2010'
# try:
#     rez = parse(creation_time)
# except ValueError:
#     rez = 'default'
# except OverflowError:
#     rez = 'default'
# rez


datetime.datetime(2010, 9, 24, 11, 4, 40, tzinfo=tzlocal())

In [65]:
print (rez.date())
print(rez.time())


2010-09-24
11:04:40
11:04:40


In [54]:


dt = datetime.datetime.now()
print(dt.date())
print(dt.time())
m = dt.strftime("%H:%M:%S")
print(m)
dt

2018-01-26
09:27:15.881532
09:27:15


datetime.datetime(2018, 1, 26, 9, 27, 15, 881532)

In [None]:
t1 = spark.createDataFrame([Row(a=1, b=[1,2,3],c=[7,8,9], d='foo')])

In [73]:
t1.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- c: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- d: string (nullable = true)



In [15]:
t1 = t.rdd
t2 = t1.map(lambda x: x[0])
t2.take(8)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 5.0 failed 1 times, most recent failure: Lost task 3.0 in stage 5.0 (TID 73, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$dfAssembleNoExtras$1: (string) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
	at com.johnsnowlabs.nlp.DocumentAssembler.com$johnsnowlabs$nlp$DocumentAssembler$$assemble(DocumentAssembler.scala:52)
	at com.johnsnowlabs.nlp.DocumentAssembler$$anonfun$dfAssembleNoExtras$1.apply(DocumentAssembler.scala:72)
	at com.johnsnowlabs.nlp.DocumentAssembler$$anonfun$dfAssembleNoExtras$1.apply(DocumentAssembler.scala:71)
	... 13 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:455)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$dfAssembleNoExtras$1: (string) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.NullPointerException
	at com.johnsnowlabs.nlp.DocumentAssembler.com$johnsnowlabs$nlp$DocumentAssembler$$assemble(DocumentAssembler.scala:52)
	at com.johnsnowlabs.nlp.DocumentAssembler$$anonfun$dfAssembleNoExtras$1.apply(DocumentAssembler.scala:72)
	at com.johnsnowlabs.nlp.DocumentAssembler$$anonfun$dfAssembleNoExtras$1.apply(DocumentAssembler.scala:71)
	... 13 more


In [None]:
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD
#Region for NLP Module: --> Non-working to RDD

#spark-nlp pipeline --> each can be included in the data frame
document_assembler = DocumentAssembler() \
    .setInputCol("tweet")\
    .setOutputCol("document")

sentence_detector = SentenceDetectorModel() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = RegexTokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized_token")     
    
lemmatizer = Lemmatizer() \
    .setInputCols(["normalized_token"]) \
    .setOutputCol("lemma") \
    .setDictionary(resource_path+"lemma-corpus/AntBNC_lemmas_ver_001.txt")    
          
# sentiment analysis requires 2 arguments: lemman and sentence to determine 
# the context of that particular sentence
sentiment_detector = SentimentDetectorModel() \
    .setInputCols(["lemma", "sentence"]) \
    .setOutputCol("sentiment_score") \
    .setDictPath(resource_path+"sentiment-corpus/default-sentiment-dict.txt")
    

In [8]:
# Set CleanAnnotation False to have columns of intermediate data column!    
finisher_lemmatizer = Finisher() \
    .setInputCols(["sentiment_score"]) \
    .setOutputCols(["sentiment_score"])\
    .setCleanAnnotations(False)\
    .setIncludeKeys(False)
    

In [9]:
# Building 2 pipelines
pipeline_lemmatizer = Pipeline(stages=[document_assembler, sentence_detector,tokenizer,
                            normalizer, lemmatizer, sentiment_detector,
                            finisher_lemmatizer])

model = pipeline_lemmatizer.fit(main_df)
nlp_df = model.transform(main_df)
nlp_df.printSchema()

root
 |-- tid: long (nullable = true)
 |-- uid: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- creation_time: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- media_ary: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- media_url: string (nullable = true)
 |    |    |-- media_url_https: string (nullable = true)
 |    |    |-- sizes: struct (nullable = true)
 |    |    |    |-- large: struct (nullable = true)
 |    |    |    |    |-- h: long (nul