In [1]:
#Imports
import sys
import datetime
import re
from datetime import timedelta

sys.path.append('../../')

from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline

# from sparknlp.annotator import *
# from sparknlp.common import RegexRule
# from sparknlp.base import DocumentAssembler, Finisher
from pyspark.sql.functions import explode

from dateutil.parser import parse
# for tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag, RegexpParser

# for schema
from pyspark.sql.types import *

# for testing 
import random


In [2]:
# instantiate a spark context object
appname= "spark-nlp-test"
master="local"

# Create Spark Session
spark = SparkSession.builder.appName(appname)\
                .config("spark.cassandra.connection.host", "localhost")\
                .config("spark.cassandra.connection.port", "9042")\
                .config("spark.eventLog.enabled", True)\
                .config("spark.eventLog.dir", "/home/ubuntu/spark_tmp/")\
                .getOrCreate()

In [3]:
zip_path = 's3a://twitter-data-dump/test.tar'
trump_json = 's3a://twitter-data-dump/celebrities/trump.json'
#large_tar = 's3a://twitter-data-dump/zip_dump/archiveteam-twitter-stream-2013-09.tar'
small_portion = 's3a://twitter-data-dump/smallportion/'


df = spark.read.json(small_portion)
df_trump = spark.read.json(trump_json)

resource_path ='/home/ubuntu/Desktop/spark-nlp/src/test/resources/'
#type(data) --> data frame

In [4]:
df_trump.printSchema()
df_trump.take(10)[5]

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)



Row(created_at=u'Tue Jan 23 04:30:33 +0000 2018', favorite_count=103887, id_str=u'955658992793149440', is_retweet=False, retweet_count=23224, source=u'Twitter for iPhone', text=u'Big win for Republicans as Democrats cave on Shutdown. Now I want a big win for everyone, including Republicans, Democrats and DACA, but especially for our Great Military and Border Security. Should be able to get there. See you at the negotiating table!')

In [13]:
# trump data frame to base_df
# note i split the data into 666 and 1666 id set
def trump_mapper(row):
    # set default trump as 666
    uid = 666    
    tid = row[2]
    creation_time = row[0]
    retweet_count = row[4]    
    favorite_count = row[1]
    tweet = row[-1]
    if len(tweet) > 200:
        uid = 1666
    
    time_zone = 'blabla'
    followers_count='100'
    friends_count= '2000'
    city_name='Washington'
    country_name = 'U.S'
    media_ary = []
    hashtag_ary = []
    return (tid, uid, tweet, creation_time,
            time_zone, followers_count,
            friends_count, city_name, country_name,
            media_ary, hashtag_ary, retweet_count,
            favorite_count )
        
trump_rdd = df_trump.rdd.map(trump_mapper)

In [14]:
trump_rdd.take(10)[0]

(u'955806333667807232',
 1666,
 u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!',
 u'Tue Jan 23 14:16:02 +0000 2018',
 'blabla',
 '100',
 '2000',
 'Washington',
 'U.S',
 [],
 [],
 10053,
 43524)

In [8]:
df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- delete: struct (nullable = true)
 |    |-- status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |    |    |-- user_id_str: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)


In [188]:
df.first()

Row(contributors=None, coordinates=None, created_at=u'Thu Aug 01 06:00:00 +0000 2013', delete=None, entities=Row(hashtags=[], media=None, symbols=[], urls=[], user_mentions=[]), favorite_count=0, favorited=False, filter_level=u'medium', geo=None, id=362814954803433472, id_str=u'362814954803433472', in_reply_to_screen_name=None, in_reply_to_status_id=None, in_reply_to_status_id_str=None, in_reply_to_user_id=None, in_reply_to_user_id_str=None, lang=u'en', place=None, possibly_sensitive=None, retweet_count=0, retweeted=False, retweeted_status=None, source=u'<a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry\xae</a>', text=u'Psalm 119:133', truncated=False, user=Row(contributors_enabled=False, created_at=u'Fri Sep 24 11:04:40 +0000 2010', default_profile=False, default_profile_image=False, description=u'Street is the name, hustle is the game, money is the motive, and God is the way.. #TeamBarca  #TeamOOU #TeamUgly #TeamPotential', favourites_count=358, follow_req

In [11]:
# Select interested col attributes
main_df = df.selectExpr('id AS tid',\
                        'user.id AS uid',\
                        'text AS tweet',\
                        'user.created_at AS creation_time',\
                        'user.time_zone AS time_zone',\
                        'user.followers_count AS followers_count',\
                        'user.friends_count AS friends_count',\
                        'place.name AS city_name',\
                        'place.country AS country_name',\
                        'entities.media.media_url AS media_ary',\
                        'entities.hashtags.text AS hashtag_ary',\
                        'retweet_count',\
                        'favorite_count'                        
                       ).where('tid is NOT NULL AND uid is NOT NULL')

main_df.printSchema()

root
 |-- tid: long (nullable = true)
 |-- uid: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- creation_time: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- media_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashtag_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- retweet_count: long (nullable = true)
 |-- favorite_count: long (nullable = true)



In [12]:
# Main DF column name set
col_exp_set = ['tid','uid','tweet','creation_time',
               'time_zone','followers_count',
               'friends_count','city_name','country_name',
               'media_ary','hashtag_ary','retweet_count',
               'favorite_count',
              ]


In [167]:
main_df.first()

Row(tid=362814954803433472, uid=194531988, tweet=u'Psalm 119:133', creation_time=u'Fri Sep 24 11:04:40 +0000 2010', time_zone=u'London', followers_count=140, friends_count=204, city_name=None, country_name=None, media_ary=None, hashtag_ary=[], retweet_count=0, favorite_count=0)

In [13]:
###### Debuggging 3 records only

#base_rdd = main_df.limit(3).rdd
#base_rdd.count()
base_rdd = main_df.rdd
base_rdd.take(1)

[Row(tid=362814954803433472, uid=194531988, tweet=u'Psalm 119:133', creation_time=u'Fri Sep 24 11:04:40 +0000 2010', time_zone=u'London', followers_count=140, friends_count=204, city_name=None, country_name=None, media_ary=None, hashtag_ary=[], retweet_count=0, favorite_count=0)]

In [81]:
# Modfication for single word extraction as well
# Modfication for single word extraction as well
# Modfication for single word extraction as well

# extract word_list(exclusing noun+stopwords) and noun phrase list
def extract_word_phrase(tokens):
    tokens_tag = pos_tag(tokens)
    
    stemmer = SnowballStemmer("english")
    stem_tokens = [stemmer.stem(token) for token in tokens]
    stem_tokens_tag = pos_tag(stem_tokens)    
    stopWords = set(stopwords.words('english'))        
    
    word_list = [word for word, tag in stem_tokens_tag if tag != 'NN'\
                 and word not in stopWords]
    
    # set up for entity extraction 
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    cp = RegexpParser(grammar)
    np_ary = []
    
    root_tree = cp.parse(tokens_tag)
    for i in list( root_tree.subtrees(filter=lambda x: x.label() == 'NP')):
        st = ""
        for t in i.leaves():
            st = st + t[0] + " "
        word_list.append(st.strip().lower())
    # entity extracted   
    
    # combine them together
    #    np_ary = np_ary + word_list
    return (word_list, np_ary)
    
# return 2 items: sentence_count,  <word_tuple>, <name_entity>
def process_tweet(description):
    # base case
    if description is None or description == "":
        return (0,[],[])
    # filter out http items
    description =\
        re.sub(r'https:[^ ]*', '', description, flags=re.IGNORECASE)

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(description)    
    # extract word_list, entity_list
    word_list, entity_list = extract_word_phrase(words)
    
#     word_list = []
#     stopWords = set(stopwords.words('english'))
#     for w in words:
#         if w not in stopWords:
#             word_list.append(stemmer.stem(w.lower()))

    return (len(sent_tokenize(description)),word_list, entity_list)


In [82]:
# parse twitter time string to (date, timestamp_str, hour int)
# note later only timestamp_str is changed to asTYpe
def parse_time(creation_time):  
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        hour = dt.strftime("%H")
        # debug:
        base_date = datetime.date(2008,4,1)
        days_count = random.randint(1,3000)
        dt = base_date + timedelta(days=days_count)
        return (dt, str(dt),int(hour))       
        #return (dt.date(), str(dt),int(hour))
    
    dt = None
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# Mapping: 1. 13 cols to 15 cols (word_list, date, 
#          timestamp, hour)
#          2. (media_ary-> media_attached; tag_ary -> tag_count)
def map_nlp_row(row):
    # return if image attached
    def check_image(media_ary):
        if media_ary is None or\
           len(media_ary) == 0:
            return False
        return True
    
    # count # of hashtags
    def count_tag(tag_ary):
        if tag_ary is None:
            return 0
        return len(tag_ary)
    
    # create a list of tuples: [(str1, like_num),] or [(str1, ret_num)] 
    def create_tuple_list(vector, num):
        result = [num]
        for item in vector:
            result.append(item)
        if len(result) == 0:
            return []
        return result
        
#     try:       
    tid = row[0]
    uid = row[1]
    tweet = row[2]
    retweet_count = row[11]
    favorite_count = row[12]
    # Map token: sentence_count, word_list
    sentence_count, word_list, entity_list\
        = process_tweet(tweet)
    # Map Time
    creation_time = row[3]
    date, timestamp, hour\
        = parse_time(creation_time)
    time_zone = row[4]
    followers_count = row[5]
    friends_count = row[6]
    city_name = row[7]
    country_name = row[8]
    media_ary = row[9]
    # boolean
    media_attached = check_image(media_ary)        
    hashtag_ary = row[10]
    # count tags
    tag_count = count_tag(hashtag_ary)
                
#     except Exception as e:
#         # 1. log e and 2.return default        
#         return Row(tid=-1, uid=-1,
#             followers_count=-1,
#             friends_count=-1,
#             tweet='n/a',retweet_count=-1,
#             favorite_count=-1,sentence_count=-1,
#             word_list=[], entity_list=[]
#             date=datetime.date(2001, 1, 1),timestamp='00:00:00',hour=0,
#             time_zone='n/a',city_name='n/a',
#             country_name='n/a',
#             media_attached=False,tag_count=-1
#             )
    
    r = Row(tid=tid, uid=uid,
            followers_count=followers_count,
            friends_count=friends_count,
            tweet=tweet,retweet_count=retweet_count,
            favorite_count=favorite_count,sentence_count=sentence_count,
            word_list=word_list, entity_list=entity_list,
            date=date,timestamp=timestamp,hour=hour,
            time_zone=time_zone,city_name=city_name,
            country_name=country_name,
            media_attached=media_attached,tag_count=tag_count
            )
    return r 
    
#base_nlp_rdd = base_rdd.map(map_nlp_row)    

In [61]:
# Sanity Check
base_nlp_rdd = base_rdd.map(map_nlp_row) 
r = base_nlp_rdd.take(1)

# index order
r

NameError: name 'base_rdd' is not defined

In [9]:
# base_mapped_rdd: all attributes after NLP
# base_mapped_rdd: all attributes after NLP
i_city_name=0
i_country_name=1
i_date=2
i_entity_list = 3
i_favorite_count=4
i_followers_count=5
i_friends_count=6,
i_hour=7
i_media_attached=8
i_retweet_count=9
i_sentence_count=10
i_tag_count=11
i_tid=12
i_time_zone=13
i_timestamp=14
i_tweet=15
i_uid=16
i_word_list=17


In [10]:
# get only interested attributes from base_rdd
# get only interested attributes from base_rdd
# get only interested attributes from base_rdd
# get only interested attributes from base_rdd
def get_time_map(row):
    uid_i = i_uid
    tid_i = i_tid
    rt_c_i = i_retweet_count
    fav_c_i = i_favorite_count
    word_list_i = -1
    date = i_date
    # order
    return (row[uid_i],row[tid_i],row[date],
            row[rt_c_i],
            row[fav_c_i], row[word_list_i])

In [212]:
base_nlp_rdd.take(1)

[Row(city_name=None, country_name=None, date=datetime.date(2014, 8, 9), entity_list=[u'psalm'], favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2014-08-09', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133'])]

In [213]:
time_rdd = base_nlp_rdd.map(get_time_map)
time_rdd.take(1)

[(194531988,
  362814954803433472,
  datetime.date(2014, 8, 9),
  0,
  0,
  [u'psalm', u'119', u'133'])]

In [79]:
# Trump Interface --> map trump_rdd to language processing
trump_nlp_rdd = trump_rdd.map(map_nlp_row)
# get trump_time_rdd
trump_time_rdd = trump_nlp_rdd.map(get_time_map)

# interface changes
time_rdd = trump_time_rdd

In [68]:
trump_time_rdd.take(20)[16]

(666,
 u'954901073587834882',
 datetime.date(2009, 7, 6),
 6175,
 44091,
 [u'eric', u'trump', u'judgejeanin', u'foxnew'])

In [216]:
# change df to have key, value with min_date at the end
mtime_rdd = trump_time_rdd.keyBy(lambda x: x[0]).reduceByKey(lambda x, y: x if x[2] < y[2] else y)
# (key, min_date)
mtime_2 = mtime_rdd.mapValues(lambda x: x[2])
# group min_date together 
min_time_df = time_rdd.keyBy(lambda x: x[0]).join(mtime_2).mapValues(lambda x: x[0]+(x[1],) )
#min_time_df.take(1)

# key: uid
# value: uid, tid, date, rt_count, fav_count, word_list, min_date

In [217]:
min_time_df.take(1)

[(1666,
  (1666,
   u'955806333667807232',
   datetime.date(2013, 9, 9),
   10053,
   43524,
   [u'thank',
    u'general',
    u'john',
    u'kelli',
    u'fantast',
    u'staff',
    u'white',
    u'hous',
    u'well',
    u'done',
    u'long',
    u'hour',
    u'fake',
    u'make',
    u'difficult',
    u'alway',
    u'great',
    u'win',
    u'us'],
   datetime.date(2008, 5, 9)))]

In [75]:
import heapq
    
# get top_6 for input_dic
# return a tuple of 6 entries
def get_top_6_dic(input_dic):        
    top_list = heapq.nlargest(20, input_dic.items(), key=lambda x: x[1])     
    return top_list

# time_interval dic row
def get_top_6_ti(row):
    rez_dic = {}
    time_dic = row[1] 
    # key is date, v is dic_set
    for k, v in time_dic.iteritems():
        rt_top = get_top_6_dic(v[0])
        fav_top = get_top_6_dic(v[1])
        rez_dic.update( {k: (rt_top, fav_top)})
    return rez_dic

# max for top_6 through all time
def get_top_6_alltime(row):
    print(len(row))
    rt_dic =  row[1]
    fav_dic = row[2]
    rt_top = get_top_6_dic(rt_dic)
    fav_top = get_top_6_dic(fav_dic)
    
    return ( rt_top, fav_top)
    
    


In [36]:
# 30-day time interval split, c_date vs date_base (earliest date)
def get_relative_date(date_c, date_b):
    delta = date_c - date_b
    round_days = delta.days - delta.days% 30
    relative_date = date_b + timedelta(days=round_days)
    # return string representation for storage
    return str(relative_date)

# delta = datetime.date(2018,1,29) - datetime.date(2010,9,24)
# round_days = delta.days - delta.days % 30
# relative_date = datetime.date(2010,9,24) + timedelta(days=round_days)
# str(relative_date)
# '2018-01-15'

In [23]:
# helper-function for combineByKey time! 
def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)

# add factor of 2 for len(words)
def create_dic_set(row, rt_i=3, fav_i=4, word_list_i=5):
    rt_dic = {}
    fav_dic = {}
    rt_c = row[rt_i]
    fav_c = row[fav_i]
    rt_c_double = rt_c * 2
    fav_c_double = fav_c * 2    
    word_list = row[word_list_i]
    
    if rt_c > 0 and fav_c > 0:
        for w in word_list:
            if " " in w:
                add_to_dic(rt_dic, w, rt_c_double)
                add_to_dic(fav_dic, w, fav_c_double)
            else:
                add_to_dic(rt_dic, w, rt_c)
                add_to_dic(fav_dic, w, fav_c)
    elif rt_c > 0:
        for w in word_list:
            if " " in w:
                add_to_dic(rt_dic, w, rt_c_double)
            else:
                add_to_dic(rt_dic, w, rt_c)
    elif fav_c > 0 :
        for w in word_list:
            if " " in w:
                add_to_dic(fav_dic, w, fav_c_double)
            else:
                add_to_dic(fav_dic, w, fav_c)
    # return dic_set
    return (rt_dic, fav_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

In [38]:
# uid_i = 0, tid_i = 1, date_i = 2, 
# rt_c_i = 3, # fav_c_i = 4, word_list_i = 5, min_date = -1

# new value is of format dic{ k_date0: (rt_dic, fav_dic),
#                             k_date1: (rt_dic, fav_dic)}

# process info for cCombiner_t
def parse_row_t(row):
    date = row[2]
    m_date = row[-1]
    # get k_date for dic
    k_date = get_relative_date(date, m_date)   
    tid = row[1]
    rt_dic, fav_dic = create_dic_set(row)
    
    return (tid, k_date, rt_dic, fav_dic)
    
def cCombiner_t(row):
    # handle row
    tid, k_date, rt_dic, fav_dic =\
        parse_row_t(row)
    
    # create dic by time
    dic_by_time = {k_date:(rt_dic, fav_dic)}    
    # create a list of tid
    return ([tid], dic_by_time)

# merges 2 dic together --> because of double pointer crap
def merge_dic_large(large_dic, small_dic):
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})

# which merges V into C
def mValue_t(new_row, row):
    tid, k_date, rt_dic, fav_dic =\
        parse_row_t(row)
    # get dic_by_time    
    dic_by_time = new_row[1]    
    
    # combine the dic_set together
    if k_date in dic_by_time:
        dic_set = dic_by_time.get(k_date)
        merge_dic_large(dic_set[0],rt_dic)
        merge_dic_large(dic_set[1],fav_dic)
    else:
        dic_by_time.update({k_date:(rt_dic, fav_dic)})
    
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, dic_by_time)

# combine two C's (new row)
def mCombiners_t(r1, r2):
    list_merge = r1[0] + r2[0]
    # merge dic_by_time
    if len(r1[1]) < len(r2[1]):
        small_t_dic = r1[1]
        large_t_dic = r2[1]
    else:
        small_t_dic = r2[1]
        large_t_dic = r1[1]
        
    for k, v in small_t_dic:
        if k in large_t_dic:
            l_dic_set = large_t_dic[1]
            l_dic_set[0] = merge_dic(v[0],l_dic_set[0])
            l_dic_set[1] = merge_dic(v[1], l_dic_set[1])
        else:
            large_t_dic.update({k:v})
                                
    return (list_merge, large_t_dic)

In [39]:
# testing time interval aggregation!!!
# testing time interval aggregation!!!
# testing time interval aggregation!!!

rez = min_time_df.combineByKey(cCombiner_t, mValue_t, mCombiners_t)

In [117]:
rez.take(1)

[(1666,
  ([u'955806333667807232',
    u'955795912374267907',
    u'955658992793149440',
    u'955056249925750784',
    u'954878124214415360',
    u'954843844402647040',
    u'954788467069870081',
    u'954680914998648833',
    u'954674157144477696',
    u'954541219970977793',
    u'954478044487520257',
    u'954456754137501697',
    u'954323750949982208',
    u'954097213608570880',
    u'954092417250222082',
    u'953979393180950528',
    u'953973568035086336',
    u'953951365532876800',
    u'953948941674078208',
    u'953796944564031489',
    u'953772162665590787',
    u'953771038114045954',
    u'953768657221451776',
    u'953270558573154305',
    u'953267506004754432',
    u'952887520790040576',
    u'952540700683497472',
    u'952538350333939713',
    u'952525384242876416',
    u'952301373479104512',
    u'952183452366929920',
    u'952166643202916352',
    u'951875499537641472',
    u'951813216291708928',
    u'951790999784783872',
    u'951788342647107584',
    u'95178558776528

In [118]:
# Good !!! feature implemented
rez.mapValues(get_top_6_ti).take(1)

[(1666,
  {'2008-05-05': ([(u'no trust', (73220, 1)),
     (u'haitians probably', (73220, 1)),
     (u'anything derogatory', (73220, 1)),
     (u'troubled country never', (73220, 1)),
     (u'dems', (73220, 1)),
     (u'haiti', (73220, 1))],
    [(u'no trust', (277600, 1)),
     (u'haitians probably', (277600, 1)),
     (u'anything derogatory', (277600, 1)),
     (u'troubled country never', (277600, 1)),
     (u'dems', (277600, 1)),
     (u'haiti', (277600, 1))]),
   '2008-06-04': ([(u'different reasons', (38414, 1)),
     (u'republicans', (38414, 1)),
     (u'remember republicans', (38414, 1)),
     (u'this year', (38414, 1)),
     (u'foxandfriends', (38414, 1)),
     (u'congressional races', (38414, 1))],
    [(u'different reasons', (176416, 1)),
     (u'republicans', (176416, 1)),
     (u'remember republicans', (176416, 1)),
     (u'this year', (176416, 1)),
     (u'foxandfriends', (176416, 1)),
     (u'congressional races', (176416, 1))]),
   '2008-07-04': ([(u'military amp veteran

In [19]:
# Do total count computing here: map selected ones only
# take selected columns from base_map_rdd
def map_total_cols(row):
    uid_i = i_uid
    tid_i = i_tid   
    rt_c_i = i_retweet_count
    fav_c_i = i_favorite_count
    word_list_i = i_word_list
    
    return (row[uid_i],row[tid_i],row[rt_c_i],
            row[fav_c_i], row[word_list_i])

#df_tot_raw_k.collect()

In [20]:
# helper-function for combineByKey 
def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)        

def createCombiner(row):
    #print("len of createCombiner row is: ",len(row))
    rt_dic, fav_dic = create_dic_set(row,rt_i=2,fav_i=3,word_list_i=-1 )
    tid = row[1]
    # create a list of tid
    return ([tid], rt_dic, fav_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

# which merges V into C
def mergeValue(new_row, row):
    #retrieve last 
    rt_dic, fav_dic = create_dic_set(row,rt_i=2,fav_i=3,word_list_i=-1 )
    tid = row[1]
    
    rt_dic_merge = merge_dic(rt_dic, new_row[1])
    fav_dic_merge = merge_dic(fav_dic, new_row[2])
    
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, rt_dic_merge, fav_dic_merge)

# combine two C's (new row)
def mergeCombiners(r1, r2):
    list_merge = r1[0] + r2[0]
    rt_dic_merge = merge_dic(r1[1], r2[1])
    fav_dic_merge = merge_dic(r1[-1], r2[-1])
    
    return (list_merge, rt_dic_merge, fav_dic_merge)

In [71]:
# Trump Region

trump_nlp_rdd.take(1)

[Row(city_name='Washington', country_name='U.S', date=datetime.date(2010, 3, 30), entity_list=[], favorite_count=43524, followers_count='100', friends_count='2000', hour=14, media_attached=False, retweet_count=10053, sentence_count=2, tag_count=0, tid=u'955806333667807232', time_zone='blabla', timestamp='2010-03-30', tweet=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!', uid=1666, word_list=[u'thank', u'general', u'john', u'kelli', u'fantast', u'staff', u'white', u'hous', u'well', u'done', u'long', u'hour', u'fake', u'make', u'difficult', u'alway', u'great', u'win', u'us', u'thank', u'general john kelly', u'a fantastic job', u'the staff', u'others', u'the white house', u'a job', u'long hours', u'fake reporting', u'job', u'win'])]

In [21]:
# Small Data Portion Region
# Small Data Portion Region
# Small Data Portion Region
# df_tot    
max_rdd= base_nlp_rdd.map(map_total_cols)
max_rdd_k = max_rdd.keyBy(lambda x: x[1])
max_rdd_k.take(1)


NameError: name 'base_nlp_rdd' is not defined

In [146]:
# Small Data Portion Region
# Small Data Portion Region
# compute small portion data region counting all
r = max_rdd_k.combineByKey(createCombiner, mergeValue, mergeCombiners)
r.take(1)

[(362817693688135680, ([362817693688135680], {}, {}))]

In [84]:
# Trump for computing max data
# Trump for computing max data
# Trump for computing max data

trump_max_rdd = trump_nlp_rdd.map(map_total_cols)
trump_max_rdd_k =  trump_max_rdd.keyBy(lambda x: x[0])
r = trump_max_rdd_k.combineByKey(createCombiner, mergeValue, mergeCombiners)
r.take(1)

[(1666,
  ([u'955806333667807232',
    u'955795912374267907',
    u'955658992793149440',
    u'955056249925750784',
    u'954878124214415360',
    u'954843844402647040',
    u'954788467069870081',
    u'954680914998648833',
    u'954674157144477696',
    u'954541219970977793',
    u'954478044487520257',
    u'954456754137501697',
    u'954323750949982208',
    u'954097213608570880',
    u'954092417250222082',
    u'953979393180950528',
    u'953973568035086336',
    u'953951365532876800',
    u'953948941674078208',
    u'953796944564031489',
    u'953772162665590787',
    u'953771038114045954',
    u'953768657221451776',
    u'953270558573154305',
    u'953267506004754432',
    u'952887520790040576',
    u'952540700683497472',
    u'952538350333939713',
    u'952525384242876416',
    u'952301373479104512',
    u'952183452366929920',
    u'952166643202916352',
    u'951875499537641472',
    u'951813216291708928',
    u'951790999784783872',
    u'951788342647107584',
    u'95178558776528

In [85]:
tr = r.mapValues(get_top_6_alltime)
tr.take(1)

[(1666,
  ([(u'great', (1452433, 2)),
    (u'fake', (931799, 2)),
    (u'big', (828189, 2)),
    (u'want', (699767, 2)),
    (u'country', (647465, 2)),
    (u'america', (642070, 2)),
    (u'amp', (624333, 4)),
    (u'american', (622590, 2)),
    (u'would', (587585, 2)),
    (u'get', (569668, 2)),
    (u'good', (564516, 2)),
    (u'mani', (538345, 2)),
    (u'bad', (523929, 4)),
    (u'back', (501903, 2)),
    (u'republican', (493884, 2)),
    (u'much', (485964, 2)),
    (u'the democrats', (479794, 2)),
    (u'000', (472174, 2)),
    (u'need', (470434, 4)),
    (u'media', (469632, 2))],
   [(u'great', (6272400, 2)),
    (u'fake', (3738059, 2)),
    (u'big', (3536134, 2)),
    (u'want', (2907726, 2)),
    (u'country', (2696338, 2)),
    (u'america', (2658582, 2)),
    (u'american', (2630227, 2)),
    (u'would', (2581627, 2)),
    (u'amp', (2482002, 4)),
    (u'get', (2424631, 2)),
    (u'good', (2288073, 2)),
    (u'back', (2256284, 2)),
    (u'much', (2108355, 2)),
    (u'bad', (2106957

In [44]:
# function to check how many times a word/phrase has been used
def occur(x):
    v  = x[1]
    dic = v[1]
    rez = []
    for k, v in dic.iteritems():
        if v[1] > 5:
            rez.append((k,v))
    return rez
r.map(occur).take(1)

[[(u'dossier', (146956, 6)),
  (u'blame', (111104, 6)),
  (u'foreign', (65185, 6)),
  (u'cut', (977054, 6))]]

In [181]:
def flat_map(row):
    fav_list = row[1][0]
    rt_list = row[1][1]
    
    fav_list_filtered = [(x[0],x[1][0]) for x in fav_list]
    rt_list_filtered = [(x[0],x[1][0]) for x in rt_list]
        

    return (row[0], rt_list_filtered, fav_list_filtered)

tr_m = tr.map(flat_map)
tr_m.take(1)

[(1666,
  [(u'country', 2696338),
   (u'the democrats', 2044864),
   (u'fake news', 1670770),
   (u'the world', 1587716),
   (u'the u s', 1540800),
   (u'china', 1517409)],
  [(u'country', 647465),
   (u'the democrats', 479794),
   (u'the world', 442978),
   (u'fake news', 430484),
   (u'years', 371083),
   (u'people', 361543)])]

In [182]:
tr_m.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: long (nullable = true)
 |-- _3: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: long (nullable = true)



In [183]:
alltime_df = spark.createDataFrame(tr_m, at)

In [191]:
at = StructType([
        StructField("uid", IntegerType(), True),    
    
        StructField("rt_word_list", ArrayType(StructType([
        StructField("word_name", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)        
        ,
    
        StructField("fav_word_list", ArrayType(StructType([
        StructField("word_name", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)
    ])

In [192]:
alltime_df = spark.createDataFrame(tr_m, at)

In [193]:
alltime_df.printSchema()

root
 |-- uid: integer (nullable = true)
 |-- rt_word_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word_name: string (nullable = true)
 |    |    |-- count: integer (nullable = false)
 |-- fav_word_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word_name: string (nullable = true)
 |    |    |-- count: integer (nullable = false)



In [194]:
alltime_df.first()

Row(uid=1666, rt_word_list=[Row(word_name=u'country', count=2696338), Row(word_name=u'the democrats', count=2044864), Row(word_name=u'fake news', count=1670770), Row(word_name=u'the world', count=1587716), Row(word_name=u'the u s', count=1540800), Row(word_name=u'china', count=1517409)], fav_word_list=[Row(word_name=u'country', count=647465), Row(word_name=u'the democrats', count=479794), Row(word_name=u'the world', count=442978), Row(word_name=u'fake news', count=430484), Row(word_name=u'years', count=371083), Row(word_name=u'people', count=361543)])

In [189]:
def save_data_frame(df, table_name):
    df.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table=table_name,keyspace='twitter').save()    

In [195]:
save_data_frame(alltime_df, "trump_top")

In [None]:
# Change Type Cast

In [165]:
df_uncasted = spark.createDataFrame(base_nlp_rdd, schema)
df_uncasted.printSchema()

NameError: name 'schema' is not defined

In [221]:
df_uncasted.first()

Row(city_name=None, country_name=None, creation_date=u'2010-09-24', creation_hour=11, creation_timestamp=u'2010-09-24 11:04:40+00:00', followers_count=140, friends_count=204, hashtag_count=0, media_attached=False, phrase_token=[u'n/a'], sentence_count=1, tid=362814954803433472, time_zone=u'London', tweet=u'Psalm 119:133', uid=194531988, word_token_set=[u'psalm', u'119', u'133'])

In [224]:
df_final = df_uncasted.select('creation_date',
                              'creation_hour',
                              df_uncasted.creation_timestamp.astype(TimestampType()),
                              'city_name',
                              'country_name',
                              'followers_count',
                              'friends_count',
                              'hashtag_count',
                              'media_attached',
                              'phrase_token',
                              'sentence_count',
                              'tid',
                              'time_zone',
                              'tweet',
                              'uid',
                              'word_token_set'
                             )
df_final.printSchema()

root
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [230]:
part.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table='b0',keyspace='twitter').save() 
    

In [None]:
# Scrach Space: Previous Tests and Etc

In [50]:
from dateutil.parser import parse
import datetime

# parse twitter time string to (y-m-d,h-m-s)
def parse_time(creation_time):    
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        time = dt.strftime("%H:%M:%S")
        return (str(dt.date()), time)
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# rez = None
# creation_time = u'Fri Sep 24 11:04:40 +0000 2010'
# try:
#     rez = parse(creation_time)
# except ValueError:
#     rez = 'default'
# except OverflowError:
#     rez = 'default'
# rez


datetime.datetime(2010, 9, 24, 11, 4, 40, tzinfo=tzlocal())

In [65]:
print (rez.date())
print(rez.time())


2010-09-24
11:04:40
11:04:40


In [53]:
# do pos and entity extraction for words as well
def extract_entity(tokens):
    tokens_tag = pos_tag(tokens)
    # set up for entity extraction 
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    cp = RegexpParser(grammar)
    np_ary = []
    
    root_tree = cp.parse(tokens_tag)
    for i in list( root_tree.subtrees(filter=lambda x: x.label() == 'NP')):
        st = ""
        for t in i.leaves():
            st = st + t[0] + " "
        np_ary.append(st.strip().lower())
    # entity extracted    
    return np_ary    
    
# return 2 items: sentence_count,  <word_tuple>, <name_entity>
def process_tweet(description):
    # base case
    if description is None or description == "":
        return (0,[],[])
    # filter out http items
    description =\
        re.sub(r'https:[^ ]*', '', description, flags=re.IGNORECASE)
    
    stemmer = SnowballStemmer("english")
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(description)    
    entity_list = extract_entity(words)
    
#     word_list = []
#     stopWords = set(stopwords.words('english'))
#     for w in words:
#         if w not in stopWords:
#             word_list.append(stemmer.stem(w.lower()))

    return (len(sent_tokenize(description)),entity_list)
