In [51]:
#Imports
import sys
import datetime
import re
from datetime import timedelta

sys.path.append('../../')

from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline

# from sparknlp.annotator import *
# from sparknlp.common import RegexRule
# from sparknlp.base import DocumentAssembler, Finisher
from pyspark.sql.functions import explode

from dateutil.parser import parse
# for tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import pos_tag, RegexpParser

# for schema
from pyspark.sql.types import *

# for testing 
import random


In [52]:
# instantiate a spark context object
appname= "word-phrase-list"
master="local"

# Create Spark Session
spark = SparkSession.builder.appName(appname)\
                .config("spark.cassandra.connection.host", "localhost")\
                .config("spark.cassandra.connection.port", "9042")\
                .config("spark.eventLog.enabled", True)\
                .config("spark.eventLog.dir", "/home/ubuntu/spark_tmp/")\
                .getOrCreate()

In [53]:
trump_json = 's3a://twitter-data-dump/celebrities/trump.json'

trump_df = spark.read.json(trump_json)

resource_path ='/home/ubuntu/Desktop/spark-nlp/src/test/resources/'
#type(data) --> data frame

In [4]:
trump_df.printSchema()
trump_df.take(10)[5]

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)



Row(created_at=u'Tue Jan 23 04:30:33 +0000 2018', favorite_count=103887, id_str=u'955658992793149440', is_retweet=False, retweet_count=23224, source=u'Twitter for iPhone', text=u'Big win for Republicans as Democrats cave on Shutdown. Now I want a big win for everyone, including Republicans, Democrats and DACA, but especially for our Great Military and Border Security. Should be able to get there. See you at the negotiating table!')

In [54]:
# trump data frame to base_df
# note i split the data into 666 and 1666 id set
def trump_mapper(row):
    # set default trump as 666
    uid = 666    
    tid = row[2]
    creation_time = row[0]
    retweet_count = row[4]    
    favorite_count = row[1]
    tweet = row[-1]
    if len(tweet) > 200:
        uid = 1666
    
    time_zone = 'blabla'
    followers_count='100'
    friends_count= '2000'
    city_name='Washington'
    country_name = 'U.S'
    media_ary = []
    hashtag_ary = []
    return (tid, uid, tweet, creation_time,
            time_zone, followers_count,
            friends_count, city_name, country_name,
            media_ary, hashtag_ary, retweet_count,
            favorite_count )
        
#trump_rdd = trump_df.rdd.map(trump_mapper)

In [107]:
# Modfication for single word extraction as well
# Modfication for single word extraction as well
# Modfication for single word extraction as well

# extract word_list(exclusing noun+stopwords) and noun phrase list
def extract_entity(tokens):
    tokens_tag = pos_tag(tokens)
    # in case tokens_tags are emtpy, return empty list
    if (len(tokens_tag) == 0):
        return []
            
    # set up for entity extraction 
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    cp = RegexpParser(grammar)
    np_ary = []

    root_tree = cp.parse(tokens_tag)
    for i in list( root_tree.subtrees(filter=lambda x: x.label() == 'NP')):
        st = ""
        for t in i.leaves():
            st = st + t[0] + " "
        np_ary.append(st.strip())
    # entity extracted   
    
    # combine them together
    #    np_ary = np_ary + word_list
    return np_ary
    
# return 2 items: sentence_count,  <word_tuple>, <name_entity>
def process_tweet(description):
    # base case
    if description is None or description == "":
        return (0,[],[])
    # filter out http items
    description =\
        re.sub(r'https:[^\s]*', '', description, flags=re.IGNORECASE)
    tokenizer = RegexpTokenizer(r'\w+')
    
    # extract single word
    word_list =[]
    words = tokenizer.tokenize(description) 
    token_tag = pos_tag(words)    
    stemmer = SnowballStemmer("english")
    stopWords = set(stopwords.words('english'))  
    
    for word_pos in token_tag:
        stem_word = stemmer.stem(word_pos[0])
        if stem_word not in stopWords and word_pos[1] != 'NN':
            word_list.append(stem_word)            
                
    entity_list =extract_entity(words)    
    return (len(sent_tokenize(description)),word_list, entity_list)


In [71]:
# parse twitter time string to (date, timestamp_str, hour int)
# note later only timestamp_str is changed to asTYpe
def parse_time(creation_time):  
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        hour = dt.strftime("%H")
        # debug:
        base_date = datetime.date(2008,4,1)
        days_count = random.randint(1,3000)
        dt = base_date + timedelta(days=days_count)
        return (dt, str(dt),int(hour))       
        #return (dt.date(), str(dt),int(hour))
    
    dt = None
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# Mapping: 1. 13 cols to 15 cols (word_list, date, 
#          timestamp, hour)
#          2. (media_ary-> media_attached; tag_ary -> tag_count)
def map_nlp_row(row):
    # return if image attached
    def check_image(media_ary):
        if media_ary is None or\
           len(media_ary) == 0:
            return False
        return True
    
    # count # of hashtags
    def count_tag(tag_ary):
        if tag_ary is None:
            return 0
        return len(tag_ary)
    
    # create a list of tuples: [(str1, like_num),] or [(str1, ret_num)] 
    def create_tuple_list(vector, num):
        result = [num]
        for item in vector:
            result.append(item)
        if len(result) == 0:
            return []
        return result
        
#     try:       
    tid = row[0]
    uid = row[1]
    tweet = row[2]
    retweet_count = row[11]
    favorite_count = row[12]
    # Map token: sentence_count, word_list
    sentence_count, word_list, entity_list\
        = process_tweet(tweet)
    # Map Time
    creation_time = row[3]
    date, timestamp, hour\
        = parse_time(creation_time)
    time_zone = row[4]
    followers_count = row[5]
    friends_count = row[6]
    city_name = row[7]
    country_name = row[8]
    media_ary = row[9]
    # boolean
    media_attached = check_image(media_ary)        
    hashtag_ary = row[10]
    # count tags
    tag_count = count_tag(hashtag_ary)
                
#     except Exception as e:
#         # 1. log e and 2.return default        
#         return Row(tid=-1, uid=-1,
#             followers_count=-1,
#             friends_count=-1,
#             tweet='n/a',retweet_count=-1,
#             favorite_count=-1,sentence_count=-1,
#             word_list=[], entity_list=[]
#             date=datetime.date(2001, 1, 1),timestamp='00:00:00',hour=0,
#             time_zone='n/a',city_name='n/a',
#             country_name='n/a',
#             media_attached=False,tag_count=-1
#             )
    
    r = Row(tid=tid, uid=uid,
            followers_count=followers_count,
            friends_count=friends_count,
            tweet=tweet,retweet_count=retweet_count,
            favorite_count=favorite_count,sentence_count=sentence_count,
            word_list=word_list, entity_list=entity_list,
            date=date,timestamp=timestamp,hour=hour,
            time_zone=time_zone,city_name=city_name,
            country_name=country_name,
            media_attached=media_attached,tag_count=tag_count
            )
    return r 
    
#base_nlp_rdd = base_rdd.map(map_nlp_row)   

In [57]:
# base_mapped_rdd: all attributes after NLP
# base_mapped_rdd: all attributes after NLP
i_city_name=0
i_country_name=1
i_date=2
i_entity_list = 3
i_favorite_count=4
i_followers_count=5
i_friends_count=6,
i_hour=7
i_media_attached=8
i_retweet_count=9
i_sentence_count=10
i_tag_count=11
i_tid=12
i_time_zone=13
i_timestamp=14
i_tweet=15
i_uid=16
i_word_list=17


In [80]:
# get only interested attributes from nlp_rdd
# get only interested attributes from nlp_rdd
# get only interested attributes from nlp_rdd

#order
# uid_0, tid_1, date_2, rt_c_3, fav_c_4, word_list_5, entity_list_6
def map_time(row):
    uid_i = i_uid
    tid_i = i_tid
    rt_c_i = i_retweet_count
    fav_c_i = i_favorite_count
    word_list_i = -1
    entity_list_i = i_entity_list
    date = i_date
    # order
    return (row[uid_i],row[tid_i],row[date],
            row[rt_c_i],
            row[fav_c_i], row[word_list_i], row[entity_list_i])

In [142]:
import heapq
    
# get top_n for input_dic
# return a tuple of n entries
# default to 20, but can be customized
def get_top_n_dic(input_dic):
    n = 20
    top_list = heapq.nlargest(n, input_dic.items(), key=lambda x: x[1])     
    return top_list

# time_interval dic row => mapValues
def get_top_n_ti(row):
    rez_ary = []
    time_dic = row[1] 
    # key is date, v is dic_set
    for k, v in time_dic.iteritems():
        rt_top = get_top_n_dic(v[0])
        rt_entity_top = get_top_n_dic(v[1])
        fav_top = get_top_n_dic(v[2])
        fav_entity_top = get_top_n_dic(v[3])
        
        rez_ary.append((k, rt_top, rt_entity_top, fav_top, fav_entity_top))
        
    return rez_ary

# max for top_6 through all time
def get_top_6_alltime(row):
    print(len(row))
    rt_dic =  row[1]
    fav_dic = row[2]
    rt_top = get_top_6_dic(rt_dic)
    fav_top = get_top_6_dic(fav_dic)
    
    return ( rt_top, fav_top)
    
    


In [167]:
# 30-day time interval split, c_date vs date_base (earliest date)
def get_relative_date(date_c, date_b):
    delta = date_c - date_b
    round_days = delta.days - delta.days% 30
    relative_date = date_b + timedelta(days=round_days)
    # return string representation for storage
    return relative_date

# delta = datetime.date(2018,1,29) - datetime.date(2010,9,24)
# round_days = delta.days - delta.days % 30
# relative_date = datetime.date(2010,9,24) + timedelta(days=round_days)
# str(relative_date)
# '2018-01-15'

In [None]:
# helper-function for creating dictionary of {word_1: (count_int, occur_int) }
# helper-function for combineByKey time! 

def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)

In [133]:
# Returning 
def create_dic_set(row, rt_i=3, fav_i=4, word_list_i=5, entity_list_i = 6):  
    # helper function that updates, _list is either word_list or entity_list 
    def perform_add_op(rt_c, fav_c, rt_dic, fav_dic, _list):
        if rt_c > 0 and fav_c > 0:
            for item in _list:
                add_to_dic(rt_dic, item, rt_c)
                add_to_dic(fav_dic, item, fav_c)
        elif rt_c > 0:
            for item in _list:
                add_to_dic(rt_dic, item, rt_c)
        elif fav_c > 0 :
            for item in _list:
                add_to_dic(fav_dic, item, fav_c)
        # End
    
    rt_dic = {}
    fav_dic = {}    
    rt_entity_dic = {}
    fav_entity_dic = {}
    
    rt_c = row[rt_i]
    fav_c = row[fav_i]
    
    word_list = row[word_list_i]
    entity_list = row[entity_list_i]
    
    # Perform word_list ADD Operation
    perform_add_op(rt_c, fav_c, rt_dic, fav_dic, word_list)
    perform_add_op(rt_c, fav_c, \
                   rt_entity_dic, fav_entity_dic, entity_list)

    # return dic_set
    return (rt_dic, rt_entity_dic, fav_dic, fav_entity_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

In [168]:
# Will change to static class methods later!!
# GetTopByInterval()

###### uid_0, tid_1, date_2, rt_c_3, 
###### fav_c_4, word_list_5, entity_list_6, min_date = -1

# new value is of format dic{ k_date0: (rt_dic, fav_dic),
#                             k_date1: (rt_dic, fav_dic)}

# create_dic_set REQUIRES
#  (rt_i, fav_i, word_list_i, entity_list_i)

# process info for cCombiner_t
def parse_row_t(row):
    date = row[2]
    m_date = row[-1]
    # get k_date for dic
    k_date = get_relative_date(date, m_date)   
    tid = row[1]
    rt_dic, rt_entity_dic,\
    fav_dic, fav_entity_dic = \
        create_dic_set(row,rt_i=3,fav_i=4,\
                       word_list_i=5, entity_list_i=6)
    

    return (tid, k_date, rt_dic, rt_entity_dic,\
            fav_dic, fav_entity_dic)

# create the base combiner for this
def cCombiner_t(row):
    # handle row
    tid, k_date, rt_dic, rt_entity_dic,\
        fav_dic, fav_entity_dic = parse_row_t(row)
    
    # create dic by time ==> word_list and entity_list
    # note 4 entries as a value (in python: pointer/memory address)
    dic_set_by_time = {k_date:(rt_dic, rt_entity_dic,  \
                               fav_dic, fav_entity_dic)}             
    
    # create a list of tid
    return ([tid], dic_set_by_time)

# merges 2 dic together --> because of double pointer crap
def merge_dic_large(large_dic, small_dic):
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})

# which merges V into C
def mValue_t(new_row, row):
    # update_dic operation between two dictionaries
    # large_dic_set --> (k_date, (rt_dic, rt_entity_dic, fav_dic, fav_entity_dic))
    def update_dic_op(k_date, dic_set_by_time, \
                      rt_dic, rt_entity_dic, fav_dic, fav_entity_dic):
        if k_date in dic_set_by_time:
            # large_dic ==> 4 dictionaries
            large_dic = dic_set_by_time.get(k_date)
            # merge values
            merge_dic_large(large_dic[0],rt_dic)
            merge_dic_large(large_dic[1],rt_entity_dic)
            merge_dic_large(large_dic[2],fav_dic)
            merge_dic_large(large_dic[3],fav_entity_dic)
        else:
            value_set = (rt_dic, rt_entity_dic,\
                         fav_dic, fav_entity_dic
                        )
            dic_set_by_time.update({k_date:value_set})        
            
    # handle row
    tid, k_date, rt_dic, rt_entity_dic,\
        fav_dic, fav_entity_dic = parse_row_t(row)
        
    # get dic_set_by_time    
    dic_set_by_time = new_row[1]  
    
    update_dic_op(k_date, dic_set_by_time,\
                  rt_dic, rt_entity_dic,\
                  fav_dic, fav_entity_dic)
    # add tid to the tid list
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, dic_set_by_time)

# combine two C's (new row)
def mCombiners_t(r1, r2):
    list_merge = r1[0] + r2[0]
    
    # format: {date_k: (rt_dic, rt_entity_dic, fav_dic, fav_entity_dic)}
    dic_set_by_time1 = r1[1]
    dic_set_by_time2 = r2[1]
    
    
    # merge dic_set_by_time based on # of times each partition has
    if len(dic_set_by_time1) < len(dic_set_by_time2):
        small_t_dic = dic_set_by_time1
        large_t_dic = dic_set_by_time2
    else:
        small_t_dic = dic_set_by_time2
        large_t_dic = dic_set_by_time1
        
    for k, v in small_t_dic:
        if k in large_t_dic:
            # get corresponding dictionaries
            value_set = large_t_dic.get(k)
            for idx, dic_item in enumerate(value_set):
                # also merge the small_t_dic dictionary to dic_item 
                # in the large item
                merge_dic_large(dic_item, v[indx])                
        else:
            large_t_dic.update({k:v})
            
                                            
    return (list_merge, large_t_dic)

In [169]:
# from NLP rdd
# Do total count computing here: map selected ones only

# order:
# uid, tid, rt_c, fav_c, word_list, entity_list
def map_total_count(row):
    uid_i = i_uid
    tid_i = i_tid   
    rt_c_i = i_retweet_count
    fav_c_i = i_favorite_count
    word_list_i = i_word_list
    entity_list_i = i_entity_list
    
    return (row[uid_i],row[tid_i],row[rt_c_i],
            row[fav_c_i], row[word_list_i], row[entity_list_i])


In [170]:
# This is for total_count ==> later change to static method for class GetTop
# This is for total_count
# This is for total_count

# createCombiner (row)
def createCombiner(row):
    #print("len of createCombiner row is: ",len(row))
    rt_dic, fav_dic = create_dic_set(row,rt_i=2,fav_i=3,word_list_i=-1 )
    tid = row[1]
    # create a list of tid
    return ([tid], rt_dic, fav_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

# which merges V into C
def mergeValue(new_row, row):
    #retrieve last 
    rt_dic, fav_dic = create_dic_set(row,rt_i=2,fav_i=3,word_list_i=-1 )
    tid = row[1]
    
    rt_dic_merge = merge_dic(rt_dic, new_row[1])
    fav_dic_merge = merge_dic(fav_dic, new_row[2])
    
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, rt_dic_merge, fav_dic_merge)

# combine two C's (new row)
def mergeCombiners(r1, r2):
    list_merge = r1[0] + r2[0]
    rt_dic_merge = merge_dic(r1[1], r2[1])
    fav_dic_merge = merge_dic(r1[-1], r2[-1])
    
    return (list_merge, rt_dic_merge, fav_dic_merge)

In [176]:
# general twitter data dump
def twitter_nlp_interface(df):
    pass

# takes df_trump data frame, do mapped conversion, and nlp conversion
# trump_nlp_rdd is where Time Interval or Overall Functions performs on
def trump_nlp_interface(df):
    trump_map_rdd = df.rdd.map(trump_mapper)
    trump_nlp_rdd = trump_map_rdd.map(map_nlp_row)
    return trump_nlp_rdd

# given nlp_rdd, map to total_count_rdd
def total_count_interface(nlp_rdd):
    total_count_rdd = nlp_rdd.map(map_total_count)    
    return total_count_rdd

# given nlp_rdd, map to time_rdd
def time_interface(nlp_rdd):
    time_rdd = nlp_rdd.map(map_time)
    return time_rdd

# given time_rdd, map each entry with minimum date
# (uid, (uid, tid, date, rt_count, fav_count, 
#  word_list, entity_list, min_date) )

def min_time_interface(time_rdd):
    # change df to have key, value with min_date at the end
    mtime_1 = time_rdd.keyBy(lambda x: x[0]).reduceByKey(lambda x, y: x if x[2] < y[2] else y)
    # (key, min_date)
    mtime_2 = mtime_1.mapValues(lambda x: x[2])
    # group min_date together 
    min_time_rdd = time_rdd.keyBy(lambda x: x[0]).join(mtime_2).mapValues(lambda x: x[0]+(x[1],) )
    return min_time_rdd



In [177]:
# Input: min_time_rdd
# Output: combine all together

# map out the dictionary value set and rank based on count
# save the result as well
# 

def compute_time_rdd(min_time_rdd):
    # {key = uid : (tid_list, rt_dic, rt_entity_dic, fav_dic, fav_entity_dic ) }
    c_min_time_rdd = min_time_rdd.combineByKey(cCombiner_t,mValue_t,mCombiners_t)
    return c_min_time_rdd

# Input: computed_min_time_rdd ==> list of items
# Output: (uid, (---( date, rt_dic, rt_entity_dic, av_dic, fav_entity_dic), 
#                 ()---)   ) 
# Note: ALL dics are of format: ( {word_str: (count, occur)} )
def rank_time_rdd(c_min_time_rdd):
    top_rdd = c_min_time_rdd.mapValues(get_top_n_ti)
    return top_rdd



# Input top_rdd (key_uid, value) pair ==> separate out the occurence term
#                 value is actually:  ((time, dic,....,), (time2, dic,....)
# Output: (uid, rc_wc, rc_entity_wc, fav_wc, fav_entity_wc
#               rc_wo, rc_entity_wo, fav_wo, fav_entity_wo) -->not so interesting
def transform_rank_time_rdd(top_rdd):
    # main point is to decouple (count, occur)
    def flat_map(row):
        def decouple(_list):
            w, v_pair = zip(*_list)
            count, occurence = zip(*v_pair)
            wc = list(zip(w, count))
            wo = list(zip(w, occurence))
            # return w_count, w_occurence          
            return (wc, wo)   
        # code for flat_map              
        uid = row[0]    
        creation_time = row[1][0]
        rt_list = row[1][1]
        rt_entity_list = row[1][2]
        fav_list = row[1][3]
        fav_entity_list = row[1][4]
    
        rc_wc, rc_wo = decouple(rt_list)
        rc_entity_wc, rc_entity_wo = decouple(rt_entity_list)   
        fav_wc, fav_wo = decouple(fav_list)
        fav_entity_wc, fav_entity_wo = decouple(fav_entity_list)
        
        return (uid, creation_time, \
                rc_wc, rc_entity_wc, fav_wc, fav_entity_wc,\
                rc_wo, rc_entity_wo, fav_wo, fav_entity_wo)
    # flat the value set!!!
    flat_top_rdd = top_rdd.flatMapValues(lambda x: x)
                      
    # start by mapping with helper fcn                      
    final_rdd = flat_top_rdd.map(flat_map)                      
    return final_rdd


In [191]:
# Build Cassandra Database Schema
# take transform_rank_time_rdd output and converts to dataframe
# can be used to save to Cassandra
def to_time_top_df(rdd):
    schema = StructType([
        StructField("uid", IntegerType(), True),  
        StructField("creation_date", DateType(), True),

        StructField("rt_word_list", ArrayType(StructType([
        StructField("word", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)     
        
        ,
        StructField("rt_entity_list", ArrayType(StructType([
        StructField("entity", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)  
        
        ,    
        StructField("fav_word_list", ArrayType(StructType([
        StructField("word", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)     
        
        ,
        StructField("fav_entity_list", ArrayType(StructType([
        StructField("entity", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)  
        
        ,
        StructField("rt_word_occur_list", ArrayType(StructType([
        StructField("word", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)     
        
        ,
        StructField("rt_entity_occur_list", ArrayType(StructType([
        StructField("entity", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)  
        
        ,    
        StructField("fav_word_occur_list", ArrayType(StructType([
        StructField("word", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)     
        
        ,
        StructField("fav_entity_occur_list", ArrayType(StructType([
        StructField("entity", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)                                 
    ])
    
    df = spark.createDataFrame(rdd, schema)
    return df


# Schema for all time top/max likes
def get_top_schema():
    pass


In [192]:
def save_data_frame(df, table_name):
    df.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table=table_name,keyspace='twitter').save()    

In [None]:
# Trump Processing
# Trump Processing
# Trump Processing


nlp_rdd = trump_nlp_interface(trump_df)
trump_time_rdd = time_interface(nlp_rdd)
# Test for min_time 
min_rdd = min_time_interface(trump_time_rdd)
rez_dic_set_rdd = compute_time_rdd(min_rdd)

top_rdd = rank_time_rdd(rez_dic_set_rdd)
rez = transform_rank_time_rdd(top_rdd)
df = to_time_top_df(rez)
save_data_frame(df, "user_top_list")

In [None]:
# call this function to process data into Cassandra database
def process_trump_data():
    trump_json = 's3a://twitter-data-dump/celebrities/trump.json'
    trump_df = spark.read.json(trump_json)
    
    nlp_rdd = trump_nlp_interface(trump_df)
    trump_time_rdd = time_interface(nlp_rdd)
    # Test for min_time 
    min_rdd = min_time_interface(trump_time_rdd)
    rez_dic_set_rdd = compute_time_rdd(min_rdd)

    top_rdd = rank_time_rdd(rez_dic_set_rdd)
    rez = transform_rank_time_rdd(top_rdd)
    df = to_time_top_df(rez)
    save_data_frame(df, "user_top_list")
    

In [91]:
trump_total_count_rdd = total_count_interface(trump_nlp_rdd)

In [92]:
trump_tc_rdd_k = trump_total_count_rdd.keyBy(lambda x: x[0])
r = trump_tc_rdd_k.combineByKey(createCombiner, mergeValue, mergeCombiners)
r.take(1)

[(1666,
  ([u'955806333667807232',
    u'955795912374267907',
    u'955658992793149440',
    u'955056249925750784',
    u'954878124214415360',
    u'954843844402647040',
    u'954788467069870081',
    u'954680914998648833',
    u'954674157144477696',
    u'954541219970977793',
    u'954478044487520257',
    u'954456754137501697',
    u'954323750949982208',
    u'954097213608570880',
    u'954092417250222082',
    u'953979393180950528',
    u'953973568035086336',
    u'953951365532876800',
    u'953948941674078208',
    u'953796944564031489',
    u'953772162665590787',
    u'953771038114045954',
    u'953768657221451776',
    u'953270558573154305',
    u'953267506004754432',
    u'952887520790040576',
    u'952540700683497472',
    u'952538350333939713',
    u'952525384242876416',
    u'952301373479104512',
    u'952183452366929920',
    u'952166643202916352',
    u'951875499537641472',
    u'951813216291708928',
    u'951790999784783872',
    u'951788342647107584',
    u'95178558776528

In [108]:
trump_tc_rdd_k.take(1)

[(1666,
  (1666,
   u'955806333667807232',
   10053,
   43524,
   [u'thank',
    u'general',
    u'john',
    u'kelli',
    u'fantast',
    u'staff',
    u'white',
    u'hous',
    u'well',
    u'done',
    u'long',
    u'hour',
    u'fake',
    u'make',
    u'difficult',
    u'alway',
    u'great',
    u'win',
    u'us'],
   [u'Thank',
    u'General John Kelly',
    u'a fantastic job',
    u'the Staff',
    u'others',
    u'the White House',
    u'a job',
    u'Long hours',
    u'Fake reporting',
    u'job',
    u'WIN']))]

In [93]:
tr = r.mapValues(get_top_6_alltime)
tr.take(1)

[(1666,
  ([(u'years', (371083, 2)),
    (u'China', (356231, 2)),
    (u'Country', (354444, 2)),
    (u'the U S', (331626, 2)),
    (u'people', (323951, 2)),
    (u'the Democrats', (302174, 2)),
    (u'country', (293021, 2)),
    (u'the Senate', (267012, 2)),
    (u'the United States', (265778, 2)),
    (u'the world', (253312, 2)),
    (u'Crooked Hillary Clinton', (248074, 2)),
    (u'a deal', (242424, 2)),
    (u'the Republicans', (227008, 2)),
    (u'life', (216215, 2)),
    (u'America', (205955, 2)),
    (u'Today', (201177, 2)),
    (u'MAKE AMERICA GREAT AGAIN', (199610, 2)),
    (u'the FBI', (198800, 2)),
    (u'great honor', (192392, 2)),
    (u'Fake News', (190302, 2))],
   [(u'China', (1517409, 2)),
    (u'Country', (1459121, 2)),
    (u'years', (1447089, 2)),
    (u'the U S', (1420098, 2)),
    (u'the Democrats', (1314890, 2)),
    (u'people', (1283102, 2)),
    (u'country', (1237217, 2)),
    (u'the Senate', (1149598, 2)),
    (u'the world', (1080080, 2)),
    (u'the Republica

In [95]:
# computes the total count given total_count_rdd, tc_rdd
# tc_rdd is only the mapped version without computation
def compute_total_count(tc_rdd):
    tc_rdd_k = tc_rdd.keyBy(lambda x: x[0])
    computed_rdd = \
        tc_rdd_k.combineByKey(createCombiner, mergeValue, \
                              mergeCombiners).mapValues(get_top_6_alltime)
    return computed_rdd
    

In [96]:
test = compute_total_count(trump_total_count_rdd)

In [97]:
test.map(get_top_6_alltime).take(1)

[(1666,
  ([(u'years', (371083, 2)),
    (u'China', (356231, 2)),
    (u'Country', (354444, 2)),
    (u'the U S', (331626, 2)),
    (u'people', (323951, 2)),
    (u'the Democrats', (302174, 2)),
    (u'country', (293021, 2)),
    (u'the Senate', (267012, 2)),
    (u'the United States', (265778, 2)),
    (u'the world', (253312, 2)),
    (u'Crooked Hillary Clinton', (248074, 2)),
    (u'a deal', (242424, 2)),
    (u'the Republicans', (227008, 2)),
    (u'life', (216215, 2)),
    (u'America', (205955, 2)),
    (u'Today', (201177, 2)),
    (u'MAKE AMERICA GREAT AGAIN', (199610, 2)),
    (u'the FBI', (198800, 2)),
    (u'great honor', (192392, 2)),
    (u'Fake News', (190302, 2))],
   [(u'China', (1517409, 2)),
    (u'Country', (1459121, 2)),
    (u'years', (1447089, 2)),
    (u'the U S', (1420098, 2)),
    (u'the Democrats', (1314890, 2)),
    (u'people', (1283102, 2)),
    (u'country', (1237217, 2)),
    (u'the Senate', (1149598, 2)),
    (u'the world', (1080080, 2)),
    (u'the Republica

[(1666,
  (1666,
   u'955806333667807232',
   datetime.date(2013, 9, 9),
   10053,
   43524,
   [u'thank',
    u'general',
    u'john',
    u'kelli',
    u'fantast',
    u'staff',
    u'white',
    u'hous',
    u'well',
    u'done',
    u'long',
    u'hour',
    u'fake',
    u'make',
    u'difficult',
    u'alway',
    u'great',
    u'win',
    u'us'],
   datetime.date(2008, 5, 9)))]

In [44]:
# function to check how many times a word/phrase has been used
def occur(x):
    v  = x[1]
    dic = v[1]
    rez = []
    for k, v in dic.iteritems():
        if v[1] > 5:
            rez.append((k,v))
    return rez
r.map(occur).take(1)

[[(u'dossier', (146956, 6)),
  (u'blame', (111104, 6)),
  (u'foreign', (65185, 6)),
  (u'cut', (977054, 6))]]

[(1666,
  [(u'country', 2696338),
   (u'the democrats', 2044864),
   (u'fake news', 1670770),
   (u'the world', 1587716),
   (u'the u s', 1540800),
   (u'china', 1517409)],
  [(u'country', 647465),
   (u'the democrats', 479794),
   (u'the world', 442978),
   (u'fake news', 430484),
   (u'years', 371083),
   (u'people', 361543)])]

In [182]:
tr_m.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: long (nullable = true)
 |-- _3: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: long (nullable = true)



In [183]:
alltime_df = spark.createDataFrame(tr_m, at)

In [191]:
at = StructType([
        StructField("uid", IntegerType(), True),    
    
        StructField("rt_word_list", ArrayType(StructType([
        StructField("word_name", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)        
        ,
    
        StructField("fav_word_list", ArrayType(StructType([
        StructField("word_name", StringType(), True),
        StructField("count", IntegerType(), False)])
        , True), True)
    ])

In [192]:
alltime_df = spark.createDataFrame(tr_m, at)

In [193]:
alltime_df.printSchema()

root
 |-- uid: integer (nullable = true)
 |-- rt_word_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word_name: string (nullable = true)
 |    |    |-- count: integer (nullable = false)
 |-- fav_word_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- word_name: string (nullable = true)
 |    |    |-- count: integer (nullable = false)



In [194]:
alltime_df.first()

Row(uid=1666, rt_word_list=[Row(word_name=u'country', count=2696338), Row(word_name=u'the democrats', count=2044864), Row(word_name=u'fake news', count=1670770), Row(word_name=u'the world', count=1587716), Row(word_name=u'the u s', count=1540800), Row(word_name=u'china', count=1517409)], fav_word_list=[Row(word_name=u'country', count=647465), Row(word_name=u'the democrats', count=479794), Row(word_name=u'the world', count=442978), Row(word_name=u'fake news', count=430484), Row(word_name=u'years', count=371083), Row(word_name=u'people', count=361543)])

In [195]:
save_data_frame(alltime_df, "trump_top")

In [None]:
# Change Type Cast

In [165]:
df_uncasted = spark.createDataFrame(base_nlp_rdd, schema)
df_uncasted.printSchema()

NameError: name 'schema' is not defined

In [221]:
df_uncasted.first()

Row(city_name=None, country_name=None, creation_date=u'2010-09-24', creation_hour=11, creation_timestamp=u'2010-09-24 11:04:40+00:00', followers_count=140, friends_count=204, hashtag_count=0, media_attached=False, phrase_token=[u'n/a'], sentence_count=1, tid=362814954803433472, time_zone=u'London', tweet=u'Psalm 119:133', uid=194531988, word_token_set=[u'psalm', u'119', u'133'])

In [224]:
df_final = df_uncasted.select('creation_date',
                              'creation_hour',
                              df_uncasted.creation_timestamp.astype(TimestampType()),
                              'city_name',
                              'country_name',
                              'followers_count',
                              'friends_count',
                              'hashtag_count',
                              'media_attached',
                              'phrase_token',
                              'sentence_count',
                              'tid',
                              'time_zone',
                              'tweet',
                              'uid',
                              'word_token_set'
                             )
df_final.printSchema()

root
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [230]:
part.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table='b0',keyspace='twitter').save() 
    