In [40]:
#Imports
import sys
import datetime
from datetime import timedelta

sys.path.append('../../')

from pyspark.sql import SparkSession, Row
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher
from pyspark.sql.functions import explode

from dateutil.parser import parse
# for tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# for schema
from pyspark.sql.types import *


In [3]:
# instantiate a spark context object
appname= "large_read_tar"
master="local"

# Create Spark Session
spark = SparkSession.builder.appName(appname)\
                .config("spark.cassandra.connection.host", "localhost")\
                .config("spark.cassandra.connection.port", "9042")\
                .config("spark.eventLog.enabled", True)\
                .config("spark.eventLog.dir", "/home/ubuntu/spark_tmp/")\
                .getOrCreate()

In [4]:
zip_path = 's3a://twitter-data-dump/test.tar'
trump_json = 's3a://twitter-data-dump/celebrities/trump.json'
#large_tar = 's3a://twitter-data-dump/zip_dump/archiveteam-twitter-stream-2013-09.tar'
small_portion = 's3a://twitter-data-dump/smallportion/'


df = spark.read.json(small_portion)
df_trump = spark.read.json(trump_json)

resource_path ='/home/ubuntu/Desktop/spark-nlp/src/test/resources/'
#type(data) --> data frame

In [8]:
df_trump.printSchema()
df_trump.take(1)

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id_str: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)



[Row(created_at=u'Tue Jan 23 14:16:02 +0000 2018', favorite_count=43524, id_str=u'955806333667807232', is_retweet=False, retweet_count=10053, source=u'Twitter for iPhone', text=u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!')]

In [71]:
# trump data frame to base_df
# note i split the data into 666 and 1666 id set
def trump_mapper(row):
    # set default trump as 666
    uid = 666    
    tid = row[2]
    creation_time = row[0]
    retweet_count = row[4]    
    favorite_count = row[1]
    tweet = row[-1]
    if len(tweet) > 200:
        uid = 1666
    
    time_zone = 'blabla'
    followers_count='100'
    friends_count= '2000'
    city_name='Washington'
    country_name = 'U.S'
    media_ary = []
    hashtag_ary = []
    return (tid, uid, tweet, creation_time,
            time_zone, followers_count,
            friends_count, city_name, country_name,
            media_ary, hashtag_ary, retweet_count,
            favorite_count )
        
trump_rdd = df_trump.rdd.map(trump_mapper)


In [73]:
trump_rdd.take(10)

[(u'955806333667807232',
  1666,
  u'Thank you to General John Kelly, who is doing a fantastic job, and all of the Staff and others in the White House, for a job well done. Long hours and Fake reporting makes your job more difficult, but it is always great to WIN, and few have won more than us!',
  u'Tue Jan 23 14:16:02 +0000 2018',
  'blabla',
  '100',
  '2000',
  'Washington',
  'U.S',
  [],
  [],
  10053,
  43524),
 (u'955795912374267907',
  1666,
  u'Nobody knows for sure that the Republicans &amp; Democrats will be able to reach a deal on DACA by February 8, but everyone will be trying....with a big additional focus put on Military Strength and Border Security. The Dems have just learned that a Shutdown is not the answer!',
  u'Tue Jan 23 13:34:37 +0000 2018',
  'blabla',
  '100',
  '2000',
  'Washington',
  'U.S',
  [],
  [],
  13019,
  56437),
 (u'955771016319590400',
  666,
  u'In one of the biggest stories in a long time, the FBI now says it is missing five months worth of lov

In [74]:
df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- delete: struct (nullable = true)
 |    |-- status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |    |    |-- user_id_str: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)


In [75]:
# Select interested col attributes
main_df = df.selectExpr('id AS tid',\
                        'user.id AS uid',\
                        'text AS tweet',\
                        'user.created_at AS creation_time',\
                        'user.time_zone AS time_zone',\
                        'user.followers_count AS followers_count',\
                        'user.friends_count AS friends_count',\
                        'place.name AS city_name',\
                        'place.country AS country_name',\
                        'entities.media.media_url AS media_ary',\
                        'entities.hashtags.text AS hashtag_ary',\
                        'retweet_count',\
                        'favorite_count'                        
                       ).where('tid is NOT NULL AND uid is NOT NULL')

main_df.printSchema()

root
 |-- tid: long (nullable = true)
 |-- uid: long (nullable = true)
 |-- tweet: string (nullable = true)
 |-- creation_time: string (nullable = true)
 |-- time_zone: string (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- media_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashtag_ary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- retweet_count: long (nullable = true)
 |-- favorite_count: long (nullable = true)



In [76]:
# Main DF column name set
col_exp_set = ['tid','uid','tweet','creation_time',
               'time_zone','followers_count',
               'friends_count','city_name','country_name',
               'media_ary','hashtag_ary','retweet_count',
               'favorite_count',
              ]


In [27]:
###### Debuggging 3 records only

#base_rdd = main_df.limit(3).rdd
#base_rdd.count()
base_rdd = main_df.rdd
base_rdd.take(1)

[Row(tid=362814954803433472, uid=194531988, tweet=u'Psalm 119:133', creation_time=u'Fri Sep 24 11:04:40 +0000 2010', time_zone=u'London', followers_count=140, friends_count=204, city_name=None, country_name=None, media_ary=None, hashtag_ary=[], retweet_count=0, favorite_count=0)]

In [28]:
# return 2 items: sentence_count,  <word_tuple>
def process_tweet(description):
    # base case
    if description is None or description == "":
        return (0,[])

    stemmer = SnowballStemmer("english")
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = []
 
    words = tokenizer.tokenize(description)
    stopWords = set(stopwords.words('english'))
    for w in words:
        if w not in stopWords:
            word_list.append(stemmer.stem(w.lower()))

    return (len(sent_tokenize(description)),word_list)


In [77]:
# parse twitter time string to (date, timestamp_str, hour int)
# note later only timestamp_str is changed to asTYpe
def parse_time(creation_time):  
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        hour = dt.strftime("%H")
        return (dt.date(), str(dt),int(hour))
    
    dt = None
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# Mapping: 1. 13 cols to 15 cols (word_list, date, 
#          timestamp, hour)
#          2. (media_ary-> media_attached; tag_ary -> tag_count)
def map_row(row):
    # return if image attached
    def check_image(media_ary):
        if media_ary is None or\
           len(media_ary) == 0:
            return False
        return True
    
    # count # of hashtags
    def count_tag(tag_ary):
        if tag_ary is None:
            return 0
        return len(tag_ary)
    
    # create a list of tuples: [(str1, like_num),] or [(str1, ret_num)] 
    def create_tuple_list(vector, num):
        result = [num]
        for item in vector:
            result.append(item)
        if len(result) == 0:
            return []
        return result
        
    try:       
        tid = row[0]
        uid = row[1]
        tweet = row[2]
        retweet_count = row[11]
        favorite_count = row[12]
        # Map token: sentence_count, word_list
        sentence_count, word_list\
            = process_tweet(tweet)
        # Map Time
        creation_time = row[3]
        date, timestamp, hour\
            = parse_time(creation_time)
        time_zone = row[4]
        followers_count = row[5]
        friends_count = row[6]
        city_name = row[7]
        country_name = row[8]
        media_ary = row[9]
        # boolean
        media_attached = check_image(media_ary)        
        hashtag_ary = row[10]
        # count tags
        tag_count = count_tag(hashtag_ary)
                
    except Exception as e:
        # 1. log e and 2.return default        
        return Row(tid=-1, uid=-1,
            followers_count=-1,
            friends_count=-1,
            tweet='n/a',retweet_count=-1,
            favorite_count=-1,sentence_count=-1,
            word_list=[],
            phrase_list=['n/a'],
            date=datetime.date(2001, 1, 1),timestamp='00:00:00',hour=0,
            time_zone='n/a',city_name='n/a',
            country_name='n/a',
            media_attached=False,tag_count=tag_count
            )
    
    r = Row(tid=tid, uid=uid,
            followers_count=followers_count,
            friends_count=friends_count,
            tweet=tweet,retweet_count=retweet_count,
            favorite_count=favorite_count,sentence_count=sentence_count,
            word_list=word_list,
            phrase_list=['n/a'],
            date=date,timestamp=timestamp,hour=hour,
            time_zone=time_zone,city_name=city_name,
            country_name=country_name,
            media_attached=media_attached,tag_count=tag_count
            )
    return r 
    
base_map_rdd = base_rdd.map(map_row)    

In [78]:
# Main DF column name set
col_exp_set = ['tid','uid','tweet','followers_count',
               'friends_count','retweet_count',
               'favorite_count','sentence_count',
               'word_list',
               'phrase_rt',
               'date','timestamp','hour',
               'time_zone','city_name','country_name',
               'media_attached','tag_count'
              ]
# Sanity Check
r = base_map_rdd.take(1)

len(r) 
# len(col_exp_set) --> 18

# index order
r
# city_name=0, country_name=1, date=2, 
# favorite_count=3, followers_count=4, friends_count=5,
# hour=6, media_attached=7, phrase_list=8, 
# retweet_count=9, sentence_count=10, tag_count=11, tid=12, time_zone=13, 
# timestamp=14, tweet=15, uid=16, word_list=17

[Row(city_name=None, country_name=None, date=datetime.date(2010, 9, 24), favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133'])]

In [79]:
# get only interested attributes from base_map_rdd
def get_time_map(row):
    uid_i = 16
    tid_i = 12   
    rt_c_i = 9
    fav_c_i = 3
    word_list_i = -1
    date = 2
    # order
    return (row[uid_i],row[tid_i],row[date],
            row[rt_c_i],
            row[fav_c_i], row[word_list_i])

In [80]:
time_rdd = base_map_rdd.map(get_time_map)
time_rdd.take(1)

[(194531988,
  362814954803433472,
  datetime.date(2010, 9, 24),
  0,
  0,
  [u'psalm', u'119', u'133'])]

In [81]:
# 30-day time interval split, c_date vs date_base (earliest date)
def get_relative_date(date_c, date_b):
    delta = date_c - date_b
    round_days = delta.days - delta.days% 30
    relative_date = date_b + timedelta(days=round_days)
    return relative_date

# delta = datetime.date(2018,1,29) - datetime.date(2010,9,24)
# round_days = delta.days - delta.days % 30
# relative_date = datetime.date(2010,9,24) + timedelta(days=round_days)
# str(relative_date)
# '2018-01-15'

In [82]:
# helper-function for combineByKey time! 
def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)
        
def create_dic_set(row):
    rt_dic = {}
    fav_dic = {}
    rt_c = row[2]
    fav_c = row[3]

    word_list = row[-1]
    if rt_c > 0 and fav_c > 0:
        for w in word_list:
            add_to_dic(rt_dic, w, rt_c)
            add_to_dic(fav_dic, w, fav_c)
    elif rt_c > 0:
        for w in word_list:
            add_to_dic(rt_dic, w, rt_c)
    elif fav_c > 0 :
        for w in word_list:
            add_to_dic(fav_dic, w, fav_c)
    # return dic_set
    return (rt_dic, fav_dic)

In [83]:
# uid_i = 0, tid_i = 1, date_i = 2, 
# rt_c_i = 3, # fav_c_i = 4, word_list_i = -1

def cCombiner_t(row):
    #print("len of createCombiner row is: ",len(row))
    rt_dic, fav_dic = create_dic_set(row)
    tid_i = row[1]
    # create a list of tid
    return ([tid], rt_dic, fav_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

# which merges V into C
def mValue_t(new_row, row):
    #retrieve last 
    rt_dic, fav_dic = create_dic_set(row)
    tid = row[1]
    
    rt_dic_merge = merge_dic(rt_dic, new_row[1])
    fav_dic_merge = merge_dic(fav_dic, new_row[2])
    
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, rt_dic_merge, fav_dic_merge)

# combine two C's (new row)
def mCombiners_t(r1, r2):
    list_merge = r1[0] + r2[0]
    rt_dic_merge = merge_dic(r1[1], r2[1])
    fav_dic_merge = merge_dic(r1[-1], r2[-1])
    
    return (list_merge, rt_dic_merge, fav_dic_merge)

In [84]:
# Do total count computing here: map selected ones only
# take selected columns from base_map_rdd
def map_total_cols(row):
    uid_i = 16
    tid_i = 12   
    rt_c_i = 9
    fav_c_i = 3
    word_list_i = -1
    
    return (row[uid_i],row[tid_i],row[rt_c_i],
            row[fav_c_i], row[word_list_i])
# df_tot    
df_tot_raw = base_map_rdd.map(map_total_cols)

df_tot_raw_k = df_tot_raw.keyBy(lambda x: x[1])
df_tot_raw.take(1)
#df_tot_raw_k.collect()

[(194531988, 362814954803433472, 0, 0, [u'psalm', u'119', u'133'])]

In [85]:
# helper-function for combineByKey 
def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)
        
def create_dic_set(row):
    rt_dic = {}
    fav_dic = {}
    rt_c = row[2]
    fav_c = row[3]

    word_list = row[-1]
    if rt_c > 0 and fav_c > 0:
        for w in word_list:
            add_to_dic(rt_dic, w, rt_c)
            add_to_dic(fav_dic, w, fav_c)
    elif rt_c > 0:
        for w in word_list:
            add_to_dic(rt_dic, w, rt_c)
    elif fav_c > 0 :
        for w in word_list:
            add_to_dic(fav_dic, w, fav_c)
    # return dic_set
    return (rt_dic, fav_dic)


# uid_i = 0, tid_i = 1, rt_c_i = 2, fav_c_i = 3, word_list_i = -1
def createCombiner(row):
    #print("len of createCombiner row is: ",len(row))
    rt_dic, fav_dic = create_dic_set(row)
    tid = row[1]
    # create a list of tid
    return ([tid], rt_dic, fav_dic)

# merges 2 dic together
def merge_dic(dic1, dic2):
    if len(dic1) < len(dic2):
        small_dic = dic1
        large_dic = dic2
    else:
        small_dic = dic2
        large_dic = dic1
    for k in small_dic.keys():
        # if k in small_dic
        if k in large_dic:
            # merge values together
            kv1 = small_dic.get(k)
            kv2 = large_dic.get(k)
            value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
            large_dic.update({k:value})
        # simply insert to large_dic    
        else:
            large_dic.update({k:small_dic.get(k)})
    # return large_dic        
    return large_dic

# which merges V into C
def mergeValue(new_row, row):
    #retrieve last 
    rt_dic, fav_dic = create_dic_set(row)
    tid = row[1]
    
    rt_dic_merge = merge_dic(rt_dic, new_row[1])
    fav_dic_merge = merge_dic(fav_dic, new_row[2])
    
    tid_list = new_row[0]
    tid_list.append(tid)
    # return the result
    return (tid_list, rt_dic_merge, fav_dic_merge)

# combine two C's (new row)
def mergeCombiners(r1, r2):
    list_merge = r1[0] + r2[0]
    rt_dic_merge = merge_dic(r1[1], r2[1])
    fav_dic_merge = merge_dic(r1[-1], r2[-1])
    
    return (list_merge, rt_dic_merge, fav_dic_merge)

In [95]:
# Trump Region
t_map_rdd = trump_rdd.map(map_row)
t_map_rdd.take(1)

t_raw = t_map_rdd.map(map_total_cols)
t_raw.take(2)

[(1666,
  u'955806333667807232',
  10053,
  43524,
  [u'thank',
   u'general',
   u'john',
   u'kelli',
   u'fantast',
   u'job',
   u'staff',
   u'other',
   u'white',
   u'hous',
   u'job',
   u'well',
   u'done',
   u'long',
   u'hour',
   u'fake',
   u'report',
   u'make',
   u'job',
   u'difficult',
   u'alway',
   u'great',
   u'win',
   u'us']),
 (1666,
  u'955795912374267907',
  13019,
  56437,
  [u'nobodi',
   u'know',
   u'sure',
   u'republican',
   u'amp',
   u'democrat',
   u'abl',
   u'reach',
   u'deal',
   u'daca',
   u'februari',
   u'8',
   u'everyon',
   u'tri',
   u'big',
   u'addit',
   u'focus',
   u'put',
   u'militari',
   u'strength',
   u'border',
   u'secur',
   u'the',
   u'dem',
   u'learn',
   u'shutdown',
   u'answer'])]

[(u'955806333667807232',
  (1666,
   u'955806333667807232',
   10053,
   43524,
   [u'thank',
    u'general',
    u'john',
    u'kelli',
    u'fantast',
    u'job',
    u'staff',
    u'other',
    u'white',
    u'hous',
    u'job',
    u'well',
    u'done',
    u'long',
    u'hour',
    u'fake',
    u'report',
    u'make',
    u'job',
    u'difficult',
    u'alway',
    u'great',
    u'win',
    u'us']))]

In [96]:
k_raw_trump = t_raw.keyBy(lambda x: x[0])
r = k_raw_trump.combineByKey(createCombiner, mergeValue, mergeCombiners)
r.take(1)

[(1666,
  ([u'955806333667807232',
    u'955795912374267907',
    u'955658992793149440',
    u'955056249925750784',
    u'954878124214415360',
    u'954843844402647040',
    u'954788467069870081',
    u'954680914998648833',
    u'954674157144477696',
    u'954541219970977793',
    u'954478044487520257',
    u'954456754137501697',
    u'954323750949982208',
    u'954097213608570880',
    u'954092417250222082',
    u'953979393180950528',
    u'953973568035086336',
    u'953951365532876800',
    u'953948941674078208',
    u'953796944564031489',
    u'953772162665590787',
    u'953771038114045954',
    u'953768657221451776',
    u'953270558573154305',
    u'953267506004754432',
    u'952887520790040576',
    u'952540700683497472',
    u'952538350333939713',
    u'952525384242876416',
    u'952301373479104512',
    u'952183452366929920',
    u'952166643202916352',
    u'951875499537641472',
    u'951813216291708928',
    u'951790999784783872',
    u'951788342647107584',
    u'95178558776528

In [97]:
def get_top_3(row_v):
    rt_dit = row_v[1]
    fv = -1
    fk = ''
    sv = -1
    sk = ''
    tv = -1
    tk = ''
    for k, v in rt_dit.iteritems():
        av = v[0]/v[1]
        if av > fv:
            tv = sv
            sv = fv
            fv = av            
            
            tk = sk
            sk = fk
            fk = k
    return ((fk, fv),(sk, sv),(tk, tv))
            
    
tmp = r.mapValues(get_top_3)

In [98]:
tmp.take(1)

[(1666, ((u'i', 1152925), (u'great', 702451), (u'https', 611368)))]

In [50]:
rdd_test.combineByKey(createCombiner, mergeValue, mergeCombiners).collect()

[(194531988, ([362814954803433472], {}, {})),
 (921796843, ([362814954803429378], {}, {})),
 (225700877,
  ([111, 110],
   {u'duermo': (2, 1), u'mejor': (2, 1)},
   {u'duermo': (3, 1), u'mejor': (3, 1)}))]

In [48]:
test_data = [(194531988,
  (194531988, 362814954803433472, 0, 0, [u'psalm', u'119', u'133'])),
 (921796843,(921796843,362814954803429378,0,0,
   [u'want',
    u'pee'
    u'dog',
    u'fell',
    u'asleep',
    u'lap',
    u'look',
    u'cute',
    u'want',
    u'wake',
    u'ugh',
    u'struggl'])),
 (225700877, (225700877, 111, 0, 0, [u'mejor', u'duermo'])),
(225700877, (225700877, 110, 2, 3, [u'mejor', u'duermo']))            
            ]

rdd_test = sc.parallelize(test_data)
rdd_test.collect()

[(194531988,
  (194531988, 362814954803433472, 0, 0, [u'psalm', u'119', u'133'])),
 (921796843,
  (921796843,
   362814954803429378,
   0,
   0,
   [u'want',
    u'peedog',
    u'fell',
    u'asleep',
    u'lap',
    u'look',
    u'cute',
    u'want',
    u'wake',
    u'ugh',
    u'struggl'])),
 (225700877, (225700877, 111, 0, 0, [u'mejor', u'duermo'])),
 (225700877, (225700877, 110, 2, 3, [u'mejor', u'duermo']))]

In [51]:
r1 =Row( date='2010-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133'])
r2 =Row( date='2000-99-99', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133'])
r3 =Row( date='2010-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=123, word_list=[u'psalm', u'119', u'133'])
r4 =Row( date='1999-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=['n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp='2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=123, word_list=[u'psalm', u'119', u'133'])

len(r1)

16

In [None]:
tmp = spark.createDataFrame([r1,r2,r3,r4 ]).rdd
k_tmp = tmp.keyBy(lambda x: x[14])
r = k_tmp.reduceByKey(lambda x, y: min(x[0],y[0]))

In [59]:
r.collect()

[(194531988, u'2000-99-99'), (123, u'1999-09-24')]

In [60]:
z = k_tmp.join(r)

z.collect()

[(194531988,
  (Row(date=u'2010-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=[u'n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp=u'2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133']),
   u'2000-99-99')),
 (194531988,
  (Row(date=u'2000-99-99', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=[u'n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp=u'2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=194531988, word_list=[u'psalm', u'119', u'133']),
   u'2000-99-99')),
 (123,
  (Row(date=u'2010-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=[u'n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London'

In [61]:
k_tmp.min(key=lambda x: x[1][0])

(123,
 Row(date=u'1999-09-24', favorite_count=0, followers_count=140, friends_count=204, hour=11, media_attached=False, phrase_list=[u'n/a'], retweet_count=0, sentence_count=1, tag_count=0, tid=362814954803433472, time_zone=u'London', timestamp=u'2010-09-24 11:04:40+00:00', tweet=u'Psalm 119:133', uid=123, word_list=[u'psalm', u'119', u'133']))

In [None]:
# helper-function for combineByKey 
def add_to_dic(dic, word, count):
    if word in dic:
        k_record = dic.get(word)
        dic.update({word: (k_record[0]+count, k_record[1]+1)})
    else:
        # word and tuple of count and # of times for word
        record = {word:(count,1)}
        dic.update(record)

In [86]:
    
# Originally (K,V) ==> (K,C)
# which Turns a V into a C  
def createCombiner(row)
    word_list = row[17]
    fav_count = row[3]
    retweet_count = row[9]
    # load hashmap        
    dic_set = result[0]
    rt_dic= dic_set[0]
    fav_dic = dic_set[1]
    # now update two dictionaries correspondingly
    for w in word_list:
        add_to_dic(rt_dic, w, retweet_count)
        add_to_dic(fav_dic, w, fav_count)
                
    # increment tweet count
    result[1] + 1
    
    pass

# which merges V into C
def mergeValue(row, new_row)
    # merge two dic together and return new dic
    def merge_dic(dic1, dic2):
        if len(dic1) < len(dic2):
            small_dic = dic1
            large_dic = dic2
        else:
            small_dic = dic2
            large_dic = dic1
        
        for k in small_dic.keys():
            # if k in small_dic
            if k in large_dic:
                # merge values together
                kv1 = small_dic.get(k)
                kv2 = large_dic.get(k)
                value = (kv1[0]+kv2[0], kv1[1]+kv1[1])
                large_dic.update({k:value})
            # simply insert to large_dic    
            else:
                large_dic.update({k:small_dic.get(k)})
        # return large_dic        
        return large_dic
    
    
    pass

# combine two C's (new row)
def mergeCombiners(r1, r2):
    pass




list

In [219]:
def ary_type(col_name, col_type):
    return StructField(col_name, ArrayType(StringType(), containsNull=True), True)
# build the table schema
def build_schema():
    schema = \
    StructType([
        StructField("city_name",StringType(), True),
        StructField("country_name",StringType(), True),
        StructField("creation_date",StringType(), True),
        StructField("creation_hour",IntegerType(), True),         
        StructField("creation_timestamp",StringType(), True),       
        StructField("followers_count",IntegerType(), True),
        StructField("friends_count",IntegerType(), True),
        StructField("hashtag_count",IntegerType(), True),
        StructField("media_attached",BooleanType(), False),
        StructField("phrase_token",ArrayType(StringType(), containsNull=True),
                    True),        
        StructField("sentence_count",IntegerType(), True),         
        StructField("tid",LongType(), False),  
        StructField("time_zone",StringType(), True),         
        StructField("tweet",StringType(), True),
        StructField("uid",LongType(), False),
        StructField("word_token_set",ArrayType(StringType(), containsNull=True),
                    True),                     
    ])
    return schema

In [220]:
# fit rdd with schema
schema = build_schema()
df_uncasted = spark.createDataFrame(base_map_rdd, schema)
df_uncasted.printSchema()

root
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [221]:
df_uncasted.first()

Row(city_name=None, country_name=None, creation_date=u'2010-09-24', creation_hour=11, creation_timestamp=u'2010-09-24 11:04:40+00:00', followers_count=140, friends_count=204, hashtag_count=0, media_attached=False, phrase_token=[u'n/a'], sentence_count=1, tid=362814954803433472, time_zone=u'London', tweet=u'Psalm 119:133', uid=194531988, word_token_set=[u'psalm', u'119', u'133'])

In [224]:
df_final = df_uncasted.select('creation_date',
                              'creation_hour',
                              df_uncasted.creation_timestamp.astype(TimestampType()),
                              'city_name',
                              'country_name',
                              'followers_count',
                              'friends_count',
                              'hashtag_count',
                              'media_attached',
                              'phrase_token',
                              'sentence_count',
                              'tid',
                              'time_zone',
                              'tweet',
                              'uid',
                              'word_token_set'
                             )
df_final.printSchema()

root
 |-- creation_date: string (nullable = true)
 |-- creation_hour: integer (nullable = true)
 |-- creation_timestamp: timestamp (nullable = true)
 |-- city_name: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- hashtag_count: integer (nullable = true)
 |-- media_attached: boolean (nullable = false)
 |-- phrase_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentence_count: integer (nullable = true)
 |-- tid: long (nullable = false)
 |-- time_zone: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- uid: long (nullable = false)
 |-- word_token_set: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [230]:
part.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table='b0',keyspace='twitter').save() 
    

In [231]:
def save_data_frame(df, table_name):
    df.write.format("org.apache.spark.sql.cassandra").\
            mode('append').options(table=table_name,keyspace='twitter').save()    

In [None]:
# Scrach Space: Previous Tests and Etc

In [50]:
from dateutil.parser import parse
import datetime

# parse twitter time string to (y-m-d,h-m-s)
def parse_time(creation_time):    
    # fcn that converts dt to date-str and time-str
    def cassandra_convert(dt):
        time = dt.strftime("%H:%M:%S")
        return (str(dt.date()), time)
    try:
        dt = parse(creation_time)
    except Exception as e:
        # 1. log and 
        # 2.use current system time instead
        dt = datetime.datetime.now()
    return cassandra_convert(dt)

# rez = None
# creation_time = u'Fri Sep 24 11:04:40 +0000 2010'
# try:
#     rez = parse(creation_time)
# except ValueError:
#     rez = 'default'
# except OverflowError:
#     rez = 'default'
# rez


datetime.datetime(2010, 9, 24, 11, 4, 40, tzinfo=tzlocal())

In [65]:
print (rez.date())
print(rez.time())


2010-09-24
11:04:40
11:04:40
