In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession for this notebook (getOrCreate reuses an existing
# session if one is already active), registered under the app name below.
spark = (
    SparkSession.builder
    .appName('Tweet Analyzer')
    .getOrCreate()
)

In [3]:
# Load the tweets file through Spark's JSON data source; the schema is
# inferred from the records.
# The previous version also set option('header', 'false'). 'header' is a CSV
# reader option that the JSON source does not define, so it was dropped to
# avoid implying it has any effect here.
data = (
    spark.read
    .format('json')
    .load('src/main/resources/tweets.json')
)

In [5]:
# Print the inferred schema of `data` as an indented tree, to see which
# fields are nested structs/arrays before selecting from them.
data.printSchema()

root
 |-- contributors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- media_url: string (n

In [8]:
# List the top-level column names of `data` (nested fields are not expanded).
data.columns

['contributors',
 'created_at',
 'display_text_range',
 'entities',
 'extended_entities',
 'extended_tweet',
 'favorite_count',
 'favorited',
 'filter_level',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'possibly_sensitive',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'truncated',
 'user',
 'withheld_copyright',
 'withheld_in_countries']

In [12]:
# Number of rows when projecting onto `id` (duplicates, if any, are counted).
data.select('id').count()

201

In [13]:
# Number of distinct `id` values; if this equals data.select('id').count(),
# no id appears more than once in the dataset.
data.select('id').distinct().count()

201

In [57]:
# Preview the retweet-related columns alongside the tweet id, text and user.
# show() prints the first rows with long values truncated.
data.select(
    'id',
    'retweet_count',
    'retweeted',
    'retweeted_status',
    'text',
    'user',
).show()

+-------------------+-------------+---------+--------------------+--------------------+--------------------+
|                 id|retweet_count|retweeted|    retweeted_status|                text|                user|
+-------------------+-------------+---------+--------------------+--------------------+--------------------+
|1086247780488826880|            0|    false|[[], 2019-01-17T1...|RT @TheOnion: Pre...|[false, false, fa...|
|1086248013771800577|            0|    false|[[], 2019-01-18T0...|RT @TheOnion: "Ch...|[false, false, fa...|
|1086248347541950470|            0|    false|[[], 2019-01-17T1...|RT @TheOnion: Pre...|[false, false, fa...|
|1086248380454633472|            0|    false|[[], 2019-01-18T0...|RT @TheOnion: "Ch...|[false, false, fa...|
|1086249047470546944|            0|    false|[[], 2019-01-18T0...|RT @TheOnion: Rep...|[false, false, fa...|
|1086249073781456896|            0|    false|[[], 2019-01-17T1...|RT @TheOnion: Zam...|[false, false, fa...|
|108624910582590668

In [149]:
# Project `data` down to the embedded original-tweet struct and the tweet text.
# NOTE(review): no filter is applied here, so despite its name `retweets`
# still holds every row of `data`, not only retweets.
retweets = data.select("retweeted_status", 'text')

In [152]:
# Expose `retweets` to Spark SQL under the name "retweets".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; it registers the same session-scoped temporary
# view and overwrites any existing one, so re-running this cell is safe.
retweets.createOrReplaceTempView("retweets")

In [160]:
# Show the rows of the "retweets" view whose text does not start with "RT".
# Uses the `spark` session created at the top of the notebook: `sqlContext`
# is never defined in this notebook, so relying on it only works when the
# kernel happens to inject it, and it breaks on a plain Restart & Run All.
spark.sql('SELECT * FROM retweets WHERE retweets.text NOT LIKE "RT%"').show()

+----------------+--------------------+
|retweeted_status|                text|
+----------------+--------------------+
|            null|Insurance Only Co...|
|            null|@TheOnion * May b...|
|            null|@TheOnion That on...|
|            null|@TheOnion The Ike...|
|            null|@TheOnion Well shit.|
|            null|@TheOnion Sadly, ...|
|            null|@TheOnion For wha...|
|            null|@TheOnion The blo...|
|            null|@TheOnion Just li...|
|            null|@TheOnion Is it h...|
|            null|@TheOnion @zei_na...|
|            null|@TheOnion Posted....|
|            null|@TheOnion Dissapo...|
|            null|@TheOnion Store b...|
|            null|@TheOnion This is...|
|            null|@TheOnion Double ...|
|            null|New Employee Has ...|
+----------------+--------------------+



In [158]:
# Total number of rows in `data`.
data.count()

201

In [107]:
# Take one row, pull out its single selected column (`retweeted_status`),
# convert that Row to a dict and list its field names.
# NOTE(review): this fails if the row returned by take(1) has a null
# retweeted_status — presumably the first row is a retweet; confirm.
data.select('retweeted_status').take(1)[0][0].asDict().keys()

dict_keys(['contributors', 'created_at', 'display_text_range', 'entities', 'extended_entities', 'extended_tweet', 'favorite_count', 'favorited', 'filter_level', 'id', 'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'possibly_sensitive', 'quoted_status', 'quoted_status_id', 'quoted_status_id_str', 'retweet_count', 'retweeted', 'source', 'text', 'truncated', 'user', 'withheld_copyright', 'withheld_in_countries'])

In [110]:
# Print every field of one row's `retweeted_status` struct as "name :  value".
# The row is fetched once up front: the previous version re-ran
# data.select(...).take(1) inside the loop body, launching a separate Spark
# job for every key just to read the same row again.
# NOTE(review): asDict() fails if the row returned by take(1) has a null
# retweeted_status — presumably the first row is a retweet; confirm.
first_retweeted_status = data.select('retweeted_status').take(1)[0][0].asDict()
for field_name, field_value in first_retweeted_status.items():
    print(field_name, ": ", field_value)

contributors :  []
created_at :  2019-01-17T18:23:02.000Z
display_text_range :  [0, 114]
entities :  Row(hashtags=[], media=[Row(display_url='pic.twitter.com/nyU8y1oEDD', expanded_url='https://twitter.com/TheOnion/status/1085965736303099904/photo/1', id=1085965734294048773, id_str='1085965734294048773', indices=[115, 138], media_url='http://pbs.twimg.com/media/DxIgFObX0AU7GnZ.jpg', media_url_https='https://pbs.twimg.com/media/DxIgFObX0AU7GnZ.jpg', sizes=Row(large=Row(h=900, resize='fit', w=1600), medium=Row(h=675, resize='fit', w=1200), small=Row(h=383, resize='fit', w=680), thumb=Row(h=150, resize='crop', w=150)), type='photo', url='https://t.co/nyU8y1oEDD')], symbols=[], urls=[Row(display_url='trib.al/ABjZ5Sp', expanded_url='https://trib.al/ABjZ5Sp', indices=[91, 114], url='https://t.co/LAmqOPp2oe')], user_mentions=[])
extended_entities :  Row(hashtags=[], media=[Row(display_url='pic.twitter.com/nyU8y1oEDD', expanded_url='https://twitter.com/TheOnion/status/1085965736303099904/photo/