In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder\
                    .appName("Tweet analysis")\
                    .getOrCreate()

In [3]:
tweets = spark.read\
              .format('json')\
              .load('tweets2/fake_tweets.json')

In [4]:
tweets.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- id: long (nullable = true)
 |-- in_reply_to_screen_name: string (nullable = true)
 |-- in_reply_to_status_id: long (nullable = true)
 |-- in_reply_to_user_id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- retweeted: boolean (nullable = true)
 |-- retweeted_status: struct (nullable = true)
 |    |-- contributors: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- created_at: string (nullable = true)
 |    |-- display_text_range: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |    |--

1. **created_at**
    - UTC time when this Tweet was created. Example:
2. **favorite_count** 
    - Indicates approximately how many times this Tweet has been liked by Twitter users.
3. **in_reply_to_screen_name** 
    - If the represented Tweet is a reply, this field will contain the screen name of the original Tweet’s author. 
4. **in_reply_to_status_id** 
    - If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s ID
5. **in_reply_to_user_id** 
    - If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s author ID. This will not necessarily always be the user directly mentioned in the Tweet
6. **retweet_count** 
    - Number of times this Tweet has been retweeted.
7. **retweeted** 
    - Indicates whether this Tweet has been Retweeted by the authenticating user.
8. **retweeted_status** 
    -  Users can amplify the broadcast of Tweets authored by other users by retweeting . Retweets can be distinguished from typical Tweets by the existence of a retweeted_status attribute. This attribute contains a representation of the original Tweet that was retweeted. Note that retweets of retweets do not show representations of the intermediary retweet, but only the original Twee

In [5]:
tweets.count()

13280

In [6]:
tweets.columns

['created_at',
 'favorite_count',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'user']

In [7]:
tweets.select('id').distinct().count()

13279

In [8]:
tweets.select('in_reply_to_screen_name').show()

+-----------------------+
|in_reply_to_screen_name|
+-----------------------+
|          BreitbartNews|
|                   null|
|                   null|
|                   null|
|                   null|
|                   null|
|                   null|
|          BreitbartNews|
|                   null|
|                   null|
|          BreitbartNews|
|                   null|
|                   null|
|                   null|
|                   null|
|          BreitbartNews|
|                   null|
|          BreitbartNews|
|                   null|
|                   null|
+-----------------------+
only showing top 20 rows



In [9]:
from pyspark.sql.functions import col

originals = tweets.where(col('in_reply_to_screen_name').isNull())
replies = tweets.where(col('in_reply_to_screen_name').isNotNull())

In [10]:
originals.count(), replies.count()

(8157, 5123)

In [11]:
originals.count() + replies.count()

13280

In [12]:
originals = originals.select('created_at',
                 'id','retweet_count', 'retweeted', 'retweeted_status',
                 'text','user')
originals.show(2)

+--------------------+-------------------+-------------+---------+--------------------+--------------------+--------------------+
|          created_at|                 id|retweet_count|retweeted|    retweeted_status|                text|                user|
+--------------------+-------------------+-------------+---------+--------------------+--------------------+--------------------+
|2019-02-06T08:59:...|1093071756909314049|            0|    false|[[], 2019-02-05T2...|RT @TheOnion: Gui...|[false, false, fa...|
|2019-02-06T08:59:...|1093071759383908353|            0|    false|[[], 2019-02-05T1...|RT @TheOnion: Cit...|[false, false, fa...|
+--------------------+-------------------+-------------+---------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [13]:
posts = tweets.where(col('text').startswith('RT') == False).where(col('in_reply_to_screen_name').isNull())

In [14]:
posts = posts.select('created_at',
                     'id',
                     'text',
                     'user')
posts.show(2)

+--------------------+-------------------+--------------------+--------------------+
|          created_at|                 id|                text|                user|
+--------------------+-------------------+--------------------+--------------------+
|2019-02-06T10:02:...|1093087417941151746|Former UFC fighte...|[false, false, fa...|
|2019-02-06T10:02:...|1093087422240321536|Michigan: Muslim ...|[false, false, fa...|
+--------------------+-------------------+--------------------+--------------------+
only showing top 2 rows



In [15]:
posts.show()

+--------------------+-------------------+--------------------+--------------------+
|          created_at|                 id|                text|                user|
+--------------------+-------------------+--------------------+--------------------+
|2019-02-06T10:02:...|1093087417941151746|Former UFC fighte...|[false, false, fa...|
|2019-02-06T10:02:...|1093087422240321536|Michigan: Muslim ...|[false, false, fa...|
|2019-02-06T10:02:...|1093087427172884481|Wisconsin: 7yo di...|[false, false, fa...|
|2019-02-06T10:08:...|1093089021708898304|The PoliticusUSA ...|[false, false, fa...|
|2019-02-06T12:20:...|1093122284007624704|Video: Muslim Con...|[false, false, fa...|
|2019-02-06T12:21:...|1093122580549193730|If Global Warming...|[false, false, fa...|
|2019-02-06T12:39:...|1093126929039777792|Washington Post: ...|[false, false, fa...|
|2019-02-06T12:51:...|1093130063338455044|WaPo Sat On Sexua...|[false, false, fa...|
|2019-02-06T12:53:...|1093130591933992961|WATCH: Nick Sandm...|[f

In [16]:
def extract_user_info(row):
    return sc.parallelize([row[0]['user']]).toDF().select('created_at',
                    'followers_count', 
                    'following',
                    'friends_count',
                    'id',
                    'screen_name',
                    'statuses_count',
                    'verified')

In [17]:
extract_user_info(posts.take(1)).show()

+--------------------+---------------+---------+-------------+--------+-------------+--------------+--------+
|          created_at|followers_count|following|friends_count|      id|  screen_name|statuses_count|verified|
+--------------------+---------------+---------+-------------+--------+-------------+--------------+--------+
|2009-09-22T02:34:...|          14860|    false|         2414|76227785|DCClothesline|        104815|   false|
+--------------------+---------------+---------+-------------+--------+-------------+--------------+--------+



In [18]:
posts_flattened = posts.withColumn('user_created_at', posts.user.created_at)\
    .withColumn('user_followers_count', posts.user.followers_count)\
    .withColumn('user_following', posts.user.following)\
    .withColumn('user_friends_count', posts.user.friends_count)\
    .withColumn('user_id', posts.user.id)\
    .withColumn('user_screen_name', posts.user.screen_name)\
    .withColumn('user_statuses_count', posts.user.statuses_count)\
    .withColumn('user_verified', posts.user.verified)

In [19]:
posts_flattened = posts_flattened.drop('user')

In [20]:
posts_flattened.columns

['created_at',
 'id',
 'text',
 'user_created_at',
 'user_followers_count',
 'user_following',
 'user_friends_count',
 'user_id',
 'user_screen_name',
 'user_statuses_count',
 'user_verified']

In [21]:
tweets = tweets.withColumn('user_created_at', posts.user.created_at)\
    .withColumn('user_followers_count', posts.user.followers_count)\
    .withColumn('user_following', posts.user.following)\
    .withColumn('user_friends_count', posts.user.friends_count)\
    .withColumn('user_id', posts.user.id)\
    .withColumn('user_screen_name', posts.user.screen_name)\
    .withColumn('user_statuses_count', posts.user.statuses_count)\
    .withColumn('user_verified', posts.user.verified)

In [22]:
tweets = tweets.drop('user', 'lang', 'source')

In [23]:
tweets.columns

['created_at',
 'favorite_count',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'text',
 'user_created_at',
 'user_followers_count',
 'user_following',
 'user_friends_count',
 'user_id',
 'user_screen_name',
 'user_statuses_count',
 'user_verified']

In [24]:
tweets.limit(5).toPandas().head()

Unnamed: 0,created_at,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,retweet_count,retweeted,retweeted_status,text,user_created_at,user_followers_count,user_following,user_friends_count,user_id,user_screen_name,user_statuses_count,user_verified
0,2019-02-06T08:59:25.000Z,0,1093071654564044806,BreitbartNews,1.093e+18,457984599.0,0,False,,@BreitbartNews She’s just dumb smdh,2017-08-03T05:23:40.000Z,333,False,519,892979259010146306,Lorenzen4Robert,1881,False
1,2019-02-06T08:59:49.000Z,0,1093071756909314049,,,,0,False,"([], 2019-02-05T20:26:08.000Z, [0, 140], ([], ...",RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,2015-07-22T13:30:18.000Z,68,False,147,3387352054,RPytleski,4718,False
2,2019-02-06T08:59:50.000Z,0,1093071759383908353,,,,0,False,"([], 2019-02-05T17:20:25.000Z, [0, 102], ([], ...",RT @TheOnion: City Of Boston Erects New Plaque...,2016-07-10T00:22:28.000Z,787,False,521,751934579435511808,zorrawoke,2574,False
3,2019-02-06T09:00:53.000Z,0,1093072023734112256,,,,0,False,"([], 2019-02-05T20:26:08.000Z, [0, 140], ([], ...",RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,2013-09-23T17:31:28.000Z,756,False,1109,1897895070,CollinsKaranja3,34385,False
4,2019-02-06T09:00:56.000Z,0,1093072037017460736,,,,0,False,"([], 2019-02-06T05:00:24.000Z, None, ([], [], ...",RT @BreitbartNews: Uh oh. https://t.co/Pj9A9Abi00,2013-03-26T18:20:50.000Z,3545,False,3590,1305192181,Rushelle8,146855,False


In [25]:
fake_tweets = tweets.where(col('text').startswith('RT') == False).where(col('in_reply_to_screen_name').isNull())

In [26]:
fake_tweets.limit(5).toPandas().head()

Unnamed: 0,created_at,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,retweet_count,retweeted,retweeted_status,text,user_created_at,user_followers_count,user_following,user_friends_count,user_id,user_screen_name,user_statuses_count,user_verified
0,2019-02-06T10:02:03.000Z,0,1093087417941151746,,,,0,False,,"Former UFC fighter, Nick “The Jersey Devil” Ca...",2009-09-22T02:34:29.000Z,14860,False,2414,76227785,DCClothesline,104815,False
1,2019-02-06T10:02:04.000Z,0,1093087422240321536,,,,0,False,,Michigan: Muslim Congresswoman Rashida Tlaib R...,2009-09-22T02:34:29.000Z,14860,False,2414,76227785,DCClothesline,104816,False
2,2019-02-06T10:02:05.000Z,0,1093087427172884481,,,,0,False,,Wisconsin: 7yo dies after being ‘buried in cof...,2009-09-22T02:34:29.000Z,14860,False,2414,76227785,DCClothesline,104817,False
3,2019-02-06T10:08:26.000Z,0,1093089021708898304,,,,0,False,,The PoliticusUSA Daily is out! https://t.co/2J...,2008-05-15T23:31:00.000Z,68421,False,4026,14792049,politicususa,48051,False
4,2019-02-06T12:20:36.000Z,0,1093122284007624704,,,,0,False,,Video: Muslim Congresswoman Ilhan Omar blamed ...,2009-09-22T02:34:29.000Z,14858,False,2414,76227785,DCClothesline,104818,False


In [27]:
tweets.count() - 96

13184

In [28]:
fake_tweets.createOrReplaceTempView('fake_tweets')

In [29]:
spark.sql("SELECT id FROM fake_tweets").toPandas().head(5)

Unnamed: 0,id
0,1093087417941151746
1,1093087422240321536
2,1093087427172884481
3,1093089021708898304
4,1093122284007624704


In [30]:
tweets.createOrReplaceTempView('tweets')

In [31]:
not_original = spark.sql(
    """
        SELECT * FROM tweets
        WHERE tweets.id NOT IN (SELECT id FROM fake_tweets)
    """)

In [32]:
not_original.count()

13184

In [33]:
not_original.toPandas().head()

Unnamed: 0,created_at,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,retweet_count,retweeted,retweeted_status,text,user_created_at,user_followers_count,user_following,user_friends_count,user_id,user_screen_name,user_statuses_count,user_verified
0,2019-02-06T08:59:25.000Z,0,1093071654564044806,BreitbartNews,1.093e+18,457984599.0,0,False,,@BreitbartNews She’s just dumb smdh,2017-08-03T05:23:40.000Z,333,False,519,892979259010146306,Lorenzen4Robert,1881,False
1,2019-02-06T08:59:49.000Z,0,1093071756909314049,,,,0,False,"([], 2019-02-05T20:26:08.000Z, [0, 140], ([], ...",RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,2015-07-22T13:30:18.000Z,68,False,147,3387352054,RPytleski,4718,False
2,2019-02-06T08:59:50.000Z,0,1093071759383908353,,,,0,False,"([], 2019-02-05T17:20:25.000Z, [0, 102], ([], ...",RT @TheOnion: City Of Boston Erects New Plaque...,2016-07-10T00:22:28.000Z,787,False,521,751934579435511808,zorrawoke,2574,False
3,2019-02-06T09:00:53.000Z,0,1093072023734112256,,,,0,False,"([], 2019-02-05T20:26:08.000Z, [0, 140], ([], ...",RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,2013-09-23T17:31:28.000Z,756,False,1109,1897895070,CollinsKaranja3,34385,False
4,2019-02-06T09:00:56.000Z,0,1093072037017460736,,,,0,False,"([], 2019-02-06T05:00:24.000Z, None, ([], [], ...",RT @BreitbartNews: Uh oh. https://t.co/Pj9A9Abi00,2013-03-26T18:20:50.000Z,3545,False,3590,1305192181,Rushelle8,146855,False


In [34]:
not_original.createOrReplaceTempView('not_originals')

In [35]:
retweets = spark.sql(
    """
        SELECT id as retweet_id, created_at, text as retweet_text, retweeted_status.retweet_count,
                              retweet_count,
                              retweeted_status.id as original_tweet_id,
                              retweeted_status.text as original_tweet_text,
                              retweeted_status.user.screen_name as original_tweet_author
        FROM not_originals 
        WHERE retweeted_status IS NOT NULL
    """)
            

In [36]:
retweets.toPandas().head()

Unnamed: 0,retweet_id,created_at,retweet_text,retweet_count,retweet_count.1,original_tweet_id,original_tweet_text,original_tweet_author
0,1093071756909314049,2019-02-06T08:59:49.000Z,RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,538,0,1092882083956051970,Guilt-Ridden Stacey Abrams Wondering When She ...,TheOnion
1,1093071759383908353,2019-02-06T08:59:50.000Z,RT @TheOnion: City Of Boston Erects New Plaque...,415,0,1092835348202377216,City Of Boston Erects New Plaque Commemorating...,TheOnion
2,1093072023734112256,2019-02-06T09:00:53.000Z,RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,539,0,1092882083956051970,Guilt-Ridden Stacey Abrams Wondering When She ...,TheOnion
3,1093072037017460736,2019-02-06T09:00:56.000Z,RT @BreitbartNews: Uh oh. https://t.co/Pj9A9Abi00,495,0,1093011503840722944,Uh oh. https://t.co/Pj9A9Abi00,BreitbartNews
4,1093072158060883969,2019-02-06T09:01:25.000Z,RT @BreitbartNews: Trump calls out VA Gov. Ral...,433,0,1093016479069585408,Trump calls out VA Gov. Ralph Northam's bizarr...,BreitbartNews


In [37]:
retweets.count()

8061

In [38]:
retweets.createOrReplaceTempView('retweets')

In [39]:
valid_retweets = spark.sql(
    """
        SELECT * FROM retweets WHERE original_tweet_id IN (SELECT id FROM fake_tweets)
    """)

In [45]:
valid_retweets.groupBy('original_tweet_id').count().show(100)

+-------------------+-----+
|  original_tweet_id|count|
+-------------------+-----+
|1093155877853585408|  169|
|1093151341239824385|    1|
|1093248996397776898|  261|
|1093140984588840960|    6|
|1093130591933992961|    2|
|1093389498438819840|    2|
|1093177650103889920|    7|
|1093173610477506562|    2|
|1093189905843478528|  113|
|1093194545985191943|  190|
|1093126929039777792|    1|
|1093270193944567808|    1|
|1093175662964326401|    3|
|1093138208035090432|  337|
|1093225717997944833|    7|
|1093237925175791616|   32|
|1093177019016323072|   14|
|1093217791598055427|  297|
|1093174746668195842|   16|
|1093169060777480192|    5|
|1093089021708898304|   13|
|1093242651162394624|    4|
|1093248908418080769|    9|
|1093150594280255489|   18|
|1093202892989837316|   25|
|1093215533162528768|   96|
|1093391939171422208|    2|
|1093173820008087553|    4|
|1093238886690684933|    8|
|1093396973351247872|    1|
|1093147294927634433|    2|
|1093150638937174017|  177|
|1093214039478542341

In [41]:
spark.sql(
    """
        SELECT * FROM retweets WHERE original_tweet_id NOT IN (SELECT id FROM fake_tweets)
    """).toPandas().head(5)

Unnamed: 0,retweet_id,created_at,retweet_text,retweet_count,retweet_count.1,original_tweet_id,original_tweet_text,original_tweet_author
0,1093071756909314049,2019-02-06T08:59:49.000Z,RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,538,0,1092882083956051970,Guilt-Ridden Stacey Abrams Wondering When She ...,TheOnion
1,1093071759383908353,2019-02-06T08:59:50.000Z,RT @TheOnion: City Of Boston Erects New Plaque...,415,0,1092835348202377216,City Of Boston Erects New Plaque Commemorating...,TheOnion
2,1093072023734112256,2019-02-06T09:00:53.000Z,RT @TheOnion: Guilt-Ridden Stacey Abrams Wonde...,539,0,1092882083956051970,Guilt-Ridden Stacey Abrams Wondering When She ...,TheOnion
3,1093072037017460736,2019-02-06T09:00:56.000Z,RT @BreitbartNews: Uh oh. https://t.co/Pj9A9Abi00,495,0,1093011503840722944,Uh oh. https://t.co/Pj9A9Abi00,BreitbartNews
4,1093072158060883969,2019-02-06T09:01:25.000Z,RT @BreitbartNews: Trump calls out VA Gov. Ral...,433,0,1093016479069585408,Trump calls out VA Gov. Ralph Northam's bizarr...,BreitbartNews


In [42]:
originals.select('id').show()

+-------------------+
|                 id|
+-------------------+
|1093071756909314049|
|1093071759383908353|
|1093072023734112256|
|1093072037017460736|
|1093072158060883969|
|1093072158933307392|
|1093072313883545600|
|1093072588325208066|
|1093072620558331904|
|1093072624333275136|
|1093072977141354498|
|1093073120850833408|
|1093073404565938176|
|1093073512670154752|
|1093073643473641472|
|1093073792564379649|
|1093073823086403584|
|1093074018478055425|
|1093074141694083073|
|1093074177182044160|
+-------------------+
only showing top 20 rows



In [43]:
fake_tweets.count()

96