In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session (or reuse one that is already running); every
# later cell reads data and runs SQL through this `spark` handle.
spark = (
    SparkSession.builder
    .appName('Analyzing tweet stories data')
    .getOrCreate()
)

In [3]:
# Load original_posts.json into a DataFrame with Spark's JSON reader.
original_posts = (
    spark.read
    .format('json')
    .load('original_posts.json')
)

In [4]:
# Load retweet_posts.json into a DataFrame with Spark's JSON reader.
retweet_posts = (
    spark.read
    .format('json')
    .load('retweet_posts.json')
)

In [5]:
# Register the DataFrame as a temp view so spark.sql() queries can refer to
# it as the table `original_posts`.
original_posts.createOrReplaceTempView('original_posts')

In [6]:
# Register the DataFrame as a temp view so spark.sql() queries can refer to
# it as the table `retweet_posts`.
retweet_posts.createOrReplaceTempView('retweet_posts')

In [20]:
# Count rows in original_posts per user_screen_name, largest count first,
# and display the result as a pandas DataFrame.
posts_per_user_query = """
        SELECT COUNT(user_screen_name) as total_count, user_screen_name 
        FROM original_posts 
        GROUP BY user_screen_name
        ORDER BY total_count DESC
        """
posts_per_user = spark.sql(posts_per_user_query)
posts_per_user.toPandas()

Unnamed: 0,total_count,user_screen_name
0,105,DCClothesline
1,73,politicususa
2,60,TheOnion
3,36,BreitbartNews
4,17,worldnetdaily
5,6,OccupyDemocrats


### Find the top 25 most-retweeted posts

In [59]:
# Rank retweeted posts by the number of rows in retweet_posts that reference
# them, and keep the 25 with the highest counts.
#
# The secondary sort key makes the result reproducible: with only
# `ORDER BY total DESC`, posts tied on `total` come back in arbitrary order,
# so the LIMIT 25 cutoff could keep different posts from one run to the next.
# The explicit alias keeps the output column name `retweet_status_id`.
top_25 = spark.sql(
    """
        SELECT  COUNT(retweet_status.retweet_status_id) AS total,
                retweet_status.retweet_status_id AS retweet_status_id
        FROM retweet_posts
        GROUP BY retweet_status.retweet_status_id
        ORDER BY total DESC, retweet_status_id ASC
        LIMIT 25
    """)

In [60]:
# Collect the ranking to the driver as a pandas DataFrame for display.
top_25.toPandas()

Unnamed: 0,total,retweet_status_id
0,6560,1095080391084523520
1,2717,1095092293537882117
2,2670,1095429857905913857
3,2536,1095321394299179008
4,2448,1095801413568024576
5,2362,1095064441912213504
6,1619,1095758772268662784
7,1397,1095161529308233728
8,1323,1095679869856501760
9,995,1095695867930136576


### Pick post id '1096099784262381568' and inspect the post and its retweets

In [71]:
# Select the row(s) of original_posts whose id matches the chosen post,
# then preview the match as a pandas DataFrame.
post_query = """
        SELECT * FROM original_posts WHERE id = 1096099784262381568
    """
post = spark.sql(post_query)

post.toPandas().head()

Unnamed: 0,created_at,id,text,user_followers_count,user_friends_count,user_id,user_screen_name,user_verified
0,2019-02-14T17:32:07.000Z,1096099784262381568,Man Hoping Girlfriend Doesn't Notice Valentine...,11032690,14,14075928,TheOnion,True


In [72]:
# Select every row of retweet_posts that is a retweet of the chosen post.
post_retweets = spark.sql(
    """
        SELECT * FROM retweet_posts WHERE retweet_status.retweet_status_id=1096099784262381568
    """)
# Preview five rows. limit() runs inside Spark, so only those rows are
# collected to the driver; toPandas().head() would first pull every matching
# retweet into local memory just to display five of them.
# `post_retweets` itself still holds the full, unlimited result.
post_retweets.limit(5).toPandas()

Unnamed: 0,created_at,id,retweet_status,text,user_followers_count,user_friends_count,user_id,user_screen_name,user_verified
0,2019-02-14T17:35:01.000Z,1096100512062009344,"(1096099784262381568, Man Hoping Girlfriend Do...",RT @TheOnion: Man Hoping Girlfriend Doesn't No...,36,112,27731830,andhraite,False
1,2019-02-14T17:38:36.000Z,1096101416295374849,"(1096099784262381568, Man Hoping Girlfriend Do...",RT @TheOnion: Man Hoping Girlfriend Doesn't No...,351,1258,389687876,ColmMagan,False
2,2019-02-14T17:40:32.000Z,1096101899705729029,"(1096099784262381568, Man Hoping Girlfriend Do...",RT @TheOnion: Man Hoping Girlfriend Doesn't No...,193,975,957342374,GenisisDancer,False
3,2019-02-14T18:29:36.000Z,1096114249800314881,"(1096099784262381568, Man Hoping Girlfriend Do...",RT @TheOnion: Man Hoping Girlfriend Doesn't No...,399,282,2717357373,MIKEYJEAN1,False
4,2019-02-14T17:43:29.000Z,1096102641854078977,"(1096099784262381568, Man Hoping Girlfriend Do...",RT @TheOnion: Man Hoping Girlfriend Doesn't No...,222,282,731252286,PBSamlish,False


In [73]:
# Write the selected post as JSON. Spark creates a directory named
# 'post.json' containing the part file(s); coalesce(1) reduces the output to
# a single partition, i.e. one part file.
# mode('overwrite') keeps this cell re-runnable: the default save mode raises
# an error when the target path already exists, which breaks Restart & Run All.
post.coalesce(1).write.format('json').mode('overwrite').save('post.json')

In [74]:
# Write the retweets of the selected post as JSON. Spark creates a directory
# named 'post_retweets.json' containing the part file(s); coalesce(1) reduces
# the output to a single partition, i.e. one part file.
# mode('overwrite') keeps this cell re-runnable: the default save mode raises
# an error when the target path already exists, which breaks Restart & Run All.
post_retweets.coalesce(1).write.format('json').mode('overwrite').save('post_retweets.json')