In [25]:
# Read every JSON file under the sampled Reddit directory on HDFS into one
# DataFrame, then print the number of rows that were loaded.
df = (
    spark.read
    .format("json")
    .load("hdfs://orion11:33001/reddit/sampled_reddit/*")
)
print(df.count())

309199315


In [26]:
from pyspark.sql.functions import year, month, dayofmonth, from_unixtime, desc
from pyspark.sql.types import TimestampType, DateType

# Regex matching the letters "bot" in any upper/lower-case combination.
botExpr = "[bB][oO][tT]"

# Keep only comments whose body is not the "[deleted]" placeholder and whose
# author name does not contain a match for the bot pattern.
# The LIKE pattern has no % or _ wildcards, so it behaves as an exact match.
df2 = df.where(
    ~df["body"].like("[deleted]")
    & ~df["author"].rlike(botExpr)
)


In [27]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col

def is_screamer(comment):
    """Flag a comment that is written entirely in upper case.

    Args:
        comment: The comment text, or None when the value is missing.

    Returns:
        1 if ``comment`` contains at least one cased character and every
        cased character is upper case (the ``str.isupper`` rule); otherwise 0.
        A missing (None) comment is scored 0 instead of raising.
    """
    # A null column value reaches a Python UDF as None; without this guard
    # calling .isupper() would raise AttributeError and fail the Spark task.
    if comment is None:
        return 0
    return 1 if comment.isupper() else 0
        
# Expose the Python predicate to Spark as a UDF that yields an integer, then
# add its result for each comment body as a new `is_screamer` column.
is_screamer_udf = udf(is_screamer, returnType=IntegerType())
df3 = df2.withColumn("is_screamer", is_screamer_udf(df2["body"]))

In [28]:
# Preview the flagged comments (first 20 rows, long cell values truncated).
df3.show(n=20, truncate=True)

+--------+----------------+--------------+----------------------+-----------------+--------------------+---------+----------------+-------+-----------+-------------+-----+------+------+-------+--------+-----------+----------+----------+--------------+-------+------------+-----+-----+------------+--------+-----------------+------------+---+------------+-----------+
|archived|          author|author_cakeday|author_flair_css_class|author_flair_text|                body|body_html|controversiality|created|created_utc|distinguished|downs|edited|gilded|     id| link_id|mod_reports|      name| parent_id|removal_reason|replies|retrieved_on|saved|score|score_hidden|stickied|        subreddit|subreddit_id|ups|user_reports|is_screamer|
+--------+----------------+--------------+----------------------+-----------------+--------------------+---------+----------------+-------+-----------+-------------+-----+------+------+-------+--------+-----------+----------+----------+--------------+-------+-------

In [29]:
import pyspark.sql.functions as sf
# Add up the per-comment screamer flags within each subreddit.
df4 = (
    df3
    .groupBy("subreddit")
    .agg(sf.sum("is_screamer").alias("screamer_score"))
)

# Rank subreddits from the highest score down and keep the top five.
screamer_df = (
    df4
    .select("subreddit", "screamer_score")
    .orderBy(sf.desc("screamer_score"))
    .limit(5)
)

In [30]:
# Display the five highest-scoring subreddits.
screamer_df.show(n=20, truncate=True)

+----------+--------------+
| subreddit|screamer_score|
+----------+--------------+
| AskReddit|        243568|
|       nfl|        100707|
|    hockey|         97863|
|The_Donald|         94483|
|     funny|         94309|
+----------+--------------+



In [32]:
import pyspark.sql.functions as sf
# Add up the per-comment screamer flags for each author.
df5 = (
    df3
    .groupBy("author")
    .agg(sf.sum("is_screamer").alias("screamer_score"))
)

# Order authors from the highest screamer score down (no row limit applied).
screamer_user_df = (
    df5
    .select("author", "screamer_score")
    .orderBy(sf.desc("screamer_score"))
)

In [33]:
# Display the 20 authors with the highest screamer scores.
screamer_user_df.show(n=20, truncate=True)

+-------------------+--------------+
|             author|screamer_score|
+-------------------+--------------+
|          [deleted]|        135253|
|     atomicimploder|          3516|
|      TheNitromeFan|          2277|
|       KingCaspianX|          1617|
|           the2belo|          1224|
|       Removedpixel|          1185|
|       PoppyOncrack|          1181|
|          QuestoGuy|          1049|
|       redditmortis|          1031|
|  TEA_PARTY_PATRIOT|          1016|
|         davidjl123|           935|
|       NightmareSyx|           895|
|         xHOCKEYx12|           861|
|     PotatoAssassin|           838|
|           Mooraell|           824|
| Release_the_KRAKEN|           786|
|             ambral|           772|
|        Sir_toolman|           745|
|             yangar|           716|
|SomalianRoadBuilder|           701|
+-------------------+--------------+
only showing top 20 rows



In [34]:
# Show the same author ranking with the "[deleted]" placeholder author removed.
(
    screamer_user_df
    .where(~screamer_user_df["author"].like("[deleted]"))
    .show()
)

+-------------------+--------------+
|             author|screamer_score|
+-------------------+--------------+
|     atomicimploder|          3516|
|      TheNitromeFan|          2277|
|       KingCaspianX|          1617|
|           the2belo|          1224|
|       Removedpixel|          1185|
|       PoppyOncrack|          1181|
|          QuestoGuy|          1049|
|       redditmortis|          1031|
|  TEA_PARTY_PATRIOT|          1016|
|         davidjl123|           935|
|       NightmareSyx|           895|
|         xHOCKEYx12|           861|
|     PotatoAssassin|           838|
|           Mooraell|           824|
| Release_the_KRAKEN|           786|
|             ambral|           772|
|        Sir_toolman|           745|
|             yangar|           716|
|       delatriangle|           701|
|SomalianRoadBuilder|           701|
+-------------------+--------------+
only showing top 20 rows

