In [7]:
import time
from pyspark.sql import SparkSession
from operator import add
from pyspark.sql.functions import explode, split, desc

#################### Setup spark session ####################

# A previous run of this cell may have left a session behind; stop it before
# building a new one.
if 'spark_session' in locals():
    spark_session.stop()

# Build (or fetch) the session against the standalone master.
# Dynamic allocation is enabled with shuffle tracking, the external shuffle
# service is disabled, the executor idle timeout is 30 seconds, and the
# driver / block-manager ports are pinned to fixed values.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.224:7077")
    .appName("Experiment")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle on the underlying SparkContext (entry point of the RDD API)
spark_context = spark_session.sparkContext

# Start the timer immediately before the corpus is read
start_time = time.time()

# Read the Reddit corpus (JSON) from HDFS into a DataFrame
reddit_df = spark_session.read.format("json").load(
    "hdfs://192.168.2.224:9000/user/ubuntu/reddit/corpus-webis-tldr-17.json"
)

#################### Most popular subreddits ####################

# Number of rows per subreddit, largest count first
popular_subreddits = (
    reddit_df
    .groupBy("subreddit")
    .count()
    .orderBy(desc("count"))
)

# Print a heading, then the ten subreddits with the highest counts
print("Top 10 of the most popular subreddits\n")
popular_subreddits.show(n=10)

#################### Most frequent occurring words ####################

# Tokenize the body column on runs of whitespace, one output row per token
words = reddit_df.select(explode(split(reddit_df.body, r"\s+")).alias("word"))

# split() keeps the empty strings produced when a body starts or ends with
# whitespace; those are not words, so drop them before counting
words = words.filter(words["word"] != "")

# Count occurrences of each word
word_counts = words.groupBy("word").count()

# Sort by count in descending order to find the most frequent words
most_frequent_words = word_counts.orderBy(desc("count"))

# Show the top 10 most frequent words
print("Top 10 of the most frequent occurring words:\n")
most_frequent_words.show(10)

#################### Execution time measurement ####################

# Stop the timer
end_time = time.time()

# Elapsed seconds between the start timestamp and now
duration = end_time - start_time

# Report the elapsed time
print(f"Execution time: {duration} seconds")

#################### Stop spark session ####################

# Stop the SparkContext obtained from spark_session so that the cores held by
# this application are released for another application/experiment.
spark_context.stop()

                                                                                

Top 10 of the most popular subreddits



                                                                                

+-------------------+------+
|          subreddit| count|
+-------------------+------+
|          AskReddit|589947|
|      relationships|352049|
|    leagueoflegends|109307|
|               tifu| 52219|
|relationship_advice| 50416|
|              trees| 47286|
|             gaming| 43851|
|            atheism| 43268|
|      AdviceAnimals| 40783|
|              funny| 40171|
+-------------------+------+
only showing top 10 rows

Top 10 of the most frequent occurring words:



                                                                                

+----+--------+
|word|   count|
+----+--------+
| the|38303658|
|  to|34684398|
|   I|34223440|
| and|32121991|
|   a|27233249|
|  of|19675785|
|that|13871944|
|  in|13539182|
|  is|11695201|
|  my|11226285|
+----+--------+
only showing top 10 rows

Execution time: 1883.464242696762 seconds
