In [39]:
import time
from pyspark.sql import SparkSession
from operator import add
from pyspark.sql.functions import explode, split, desc, col

#################### Setup spark session ####################

# When this cell is re-run in the same kernel, shut the previous session
# down first so that a fresh one is built below.
if 'spark_session' in locals():
    spark_session.stop()

# Session against the standalone master, with dynamic allocation enabled
# (shuffle tracking on, external shuffle service off), a 30s executor idle
# timeout, and fixed driver / block-manager ports.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.224:7077")
    .appName("Experiment")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# RDD API
spark_context = spark_session.sparkContext

# Time the dataset load. No schema is passed to read.json, so the measured
# span covers whatever work Spark does to derive the schema printed below.
start_time = time.time()
reddit_df = spark_session.read.json("hdfs://192.168.2.224:9000/user/ubuntu/reddit/corpus-webis-tldr-17.json")
reddit_df.printSchema()
end_time = time.time()

# Elapsed wall-clock time, in seconds and in minutes
duration_sec = end_time - start_time
duration_min = duration_sec / 60

print("Time to load the dataset (seconds):", duration_sec, "seconds")
print("Time to load the dataset (minutes):", duration_min, "minutes")

24/03/14 13:00:03 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)

Time to load the dataset (seconds): 80.50780272483826 seconds
Time to load the dataset (minutes): 1.3417967120806376 minutes


                                                                                

In [38]:
# Start time. DataFrame transformations below are lazy; the measured span is
# dominated by the three .show() actions, each of which triggers a Spark job.
start_time = time.time()

#################### Most popular subreddits ####################

# Count rows per subreddit and rank them, largest first
popular_subreddits = reddit_df.groupBy("subreddit").count().orderBy(desc("count"))

# Show the top 10 most popular subreddits
print("Top 10 of the most popular subreddits\n")
popular_subreddits.show(10)

#################### Most frequent occurring words ####################

# Tokenize the body column on runs of whitespace: one output row per token.
# Tokens are kept verbatim (case and punctuation are not normalised).
words = reddit_df.select(explode(split(reddit_df.body, "\\s+")).alias("word"))

# Count occurrences of each distinct token
word_counts = words.groupBy("word").count()

# Rank tokens by frequency, most frequent first
most_frequent_words = word_counts.orderBy(desc("count"))

# Show the top 10 most frequent words
print("Top 10 of the most frequent occurring words:\n")
most_frequent_words.show(10)

#################### Authors with the most bad words used ####################

# Tokenize the body again, this time keeping the post id and author next to
# every token. Carrying "author" through the explode is what lets the
# per-author count below be computed without joining back onto reddit_df.
words_df = reddit_df.select(
    "id",
    "author",
    explode(split(reddit_df.body, "\\s+")).alias("word"),
)

# Load the list of bad words, one entry per line. Blank lines are dropped:
# an empty entry would otherwise match the empty tokens that the whitespace
# split can produce, and those would be counted as bad words.
with open("full-list-of-bad-words_text-file_2022_05_05.txt", "r", encoding="utf-8") as file:
    bad_words = [entry for entry in (line.strip() for line in file) if entry]

# Keep only tokens that appear in the list. The match is exact and
# case-sensitive, so capitalised or punctuation-attached variants do not match.
bad_words_df = words_df.filter(words_df.word.isin(bad_words))

# Occurrences of each bad word, most used first (kept for ad-hoc inspection;
# lazy, so it costs nothing unless an action is run on it)
bad_word_counts_df = bad_words_df.groupBy("word").count()
sorted_bad_word_counts_df = bad_word_counts_df.orderBy(col("count").desc())

# Count bad-word tokens per author directly from the exploded rows.
# NOTE(review): this matches the earlier self-join on "id" provided ids are
# unique and non-null in the dataset - confirm if exact parity matters.
bad_words_count_df = bad_words_df.groupBy("author").count()

# Rank authors by number of bad-word tokens, highest first
sorted_bad_words_count_df = bad_words_count_df.orderBy(col("count").desc())

# Show the top 10 authors with the most bad words.
# NOTE(review): "[deleted]" is not filtered out and appears in this ranking;
# presumably it is a placeholder rather than a single author - confirm
# whether it should be excluded.
print("Top 10 authors with the most bad words:\n")
sorted_bad_words_count_df.show(10)

#################### Execution time measurement ####################

# End time
end_time = time.time()

# Elapsed wall-clock time, in seconds and in minutes
duration_sec = end_time - start_time
duration_min = duration_sec / 60

# Print duration
print("Time to run the experiment (seconds):", duration_sec, "seconds")
print("Time to run the experiment (minutes):", duration_min, "minutes")

Top 10 of the most popular subreddits



                                                                                

+-------------------+------+
|          subreddit| count|
+-------------------+------+
|          AskReddit|589947|
|      relationships|352049|
|    leagueoflegends|109307|
|               tifu| 52219|
|relationship_advice| 50416|
|              trees| 47286|
|             gaming| 43851|
|            atheism| 43268|
|      AdviceAnimals| 40783|
|              funny| 40171|
+-------------------+------+
only showing top 10 rows

Top 10 of the most frequent occurring words:



                                                                                

+----+--------+
|word|   count|
+----+--------+
| the|38303658|
|  to|34684398|
|   I|34223440|
| and|32121991|
|   a|27233249|
|  of|19675785|
|that|13871944|
|  in|13539182|
|  is|11695201|
|  my|11226285|
+----+--------+
only showing top 10 rows

Top 10 authors with the most bad words:





+------------------+------+
|            author| count|
+------------------+------+
|         [deleted]|769514|
|      iamtotalcrap|  5721|
|        Furiousmoe|  1951|
|           codayus|  1853|
|           DejaBoo|  1365|
|        pixis-4950|  1347|
|       Death_Star_|  1205|
|        Typhos1234|   977|
|ExceptionToTheRule|   974|
|           p_U_c_K|   948|
+------------------+------+
only showing top 10 rows

Time to run the experiment (seconds): 196.07083868980408 seconds
Time to run the experiment (minutes): 3.267847311496735 minutes


                                                                                

In [None]:
#################### Stop spark session ####################

# Release the cores for another application/experiment!
# Stop via the session, as the setup cell does when it is re-run:
# SparkSession.stop() stops the underlying SparkContext as well.
spark_session.stop()