### Consumer

This consumer should receive the Reddit data from the producer and perform data processing.

In [None]:
# Stop the streaming context `ssc` (created in the setup cell further down).
# NOTE(review): the original comment said "SparkContext", but `ssc` is a
# StreamingContext. Whether the underlying SparkContext is stopped as well
# depends on the defaults of stop() — confirm against the PySpark docs.
ssc.stop()

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus so that stopwords.words() below can read it
nltk.download('stopwords')
# English stop words held as a set (not referenced by the cells visible here;
# presumably intended for the TF-IDF step — TODO confirm)
stop_words = set(stopwords.words('english'))

In [None]:
# pip install nltk

- Take the raw Producer Data and save it to files on disk

- Get the references and the number of occurrences (suggested: 60-second windows sliding every 5 seconds). Note: you have to let it run for at least 60 seconds so the first window can fill; after that the window slides every 5 seconds. The line count for each window then fluctuates between 19 and 22 as the oldest posts are dropped and the new posts are aggregated.

##### !!!  Need to add the "Get top 10 important words in window using TF-IDF"  !!!

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType, StructField, StringType
import json
import re
import time

# Create a SparkSession and a StreamingContext with a batch interval of 5 (seconds)
spark_conf = SparkConf().setAppName("reddit")
ss1 = SparkSession.builder.config(conf=spark_conf).getOrCreate()
ssc = StreamingContext(ss1.sparkContext, 5)

# Create a DStream of text lines read from the socket at localhost:9998
# (presumably the port the producer writes to — verify against the producer)
lines = ssc.socketTextStream("localhost", 9998)

# Apply windowing to the DStream: window length 60, slide interval 5
# (both must line up with the 5-second batch interval above)
windowed_lines = lines.window(60, 5)

# Parse the JSON data: one JSON document per received line
# NOTE(review): json.loads raises on a malformed line and nothing here catches it
comments = windowed_lines.map(lambda json_data: json.loads(json_data))
# Print a sample of each window's parsed records for debugging
comments.pprint()

# Define the schema for the DataFrame built in process_rdd.
# Every column is a nullable string; u_refs and p_refs are filled in by process_rdd.
schema = StructType([
    StructField("comment", StringType(), True),
    StructField("prev_comment", StringType(), True),
    StructField("post", StringType(), True),
    StructField("author", StringType(), True),
    StructField("created_utc", StringType(), True),
    StructField("link_url", StringType(), True),
    StructField("u_refs", StringType(), True),
    StructField("p_refs", StringType(), True),
])

# Root directory under which process_rdd writes one JSON output folder per batch
base_path = "./data/raw/reddit_test6"

def extract_references(comment):
    """Extract Reddit user and subreddit references from a piece of text.

    Args:
        comment: Text to scan (a comment body or a URL). ``None`` or an empty
            string is treated as text containing no references.

    Returns:
        A 2-tuple ``(user_refs, subreddit_refs)`` of comma-joined strings, e.g.
        ``("/u/alice,/u/bob", "/r/python")``. Each element is ``""`` when no
        reference of that kind is found. Matches keep their order of
        appearance and duplicates are not removed.
    """
    # Nullable fields reach this function as None; re.findall would raise on them
    if not comment:
        return '', ''

    # Reddit usernames may contain hyphens, which \w alone would cut off
    user_references = re.findall(r'/u/[\w-]+', comment)
    # Subreddit names consist of letters, digits and underscores only
    subreddit_references = re.findall(r'/r/\w+', comment)
    return ','.join(user_references), ','.join(subreddit_references)

# DataFrame cached for the most recent window; released once the next window replaces it
_cached_window_df = None


def _with_references(record):
    """Return a copy of *record* with the ``u_refs`` and ``p_refs`` fields added."""
    # Extract once per record; a missing or null link_url yields empty strings
    u_refs, p_refs = extract_references(record.get('link_url') or '')
    return {**record, "u_refs": u_refs, "p_refs": p_refs}


# Convert each RDD in the DStream to a DataFrame
def process_rdd(time, rdd):
    """Turn one windowed RDD into a DataFrame, save it to disk and report its size.

    Registered with ``foreachRDD``. Empty RDDs are skipped. For a non-empty
    RDD the function:

    * adds ``u_refs`` / ``p_refs`` (extracted from ``link_url``) to each record,
    * builds a DataFrame with the module-level ``schema``,
    * registers it as the temporary view ``raw``,
    * writes it as JSON under ``base_path`` in a folder named after ``time``,
    * shows the DataFrame and prints its row count.

    Args:
        time: Batch time supplied by ``foreachRDD``; must provide ``strftime``.
        rdd: RDD of dicts, one per parsed record of the current window.
    """
    global _cached_window_df

    if rdd.isEmpty():
        return

    # NOTE(review): references are extracted from 'link_url', not from the
    # 'comment' text — confirm this is the intended source field.
    extracted_data = rdd.map(_with_references)
    df = ss1.createDataFrame(extracted_data, schema)

    # Register the DataFrame as a temporary table named "raw"
    df.createOrReplaceTempView("raw")

    # Persist the DataFrame so the write, show and count below share one computation
    df.persist()

    # The "raw" view now points at the new window, so the previous window's
    # cache can be released; otherwise one cached DataFrame per batch piles up.
    if _cached_window_df is not None:
        _cached_window_df.unpersist()
    _cached_window_df = df

    # Write the DataFrame to disk
    output_path = f"{base_path}/{time.strftime('%Y%m%d%H%M%S')}"
    df.write.json(output_path)

    # Show the DataFrame for debugging
    df.show()

    # Count the occurrences in the window
    count = df.count()
    print(f"Number of occurrences in the window: {count}")

    # TODO: save the count to a separate file (disabled in the original code)
    #count_path = f"{base_path}_counts/{time.strftime('%Y%m%d%H%M%S')}_count.txt"
    #with open(count_path, 'w') as f:
     #   f.write(str(count))

# Run process_rdd on every RDD of the windowed, parsed stream
comments.foreachRDD(process_rdd)

# Start the streaming context; must come after all DStream operations are registered
ssc.start()

 ### Test Code