### Receive and Save Raw Data

This part shows how to receive Reddit JSON content through a socket and save the raw data to disk using Spark DStreams.

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType, StructField, StringType
import json
import time

# Create a SparkSession and StreamingContext
spark_conf = SparkConf().setAppName("reddit")
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
ssc = StreamingContext(spark.sparkContext, 5)  # 5 seconds batch interval


def _parse_json_line(json_data):
    """Parse one line received from the socket.

    Returns a one-element list holding the decoded JSON value, or an empty
    list when the line is not valid JSON (blank line, truncated message, ...).
    Returning a list lets the caller use ``flatMap`` so that unparseable lines
    are skipped instead of raising and failing the whole batch.
    """
    try:
        return [json.loads(json_data)]
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return []


# Create a DStream of decoded comments; lines that are not valid JSON are dropped
lines = ssc.socketTextStream("localhost", 9998)
comments = lines.flatMap(_parse_json_line)

# DataFrame schema: four nullable string columns, in this order
schema = (
    StructType()
    .add("title", StringType(), True)
    .add("date", StringType(), True)
    .add("author", StringType(), True)
    .add("subreddit", StringType(), True)
)

# Directory under which each batch of raw data is saved
base_path = "./data/raw/reddit_sub"

# Convert each RDD in the DStream to a DataFrame
def process_rdd(time, rdd):
    """Save one batch of the DStream as JSON and expose it as the ``comments`` view.

    Args:
        time: Batch time supplied by ``foreachRDD``. It is only used through
            ``time.strftime(...)`` to name the output directory. Note that the
            parameter shadows the ``time`` module inside this function.
        rdd: RDD holding the records of the batch; empty batches are ignored.

    Side effects:
        * Replaces the ``comments`` temp view with the batch DataFrame.
        * Writes the batch as JSON to ``{base_path}/<YYYYmmddHHMMSS>``.
        * Prints the first rows of the batch with ``df.show()``.
    """
    if rdd.isEmpty():
        return

    df = spark.createDataFrame(rdd, schema)
    df.createOrReplaceTempView("comments")
    # Cache the batch so the write, the show() and later queries on the
    # "comments" view reuse the same materialized data.
    df.persist()

    # Only the newest batch is reachable through the view, so remember it and
    # release the previous one; otherwise every batch stays cached forever.
    stale_df = getattr(process_rdd, "_cached_df", None)
    process_rdd._cached_df = df

    try:
        output_path = f"{base_path}/{time.strftime('%Y%m%d%H%M%S')}"
        df.write.json(output_path)
        df.show()
    finally:
        if stale_df is not None:
            stale_df.unpersist()

# Register process_rdd as the output operation: it is called once per batch
# with the batch time and the batch RDD. Registration is done before
# ssc.start(), since Spark Streaming does not accept new computations on a
# context that is already running.
comments.foreachRDD(process_rdd)

# Start the streaming context
ssc.start()
# No ssc.awaitTermination() here to make the cell non-blocking and to use other cells in parallel.


### Process and Analyze Data

This part shows how to use Spark Structured Streaming to load the raw JSON data from disk, count posts per author in 60-second windows, save the result to disk, and join the current batch of comments with the historical data.

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql import functions as F

# Get the SparkSession for the "reddit" application (getOrCreate reuses an
# already running session if there is one)
spark_conf = SparkConf()
spark_conf.setAppName("reddit")
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Glob for the JSON files located one directory level below reddit_v5
# NOTE(review): the DStream cell saves raw data under ./data/raw/reddit_sub;
# confirm that reddit_v5 is the intended input directory.
input_path = "./data/raw/reddit_v5/*/*.json"

# Input schema: every column is a nullable string
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("title", "date", "author", "subreddit")
])

# Streaming DataFrame over the JSON files matched by input_path; streaming
# file sources need the schema to be given explicitly
streaming_df = (
    spark.readStream
    .schema(schema)
    .format("json")
    .load(input_path)
)

# For each author and 60-second event-time window, compute the time of the
# latest post and the number of posts. The "date" string is parsed into a
# timestamp first; the watermark tolerates events up to 5 seconds late.
transformed_df = (
    streaming_df
    .withColumn("created_utc_ts", F.to_timestamp(F.col("date")))
    .withWatermark("created_utc_ts", "5 seconds")
    .groupBy(F.col("author"), F.window(F.col("created_utc_ts"), "60 seconds"))
    .agg(
        F.max("created_utc_ts").alias("last_post_time"),
        F.count("title").alias("post_count"),
    )
)

# Save output to disk
output_path = "./data/processed/reddit_v1"
checkpt_path = "./metadata/processed/reddit_v1"

# start() returns immediately with a StreamingQuery. Keep the handle so the
# query can be monitored (processed_query.status, processed_query.exception())
# and stopped (processed_query.stop()) from other cells.
# Append mode emits each author/window row once, after the watermark has
# passed the end of the window.
processed_query = (
    transformed_df.writeStream
    .format("json")
    .option("checkpointLocation", checkpt_path)
    .option("path", output_path)
    .outputMode("append")
    .start()
)

# For testing, output to console (uncomment to use)
# query = transformed_df.writeStream \
#     .format("console") \
#     .start()

# Mixing static table with historical content and dynamic table with content from current window
# A dedicated name is used for the glob: process_rdd reads the global
# base_path each time a batch arrives, so rebinding base_path here would make
# the raw-data writer save its batches under this glob pattern.
historical_path = "./data/raw/reddit_v5/*/*.json"
historical = spark.read.json(historical_path)
historical.createOrReplaceTempView('historical')
historical.show()

# "comments" is the temp view that process_rdd replaces on every non-empty
# batch, so this query needs at least one batch to have been received.
spark.sql("""
    SELECT *
    FROM comments as ct
    LEFT JOIN historical as ht on ht.author = ct.author
    """).show()
