# In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import explode, split, col
from pyspark.ml.feature import HashingTF, IDF
import json
import re

# Socket endpoint the streaming job reads from.
host = "127.0.0.1"
port = 9998

# Spark context running on the "local[2]" master; log output is limited to errors.
sc = SparkContext(master="local[2]", appName="DisplayLinesV2")
sc.setLogLevel("ERROR")

# Streaming context with a 60-second batch interval, plus a SQL session
# built on the same Spark context.
ssc = StreamingContext(sc, batchDuration=60)
spark = SparkSession(sc)


def _count_references(tokens, pattern, view_name, output_path):
    """Count tokens matching a SQL LIKE pattern, register them as a temp view and append them as JSON.

    Args:
        tokens: DataFrame with a single string column named "word".
        pattern: SQL LIKE pattern selecting the tokens of interest.
        view_name: Name of the temporary view to (re)create for the counts.
        output_path: Directory the counts are appended to as JSON.

    Returns:
        DataFrame with columns "word" and "count".
    """
    counts = tokens.filter(col("word").like(pattern)).groupBy("word").count()
    counts.createOrReplaceTempView(view_name)
    counts.write.json(output_path, mode="append")
    return counts


def _top_tfidf_words(df, limit=10, num_features=20):
    """Return up to ``limit`` (word, tfidf) pairs with the highest TF-IDF score.

    Each row of ``df`` is treated as one document; its "text" column is split
    on single spaces. A word's score is the largest TF-IDF value it reaches in
    any document of the batch.

    Args:
        df: DataFrame with a string column "text".
        limit: Maximum number of (word, score) pairs to return.
        num_features: Number of hash buckets used by HashingTF. Words that
            hash to the same bucket share a score, so small values are coarse.

    Returns:
        List of (word, score) tuples sorted by descending score; empty when
        the batch contains no usable words.
    """
    # HashingTF expects one array of terms per document, so the text is split
    # but NOT exploded here (an exploded string column is rejected).
    documents = df.filter(col("text").isNotNull()).select(
        split(col("text"), " ").alias("words")
    )

    # Distinct non-empty words of the batch, gathered on the driver.
    vocabulary = [
        row.word
        for row in documents.select(explode(col("words")).alias("word"))
        .distinct()
        .collect()
        if row.word
    ]
    if not vocabulary:
        # Nothing to score; also avoids fitting IDF on an empty corpus.
        return []

    hashing_tf = HashingTF(
        inputCol="words", outputCol="rawFeatures", numFeatures=num_features
    )
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    tf = hashing_tf.transform(documents)
    tfidf_data = idf.fit(tf).transform(tf)

    # Resolve each word's hash bucket on the driver (HashingTF.indexOf needs
    # Spark >= 3.0) so executors only need this plain dict, not the JVM object.
    bucket_of = {word: hashing_tf.indexOf(word) for word in vocabulary}

    def score_terms(row):
        # Look up the TF-IDF weight of every distinct word in this document.
        features = row.features
        return [
            (word, float(features[bucket_of[word]]))
            for word in set(row.words)
            if word in bucket_of
        ]

    return (
        tfidf_data.select("words", "features")
        .rdd.flatMap(score_terms)
        .reduceByKey(max)  # keep each word's best score across documents
        .takeOrdered(limit, key=lambda pair: -pair[1])
    )


def process_rdd(rdd):
    """Process one micro-batch of JSON-encoded records.

    Steps performed for a non-empty batch:
    1. Parse the records into a DataFrame, expose it as the temp view "raw"
       and append it as JSON to /data/raw/reddit_ask.
    2. Count references to users ("/u/..."), posts ("/r/...") and URLs
       ("http...") found in the "text" column; each count is registered as a
       temp view and appended as JSON.
    3. Compute TF-IDF over the "text" column and store the ten highest-scoring
       words in the temp view "top_words" (also appended as JSON).
    4. Combine the user and post reference counts into the temp view "metrics".

    Args:
        rdd: RDD of JSON strings; each record is expected to provide a "text"
            field (the code reads ``col("text")``).

    Returns:
        None. Empty batches are skipped without side effects.
    """
    if rdd.isEmpty():
        return

    # NOTE: the whole batch is materialised on the driver before the
    # DataFrame is built, so batches must fit in driver memory.
    records = rdd.map(json.loads).collect()
    if not records:
        return

    df = spark.createDataFrame(records)
    df.createOrReplaceTempView("raw")
    df.write.json("/data/raw/reddit_ask", mode="append")

    # One row per space-separated token of the text.
    tokens = df.select(explode(split(col("text"), " ")).alias("word"))

    user_refs = _count_references(
        tokens, "/u/%", "user_references", "/path/to/output/user_references"
    )
    post_refs = _count_references(
        tokens, "/r/%", "post_references", "/path/to/output/post_references"
    )
    _count_references(
        tokens, "http%", "url_references", "/path/to/output/url_references"
    )

    # An explicit schema keeps createDataFrame working when no words were found.
    top_words = _top_tfidf_words(df)
    top_words_df = spark.createDataFrame(top_words, "word string, tfidf double")
    top_words_df.createOrReplaceTempView("top_words")
    top_words_df.write.json("/path/to/output/top_words", mode="append")

    # Store metrics in a temporary table. The original statement was cut off
    # after the join and its result was discarded; the counts are renamed so
    # the joined columns are unambiguous.
    # NOTE(review): the view name "metrics" was chosen here - confirm.
    metrics = user_refs.withColumnRenamed("count", "user_count").join(
        post_refs.withColumnRenamed("count", "post_count"),
        on="word",
        how="outer",
    )
    metrics.createOrReplaceTempView("metrics")

# Announce start-up, then subscribe to the text socket.
print("Waiting for messages...")
lines = ssc.socketTextStream(hostname=host, port=port)

# Hand every batch RDD of the stream to process_rdd.
lines.foreachRDD(process_rdd)

# Launch the streaming job and block until it terminates.
ssc.start()
ssc.awaitTermination()
