## SparkStreaming Hackathon
### Course: Real-time Data Analysis
### Authors: Ruben Tak, Nils Jennissen, David Landeo
This task involves setting up a data streaming pipeline to extract and process posts and comments from Reddit. The data will be structured and sent through a socket, then received and processed by another process. References to users, posts, and external sites will be extracted and counted, and the 10 most important words will be identified using TF-IDF. Optional features include sentiment analysis, additional metrics, saving results to a database, creating a Jupyter Notebook dashboard, and visualizing the results on a web page. The deliverables include Python code, instructions, output data files, and an optional Docker setup.

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import regexp_extract
import json
import time


# Spark session for the "reddit" application, plus a streaming context
# created on top of it with a batch duration of 40.
spark_conf = SparkConf().setAppName("reddit")
ss1 = SparkSession.builder.config(conf=spark_conf).getOrCreate()
ssc = StreamingContext(ss1.sparkContext, 40)

# Text lines read from the socket on localhost:9999; every line is parsed
# as one JSON document and the parsed batches are echoed for inspection.
lines = ssc.socketTextStream("localhost", 9999)
comments = lines.map(json.loads)
comments.pprint()

# Every field of a record is declared as a nullable string.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "comment",
        "parent_comment",
        "post",
        "created_utc",
        "author",
    )
])

# Root directory under which each batch is written by process_rdd.
base_path = "./data/raw/reddit_v5"

def process_rdd(time, rdd):
    """Store one micro-batch, extract references and print its top words.

    Intended to be registered with ``DStream.foreachRDD``. Empty batches are
    skipped entirely.

    Args:
        time: Batch time passed in by ``foreachRDD``. Only ``strftime`` is
            called on it, to build a per-batch directory name. (The name
            shadows the ``time`` module inside this function.)
        rdd: RDD of parsed comment records; converted to a DataFrame using
            the module-level ``schema``.

    Side effects:
        * Registers the temp views ``raw`` (batch as received) and
          ``metrics`` (batch plus reference columns).
        * Writes the raw batch as JSON to ``<base_path>/<stamp>`` and as CSV
          to ``<base_path>/<stamp>.csv``.
        * Writes the batch with reference columns as JSON to
          ``<base_path>_references/<stamp>`` and as CSV to the same path
          with a ``.csv`` suffix.
        * Prints up to ten vocabulary words with the highest IDF weight.
    """
    # Function-scope import: RegexTokenizer is not part of the module-level
    # imports, and the tokenizer is only needed here.
    from pyspark.ml.feature import RegexTokenizer

    if rdd.isEmpty():
        return

    df = ss1.createDataFrame(rdd, schema)
    df.createOrReplaceTempView("raw")
    # NOTE(review): the cached batch is never unpersisted. Releasing it here
    # could break later queries against the "raw"/"metrics" views, so the
    # cache is left in place -- confirm whether that is acceptable.
    df.persist()

    stamp = time.strftime('%Y%m%d%H%M%S')
    output_path = f"{base_path}/{stamp}"
    df.write.json(output_path)
    df.write.csv(output_path + ".csv", header=True)

    # Get references to users, posts, and external sites.
    # The Column API with raw-string patterns is used instead of a SQL
    # string: inside a SQL literal the backslash of "\s" would be consumed
    # by the SQL parser unless it were escaped a second time.
    references_df = df.select(
        "comment",
        "parent_comment",
        "post",
        "author",
        "created_utc",
        regexp_extract("comment", r"/u/([^\s/]+)", 1).alias("user_reference"),
        regexp_extract("comment", r"/r/([^\s/]+)", 1).alias("post_reference"),
        regexp_extract("comment", r"http[s]?://([^\s/]+)", 1).alias("site_reference"),
    )

    # Save processed data to a temporary table and to disk. The directory is
    # a sibling of base_path so that it neither collides with the raw output
    # written above nor ends up inside the raw batch directories.
    references_df.createOrReplaceTempView("metrics")
    references_path = f"{base_path}_references/{stamp}"
    references_df.write.json(references_path)
    references_df.write.csv(references_path + ".csv", header=True)

    # Preprocess comments: lower-case and split on runs of non-word
    # characters. Null comments are dropped first, and rows that yield no
    # tokens are dropped afterwards so the vocabulary cannot be empty.
    tokenizer = RegexTokenizer(
        inputCol="comment", outputCol="words", pattern=r"\W+"
    )
    words_df = tokenizer.transform(
        references_df.where("comment IS NOT NULL").select("comment")
    ).where("size(words) > 0")

    top_10_words = []
    if words_df.take(1):
        # Term counts per comment, then inverse document frequencies.
        vectorizer = CountVectorizer(inputCol="words", outputCol="raw_features")
        vectorizer_model = vectorizer.fit(words_df)
        count_vectorized = vectorizer_model.transform(words_df)

        idf = IDF(inputCol="raw_features", outputCol="features")
        idf_model = idf.fit(count_vectorized)

        # Get top 10 important words: the vocabulary entries with the
        # largest IDF weight (argsort is ascending, so take the tail).
        vocab = vectorizer_model.vocabulary
        top_indices = idf_model.idf.toArray().argsort()[-10:]
        top_10_words = [vocab[idx] for idx in top_indices]

    print("Top 10 important words:")
    print(top_10_words)

# Register process_rdd as the action applied to every batch of parsed
# comments. This has to be set up before the context is started below.
comments.foreachRDD(process_rdd)

# Start the streaming context. No awaitTermination() is called in this cell.
ssc.start()

23/06/21 15:20:49 WARN Utils: Your hostname, Nilss-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.108 instead (on interface en0)
23/06/21 15:20:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/21 15:20:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Preprocess and Save Transformed Data

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, from_unixtime
from pyspark.sql.types import TimestampType
from pyspark.sql import functions as F
from pyspark.sql.window import Window


host = "localhost"
port = 9999

# Create (or reuse) the SparkSession; no StreamingContext is needed here
# because this cell uses Structured Streaming.
spark_conf = SparkConf().setAppName("reddit")
ss1 = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# ---- UPDATE BELOW ACCORDING TO THE LOCATION IN THE FIRST CELL -----
input_path = "./data/raw/reddit_v5/*/*.json"

schema = StructType([
    StructField("comment", StringType(), True),
    StructField("parent_comment", StringType(), True),
    StructField("post", StringType(), True),
    StructField("created_utc", StringType(), True),
    StructField("author", StringType(), True)
])

# Stream the JSON files matching input_path as they appear.
streaming_df = ss1.readStream \
    .format("json") \
    .schema(schema) \
    .option("path", input_path) \
    .load()


# Per author and 60-second event-time window: latest comment timestamp and
# number of comments.
#
# created_utc is read as a string and treated as epoch seconds. It is parsed
# as "double" (not "float"): a 32-bit float has a 24-bit mantissa, so
# present-day epoch values would be rounded to multiples of 128 seconds and
# land in the wrong window. "long" is used for the integral value so it is
# not limited to the 32-bit range.
transformed_df = streaming_df \
    .withColumn('created_utc2', F.col('created_utc').cast("double")) \
    .withColumn('created_utc3', F.col('created_utc2').cast("long")) \
    .withColumn('created_utc_ts', F.from_unixtime(F.col('created_utc3')).cast(TimestampType())) \
    .withWatermark("created_utc_ts", "5 seconds") \
    .groupBy(F.col("author"), F.window(F.col("created_utc_ts"), windowDuration="60 seconds")) \
    .agg({"created_utc_ts":'max', "comment":'count'})

# Save output to disk
# ---- UPDATE BELOW EACH TIME YOU RERUN THIS CELL -----
output_path = "./data/processed/reddit_v1"
checkpt_path = "./metadata/processed/reddit_v1"

# Append mode: a window's row is emitted once the watermark has passed it.
transformed_df.writeStream \
    .format("json") \
    .option("checkpointLocation", checkpt_path) \
    .option("path", output_path) \
    .outputMode("append") \
    .start()

23/06/21 15:21:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x10e294430>

In [3]:
# Start a run of the sink query with the same checkpoint and output
# locations as before, keeping the returned handle this time.
query = (
    transformed_df.writeStream
    .format("json")
    .option("checkpointLocation", checkpt_path)
    .option("path", output_path)
    .outputMode("append")
    .start()
)

# Stop the streaming query
query.stop()

23/06/21 09:25:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/06/21 09:25:25 WARN StreamingQueryManager: Stopping existing streaming query [id=3fb4c12d-0721-4f23-b23e-10e78a952527, runId=d17f5d47-0a56-4317-9aca-d23cb06bc3e5], as a new run is being started.
