### Consumer 

This consumer should receive the Reddit data from the producer and (perform semantic analysis?)

In [8]:
# Stop the streaming context held in `ssc`.
# `ssc` is not defined in this cell: it only exists after one of the consumer
# cells further down has been run in the same kernel.
# NOTE(review): with PySpark's defaults, stop() also shuts down the underlying
# SparkContext — confirm that is the intent here.
ssc.stop()

Define the Consumer

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import json

# Address of the producer's TCP socket.
host = "127.0.0.1"
port = 9999

# Local Spark context with two worker threads; only errors are logged.
sc = SparkContext("local[2]", "DisplayLinesV2")
sc.setLogLevel("ERROR")
# Batch interval is 60 seconds.
# NOTE(review): the previous comment said 10 seconds "for quicker feedback" — confirm which value is intended.
ssc = StreamingContext(sc, 60)

def process_rdd(rdd):
    """Print every JSON record of one micro-batch on the driver.

    Args:
        rdd: RDD whose elements are text lines, each expected to hold one
            JSON document.

    A line that is not valid JSON is reported and skipped, so a single bad
    record no longer aborts the whole batch.
    """
    print("Processing RDD")
    # collect() on an empty RDD returns [], so a separate isEmpty() job is
    # not needed before iterating.
    for record in rdd.collect():
        try:
            print(json.loads(record))
        except json.JSONDecodeError as err:
            print(f"Skipping malformed record {record!r}: {err}")
# Wire up the stream: each batch is handed to process_rdd and also echoed by pprint().
print("Waiting for a messages...")
lines = ssc.socketTextStream(host, port)

lines.foreachRDD(process_rdd)
lines.pprint()

ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Interrupting the cell would otherwise leave the contexts running, and
    # re-running the cell would then fail when it tries to create a new
    # SparkContext.
    ssc.stop(stopSparkContext=True)

NLTK

In [1]:
# pip install nltk

In [3]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus so stopwords.words() can read it.
nltk.download('stopwords')
# English stop words, stored as a set.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Attempt to make it work with stream context

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import json

# Address of the producer's TCP socket.
host = "127.0.0.1"
port = 9999

# Initialize SparkContext and SparkSession.
# Only one SparkContext may be active at a time, so it is created exactly once here.
sc = SparkContext("local[2]", "DisplayLinesV2")
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

# Initialize StreamingContext with a batch interval of 10 seconds
ssc = StreamingContext(sc, 10)

# Define the socket stream and echo the received lines.
# The context is deliberately NOT started here: every output operation has to
# be registered before ssc.start() is called.
lines = ssc.socketTextStream(host, port)
lines.pprint()  # Print received lines
print("Waiting for messages...")

def process_rdd(rdd):
    """Turn one micro-batch of JSON lines into a DataFrame and persist it.

    Args:
        rdd: RDD whose elements are text lines, each holding one JSON object.

    For a non-empty batch the records are parsed, converted to a DataFrame,
    registered as the temporary view ``raw``, displayed, and appended as JSON
    under ``/files``. An empty batch is only logged.
    """
    print("Processing RDD...")
    print("RDD content: ", rdd.collect())
    if rdd.isEmpty():
        return

    # Parse each record as JSON
    records = rdd.map(json.loads)

    # Convert RDD to DataFrame
    df = spark.createDataFrame(records.map(lambda fields: Row(**fields)))

    # Register DataFrame as a temporary table
    df.createOrReplaceTempView("raw")

    # show() prints the table itself and returns None, so it must not be
    # wrapped in print() (that printed a stray "None" after every batch).
    df.show()

    # Save the DataFrame to disk
    df.write.json("/files", mode="append")

# Apply the processing function to each RDD in the DStream
lines.foreachRDD(process_rdd)

# Start the streaming context and wait for termination
ssc.start()
print("Streaming started...")
try:
    ssc.awaitTermination()
finally:
    # Always release the contexts, even when the cell is interrupted, so the
    # cell can be re-run without a stale SparkContext blocking it.
    ssc.stop(stopSparkContext=True)

# Simple stream context

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Address of the producer's TCP socket.
host = "127.0.0.1"
port = 9999

sc = SparkContext("local[2]", "DisplayLines")
ssc = StreamingContext(sc, 10)  # 2nd arg = minibatch size in sec.

# Echo every received line.
lines = ssc.socketTextStream(host, port)
lines.pprint()

ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Release the contexts even when the cell is interrupted, so re-running
    # the cell does not fail on an already-active SparkContext.
    ssc.stop(stopSparkContext=True)

# Working Solution

In [2]:
import codecs
import json
import socket

from pyspark import SparkContext
from pyspark.sql import SparkSession


# Local Spark runtime ("local[*]" = one worker thread per core); only errors are logged.
sc = SparkContext("local[*]", "SocketStreaming")
sc.setLogLevel("ERROR")

# Session used by process_data() to build DataFrames.
spark = (
    SparkSession.builder
    .appName("SocketStreaming")
    .getOrCreate()
)

# Address of the producer's TCP socket.
host = "127.0.0.1"
port = 9999

# Plain TCP client connection to the producer.
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((host, port))
print("Connected to the producer...")

# Scratch list of parsed records, filled and emptied by process_data().
data_list = []


def process_data(data):
    """Parse one JSON message and append it to ``raw.json``.

    Args:
        data: Text of a single JSON document.

    The record is run through a Spark DataFrame, serialised back to JSON and
    appended to ``raw.json`` as one line followed by ",\\n". Any failure is
    reported on stdout and the message is dropped.
    """
    try:
        record = json.loads(data)
        data_list.append(record)

        df = spark.createDataFrame(data_list)
        json_lines = df.toJSON().collect()

        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding.
        with open("raw.json", "a", encoding="utf-8") as output_file:
            output_file.writelines(item + ",\n" for item in json_lines)
    except Exception as e:
        print("Error processing data:", e)
    finally:
        # Always empty the scratch list. Previously a failure left the record
        # in place, so it was re-submitted together with every later message.
        data_list.clear()


# Upper bound on buffered, not-yet-parseable text before it is given up on.
MAX_PENDING_CHARS = 1_000_000


def _pop_json_documents(buffer):
    """Split ``buffer`` into complete JSON documents plus the unparsed tail.

    TCP delivers a byte stream, so one recv() may contain half a message or
    several messages back to back. Complete documents are cut off the front
    of the buffer; whatever cannot be parsed yet is returned as the remainder.

    Returns:
        (documents, remainder): list of JSON texts, and the leftover string.
    """
    decoder = json.JSONDecoder()
    documents = []
    buffer = buffer.lstrip()
    while buffer:
        try:
            _, end = decoder.raw_decode(buffer)
        except json.JSONDecodeError:
            break  # incomplete so far: wait for more bytes
        documents.append(buffer[:end])
        buffer = buffer[end:].lstrip()
    return documents, buffer


# Incremental decoder: a multi-byte UTF-8 character split across two recv()
# calls is reassembled instead of raising UnicodeDecodeError.
utf8_decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
pending = ""

try:
    while True:
        data = s.recv(1024)
        if not data:
            break  # producer closed the connection

        pending += utf8_decoder.decode(data)
        documents, pending = _pop_json_documents(pending)
        for document in documents:
            process_data(document)

        if len(pending) > MAX_PENDING_CHARS:
            # Most likely malformed rather than incomplete; let process_data
            # report it, then drop it so the stream can recover.
            process_data(pending)
            pending = ""

    if pending.strip():
        # Trailing data that never became a full document.
        process_data(pending)
finally:
    # Release the socket and Spark even if the loop is interrupted or fails.
    s.close()
    spark.stop()

Connected to the producer...
{'title': 'People who met your life partners on a dating app, how did the first date go? What was your "this is the one" moment?', 'date': '2024-06-08 17:47:47', 'author': 'alex140728', 'subreddit': 'AskReddit'}
<class 'dict'>
+----------+-------------------+---------+--------------------+
|    author|               date|subreddit|               title|
+----------+-------------------+---------+--------------------+
|alex140728|2024-06-08 17:47:47|AskReddit|People who met yo...|
+----------+-------------------+---------+--------------------+

{'title': 'People who cut Family members out of their life, what was the reason and why did you do it?', 'date': '2024-06-08 17:47:54', 'author': 'Yo-KaiWatchFan2102', 'subreddit': 'AskReddit'}
<class 'dict'>
+------------------+-------------------+---------+--------------------+
|            author|               date|subreddit|               title|
+------------------+-------------------+---------+--------------------