## SparkStreaming Hackathon
### Course: Real-time Data Analysis
### Authors: Ruben Tak, Nils Jennissen, David Landeo
This task involves setting up a data streaming pipeline to extract and process posts and comments from Reddit. The data will be structured and sent through a socket, then received and processed by another process. References to users, posts, and external sites will be extracted and counted, and the top 10 important words will be identified using TF-IDF. Optional features include sentiment analysis, additional metrics, saving results to a database, creating a Jupyter Notebook dashboard, and visualizing the results on a web page. The deliverables include Python code, instructions, output data files, and optional Docker setup.

In [1]:
# pip install praw

In [None]:
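# This script is the server: it binds and listens on port 9999 itself.
# Start it first, then connect a client (e.g. `nc 127.0.0.1 9999` or the Spark
# consumer). Do not run `nc -lk 9999` beforehand; it would occupy the port.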
import socket
import json
import praw
import logging
logging.basicConfig(filename='stream_json_error.log', level=logging.ERROR)
from credentials import CLIENT_ID, CLIENT_SECRET

# User-agent string handed to praw.Reddit when the API client is built.
USER_AGENT = 'MyBot/0.0.1'

# Name of the subreddit whose comment stream is forwarded.
subred_name = "reddit"

# Local address and port the streaming socket is bound to.
host, port = "127.0.0.1", 9999

def create_socket(host, port):
    """
    Create a listening socket bound to the specified host and port.

    SO_REUSEADDR is set before binding so the script can be restarted
    without waiting for a previous socket on the same address to time out.

    Args:
        host: Address to bind to, e.g. "127.0.0.1".
        port: Port number to bind to.

    Returns:
        The bound, listening socket. The caller is responsible for closing
        it (it can be used as a context manager).

    Raises:
        OSError: If the address cannot be bound or listened on. The socket
            is closed before the exception propagates.
    """
    s = socket.socket()
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((host, port))
        s.listen()
    except Exception:
        # Do not leak the file descriptor when setup fails half-way.
        s.close()
        raise
    # Announce only once the socket is really accepting connections.
    print(f"Listening on port: {port}")
    return s

def stream_json(reddit, subreddit, socket):
    """
    Stream comments from the specified subreddit and send them through the socket.

    Waits for a single client to connect, then writes one JSON object per
    comment, each terminated by a newline. Every object has the keys
    "comment", "prev_comment", "post", "post_date" and "comment_date".

    Args:
        reddit: The Reddit API client. Kept for interface compatibility; the
            parent comment is obtained from the comment itself.
        subreddit: Object whose ``stream.comments()`` yields the comments.
        socket: A listening socket; exactly one connection is accepted on it.

    Raises:
        OSError: If accepting the connection or sending data fails (for
            example when the client disconnects). The accepted connection
            is closed before the exception propagates.
    """
    # Accept the connection once
    conn, _ = socket.accept()
    # The context manager closes the connection even if streaming aborts.
    with conn:
        for comment in subreddit.stream.comments():
            try:
                post = comment.submission
                if comment.is_root:
                    # A top-level comment's parent is the post itself, not a
                    # comment; looking its id up as a comment id would fail
                    # or fetch an unrelated comment.
                    prev_comment = ""
                else:
                    prev_comment = comment.parent().body
                my_object = {
                    "comment": comment.body,
                    "prev_comment": prev_comment,
                    "post": post.selftext,
                    "post_date": post.created_utc,
                    "comment_date": comment.created_utc,
                }
            except praw.exceptions.PRAWException as ex:
                # Skip comments whose data cannot be retrieved.
                logging.error("Error while streaming comments: %s", ex)
                continue
            # Send data with a newline character; sendall guarantees the
            # whole line is written, whereas send may transmit only a part.
            conn.sendall((json.dumps(my_object) + '\n').encode('utf-8'))
            print(f'Sent data: {my_object}')

def main():
    """
    Build the Reddit client, open the listening socket and start streaming.

    The socket is closed automatically when streaming ends or fails.
    """
    # Reddit API client configured from the credentials module.
    api = praw.Reddit(
        user_agent=USER_AGENT,
        client_secret=CLIENT_SECRET,
        client_id=CLIENT_ID,
    )
    target = api.subreddit(subred_name)

    # Listening socket; leaving the with-block closes it.
    listener = create_socket(host, port)
    with listener:
        # Forward the subreddit's comments to the connected client.
        stream_json(api, target, listener)

# Start streaming only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Listening on port: 9999
