In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from reddit.reddit_streaming import *

# read_files() comes from the project's reddit_streaming module and yields a
# (creds, config) pair; the individual settings are pulled out by key.
creds, config = read_files()
subreddit = config["subreddit"]
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
aws_client = creds["aws-client"]
aws_secret = creds["aws-secret"]

# Build (or reuse) the session against the standalone master on port 7077.
spark = (
    SparkSession.builder
    .appName("reddit_" + subreddit + "_read_data")
    .master(f"spark://{spark_host}:7077")
    # Event logging, written to a local directory.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/workspace/events")
    # Extra jars resolved at startup: Kafka source, Hadoop/S3A, Delta Lake.
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,"
        "org.apache.hadoop:hadoop-common:3.3.1,"
        "org.apache.hadoop:hadoop-aws:3.3.1,"
        "org.apache.hadoop:hadoop-client:3.3.1,"
        "io.delta:delta-core_2.12:1.2.1",
    )
    # S3A filesystem access using the key pair read from the credentials file.
    .config("spark.hadoop.fs.s3a.access.key", aws_client)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .enableHiveSupport()
    .getOrCreate()
)

print("imported modules")

In [3]:
# Load the subreddit's Delta table from S3 and convert it to a pandas DataFrame.
#
# Both result names are reset up front: the KeyboardInterrupt handler below
# swallows the interrupt, so without the reset an interrupted run would leave
# `test` / `test_pandas` undefined on a fresh kernel (NameError in later cells)
# or silently holding data from a previous run of this cell.
test = None
test_pandas = None

try:
    # NOTE(review): "header" is a CSV reader option; presumably it has no
    # effect on a Delta read -- confirm and drop it.
    test = spark.read.format("delta").option("header", True).load("s3a://reddit-stevenhurwitt/" + subreddit)
    # toPandas() collects the whole table onto the driver; this is the slow
    # step the interrupt handler is meant for.
    test_pandas = test.toPandas()

except KeyboardInterrupt:
    # Manual cancel (kernel interrupt): report it and leave the results as None.
    print("loading data took too long... cancelled.")

22/05/12 22:01:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [4]:
# Show the pandas frame as the cell's result (bare last expression, so the
# notebook renders it as a table rather than printed text).
test_pandas

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,...,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
0,,technology,,t2_7ccf,False,,0,False,Royal Mail drone fleet takes to the air for re...,r/technology,...,,/r/technology/comments/uo0r18/royal_mail_drone...,all_ads,False,https://www.edinburghnews.scotsman.com/news/pe...,11965116,1652361000.0,0,,False
1,,technology,,t2_lz41dbnl,False,,0,False,"real. fucking. conservative. civil liberties, ...",r/technology,...,,/r/technology/comments/uo8w92/real_fucking_con...,all_ads,False,https://arstechnica.com/tech-policy/2022/05/te...,11967337,1652383000.0,0,,False
2,,technology,,t2_5w9r3,False,,0,False,Court lets Texas restrictions on social platfo...,r/technology,...,,/r/technology/comments/unmsdt/court_lets_texas...,all_ads,False,https://www.theverge.com/2022/5/11/23067002/te...,11961514,1652311000.0,0,,False
3,,technology,,t2_1argepoh,False,,0,False,"In a Blow to Free Speech, Texas’ Social Media ...",r/technology,...,,/r/technology/comments/uo7lue/in_a_blow_to_fre...,all_ads,False,https://www.eff.org/deeplinks/2022/05/blow-fre...,11966965,1652380000.0,0,,False
4,,technology,,t2_1argepoh,False,,0,False,Algae-powered computing: scientists create rel...,r/technology,...,,/r/technology/comments/uo6qnu/algaepowered_com...,all_ads,False,https://www.cam.ac.uk/research/news/scientists...,11966736,1652378000.0,0,,False
5,,technology,,t2_4layskvl,False,,0,False,Tesla CEO Elon Musk dismisses hydrogen as tool...,r/technology,...,,/r/technology/comments/uo0fkj/tesla_ceo_elon_m...,all_ads,False,https://www.cnbc.com/2022/05/12/tesla-ceo-elon...,11965022,1652360000.0,0,,False
6,,technology,,t2_c92d05jm,False,,0,False,"Google Revamps Search, Maps Features for Young...",r/technology,...,,/r/technology/comments/uo08gz/google_revamps_s...,all_ads,False,https://www.businessinsider.com/google-revamps...,11964987,1652360000.0,0,,False
7,,technology,,t2_7i1vm0jb,False,,0,False,New Alexa Competitor Will Feature the Voice of...,r/technology,...,,/r/technology/comments/uo1krp/new_alexa_compet...,all_ads,False,https://www.ign.com/articles/alexa-competitor-...,11965320,1652364000.0,0,,False
8,,technology,,t2_3issg5vy,False,,0,False,Astronomers unveil 1st image of black hole at ...,r/technology,...,,/r/technology/comments/uo5kqq/astronomers_unve...,all_ads,False,https://theweek.com/news/1013493/astronomers-u...,11966474,1652374000.0,0,,False
9,,technology,,t2_3issg5vy,False,,0,False,Astronomers unveil 1st image of black hole at ...,r/technology,...,,/r/technology/comments/uo5kqq/astronomers_unve...,all_ads,False,https://theweek.com/news/1013493/astronomers-u...,11966474,1652374000.0,0,,False


In [5]:
# Shut down the Spark session now that the data has been read; later cells
# must not use `spark` unless the session is rebuilt.
spark.stop()