In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from reddit.reddit_streaming import *
import datetime as dt
import numpy as np
import pandas as pd

creds, config = read_files()
subreddit = config["subreddit"]
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
aws_client = creds["aws-client"]
aws_secret = creds["aws-secret"]                    

print("imported modules")

imported modules


In [8]:
spark = SparkSession.builder.appName("reddit_" + subreddit + "_read_data") \
                    .master("spark://{}:7077".format(spark_host)) \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,org.apache.hadoop:hadoop-common:3.3.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-client:3.3.1,io.delta:delta-core_2.12:1.2.1") \
                    .config("spark.hadoop.fs.s3a.access.key", aws_client) \
                    .config("spark.hadoop.fs.s3a.secret.key", aws_secret) \
                    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
                    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider') \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .enableHiveSupport() \
                    .getOrCreate()

# .config("spark.eventLog.enabled", "true") \
# .config("spark.eventLog.dir", "file:///opt/workspace/events") \
print("created spark.")

In [9]:
try:
    test = spark.read.format("delta").option("header", True).load("s3a://reddit-stevenhurwitt/" + subreddit)
    test_pandas = test.toPandas()

except KeyboardInterrupt:
    print("loading data took too long... cancelled.")

                                                                                

In [10]:
test_pandas

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,...,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
0,,technology,,t2_hqstdwu0,False,,0,False,Elon Musk is using a 'dog ate the homework' ex...,r/technology,...,,/r/technology/comments/uqwpba/elon_musk_is_usi...,all_ads,False,https://markets.businessinsider.com/news/stock...,11994694,1.652710e+09,0,,False
1,,technology,,t2_7ccf,False,,0,False,Royal Mail drone fleet takes to the air for re...,r/technology,...,,/r/technology/comments/uo0r18/royal_mail_drone...,all_ads,False,https://www.edinburghnews.scotsman.com/news/pe...,11965116,1.652361e+09,0,,False
2,,technology,,t2_7ccf,False,,0,False,These Nanobots Can Swim Around a Wound and Kil...,r/technology,...,,/r/technology/comments/uow5as/these_nanobots_c...,all_ads,False,https://www.wired.com/story/these-nanobots-can...,11973872,1.652461e+09,0,,False
3,,technology,,t2_6utha,False,,0,False,Did Twitch Violate Texas’ Social Media Law By ...,r/technology,...,,/r/technology/comments/ur2490/did_twitch_viola...,all_ads,False,https://www.techdirt.com/2022/05/16/did-twitch...,11996188,1.652724e+09,0,,False
4,,technology,,t2_lz41dbnl,False,,0,False,"real. fucking. conservative. civil liberties, ...",r/technology,...,,/r/technology/comments/uo8w92/real_fucking_con...,all_ads,False,https://arstechnica.com/tech-policy/2022/05/te...,11967337,1.652383e+09,0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,,technology,,t2_83wk7s3r,False,,0,False,Novel Diamond Semiconductors Operate at Highes...,r/technology,...,,/r/technology/comments/uqwzkm/novel_diamond_se...,all_ads,False,https://www.ad-na.com/magazine_en/archives/44,11994761,1.652711e+09,0,,False
74,,technology,,t2_1argepoh,False,,0,False,Apple iPod creator warns the metaverse will en...,r/technology,...,,/r/technology/comments/uo6vg8/apple_ipod_creat...,all_ads,False,https://www.bbc.com/news/business-61423268,11966776,1.652378e+09,0,,False
75,,technology,,t2_a5jwjm1s,False,,0,False,A colony of blue-green algae can power a compu...,r/technology,...,,/r/technology/comments/uo75rs/a_colony_of_blue...,all_ads,False,https://interestingengineering.com/blue-green-...,11966852,1.652379e+09,0,,False
76,,technology,,t2_bf38q9nm,False,,0,False,"Musk, Twitter CEO spar over bot accounts",r/technology,...,,/r/technology/comments/ur2j9k/musk_twitter_ceo...,all_ads,False,https://thehill.com/policy/technology/,11996262,1.652726e+09,0,,False


In [11]:
np.max([dt.datetime.fromtimestamp(t) for t in test_pandas["created_utc"]])

datetime.datetime(2022, 5, 16, 19, 5, 36)

In [5]:
spark.stop()