In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from reddit.reddit_streaming import *
import datetime as dt
import pprint
pp = pprint.PrettyPrinter(indent = 1)
%load_ext sparksql_magic

creds, config = read_files()
subreddit_list = config["subreddit"]
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
aws_client = creds["aws-client"]
aws_secret = creds["aws-secret"]

subreddit = subreddit_list[0]

spark = SparkSession.builder.appName("reddit_" + subreddit + "_glue_partition") \
                    .master("spark://{}:7077".format(spark_host)) \
                    .config("spark.scheduler.mode", "FAIR") \
                    .config("spark.scheduler.allocation.file", "file:///opt/workspace/redditStreaming/fairscheduler.xml") \
                    .config("spark.executor.memory", "1024m") \
                    .config("spark.executor.cores", "2") \
                    .config("spark.streaming.concurrentJobs", "4") \
                    .config("spark.local.dir", "/opt/workspace/tmp/driver/{}/".format(subreddit)) \
                    .config("spark.worker.dir", "/opt/workspace/tmp/executor/{}/".format(subreddit)) \
                    .config("spark.sql.debug.maxToStringFields", 1000) \
                    .config("spark.eventLog.enabled", "true") \
                    .config("spark.eventLog.dir", "file:///opt/workspace/events") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,org.apache.hadoop:hadoop-common:3.3.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-client:3.3.1,io.delta:delta-core_2.12:1.2.1") \
                    .config("spark.hadoop.fs.s3a.access.key", aws_client) \
                    .config("spark.hadoop.fs.s3a.secret.key", aws_secret) \
                    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
                    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider') \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
                    .enableHiveSupport() \
                    .getOrCreate()

print("created spark successfully.")


:: loading settings :: url = jar:file:/usr/local/lib/python3.7/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2c68f2f8-e5b6-404e-8ab1-f451c0e2fcc6;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.

created spark successfully.


In [7]:
df = spark.read.format("delta").option("header", True).load("s3a://reddit-stevenhurwitt/" + subreddit)

# df.createOrReplaceTempView("reddit_{}".format(subreddit))
print("created df.")

created df.


In [8]:
df.toPandas()

                                                                                

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,...,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
0,,technology,,t2_dvcsbk6c,False,,0,False,US TikTok user data accessed in China despite ...,r/technology,...,,/r/technology/comments/veshmq/us_tiktok_user_d...,all_ads,False,https://www.washingtonexaminer.com/policy/tech...,12220571,1.655507e+09,0,,False
1,,technology,,t2_dvcsbk6c,False,,0,False,US TikTok user data accessed in China despite ...,r/technology,...,,/r/technology/comments/veshmq/us_tiktok_user_d...,all_ads,False,https://www.washingtonexaminer.com/policy/tech...,12220573,1.655507e+09,0,,False
2,,technology,,t2_88icg8jq,False,,0,False,"Chicago expands and activates quantum network,...",r/technology,...,,/r/technology/comments/vflnlz/chicago_expands_...,all_ads,False,https://news.uchicago.edu/story/chicago-quantu...,12228022,1.655607e+09,0,,False
3,,technology,,t2_88icg8jq,False,,0,False,"Chicago expands and activates quantum network,...",r/technology,...,,/r/technology/comments/vflnlz/chicago_expands_...,all_ads,False,https://news.uchicago.edu/story/chicago-quantu...,12228022,1.655607e+09,0,,False
4,,technology,,t2_88icg8jq,False,,0,False,"Meta is launching an avatar store, and designe...",r/technology,...,,/r/technology/comments/veuc8b/meta_is_launchin...,all_ads,False,https://www.theverge.com/2022/6/17/23173128/me...,12220972,1.655513e+09,0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,,technology,,t2_3ed191a2,False,,0,False,thunderbolt 2 connect issues,r/technology,...,,/r/technology/comments/vfic7u/thunderbolt_2_co...,all_ads,False,/r/Thunderbolt/comments/vfi7jj/thunderbolt_2_c...,12226938,1.655596e+09,0,,False
534,,technology,,t2_f6nbn,False,,0,False,All modern CPUs are vulnerable: dynamic freque...,r/technology,...,,/r/technology/comments/vcusy0/all_modern_cpus_...,all_ads,False,https://www.hertzbleed.com/,12204618,1.655300e+09,0,,False
535,,technology,,t2_heajeq9t,False,,0,False,Looks like Orkut is being resurrected from its...,r/technology,...,,/r/technology/comments/vggqxw/looks_like_orkut...,all_ads,False,http://www.orkut.com/index.html,12235575,1.655714e+09,0,,False
536,,technology,,t2_8ytw6jvm,False,,0,False,"To Build Dark Patterns, I Refuse",r/technology,...,,/r/technology/comments/vee7w1/to_build_dark_pa...,all_ads,False,https://wagslane.dev/posts/dark-patterns/,12217953,1.655475e+09,0,,False


In [9]:
df_clean = df.withColumn("approved_at_utc", col("approved_at_utc").cast("timestamp")) \
        .withColumn("banned_at_utc", col("banned_at_utc").cast("timestamp")) \
        .withColumn("created_utc", col("created_utc").cast("timestamp")) \
        .withColumn("created", col("created").cast("timestamp")) \
        .withColumn("post_date", to_date(col("created_utc"), "MM-dd-yyyy")) \
        .dropDuplicates(subset = ["title"])

In [10]:
df_clean_pandas = df_clean.toPandas()
df_clean_pandas.head()

                                                                                

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,post_date
0,NaT,technology,,t2_gfv52,False,,0,False,"""Traffic Correlation Attacks"" - How oppressive...",r/technology,...,/r/technology/comments/ur3dpc/traffic_correlat...,all_ads,False,https://windscribbles.com/combating,11996702,2022-05-16 19:05:36,1,,False,2022-05-16
1,NaT,technology,,t2_2uwit82z,False,,0,False,13-Inch MacBook Pro With M2 Chip Outperforms B...,r/technology,...,/r/technology/comments/vdp1lo/13inch_macbook_p...,all_ads,False,https://www.macrumors.com/2022/06/16/m2-chip-o...,12211954,2022-06-16 15:49:20,0,,False,2022-06-16
2,NaT,technology,,t2_irlcj,False,,0,False,2023 could be the year Apple switches the iPho...,r/technology,...,/r/technology/comments/uoy6m7/2023_could_be_th...,all_ads,False,https://arstechnica.com/gadgets/2022/05/report...,11974416,2022-05-13 18:33:36,0,,False,2022-05-13
3,NaT,technology,,t2_3dv87cht,False,,0,False,30 year Perovskite solar cell announcement fro...,r/technology,...,/r/technology/comments/vedxd0/30_year_perovski...,all_ads,False,https://techxplore.com/news/2022-06-year-perov...,12217816,2022-06-17 14:02:40,0,,False,2022-06-17
4,NaT,technology,,t2_guf36,False,,0,False,52% Of Global Car Buyers Are Now Interested In...,r/technology,...,/r/technology/comments/vd4xmb/52_of_global_car...,all_ads,False,https://www.19fortyfive.com/2022/06/52-of-glob...,12207039,2022-06-15 21:15:44,0,,False,2022-06-15


22/06/20 15:25:19 WARN HeartbeatReceiver: Removing executor 2 with no recent heartbeats: 2118300 ms exceeds timeout 120000 ms
22/06/20 15:25:19 WARN HeartbeatReceiver: Removing executor 1 with no recent heartbeats: 2121850 ms exceeds timeout 120000 ms
22/06/20 15:25:20 ERROR TaskSchedulerImpl: Lost executor 2 on 172.26.0.6: Executor heartbeat timed out after 2118300 ms
22/06/20 15:25:20 ERROR TaskSchedulerImpl: Lost executor 1 on 172.26.0.7: Executor heartbeat timed out after 2121850 ms
22/06/20 15:25:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_15_38 !
22/06/20 15:25:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_15_12 !
22/06/20 15:25:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_15_8 !
22/06/20 15:25:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_15_30 !
22/06/20 15:25:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_15_19 !
22/06/20 15:25:21 WARN BlockManagerMast

In [13]:
df_clean.filter(col("title").contains("TikTok")).toPandas()

                                                                                

In [7]:
filepath = "file:///opt/workspace/technology/"
df.write.format("delta").partitionBy("post_date").mode("overwrite").option("header", True).save(filepath)
print("wrote delta table to local.")



wrote delta table to local.


                                                                                

In [10]:
test = spark.read.format("delta").option("header", True).load(filepath)
test.toPandas()

                                                                                

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,post_date
0,NaT,technology,,t2_2uwit82z,False,,0,False,Senator Presses Amazon to Disclose Just How Cr...,r/technology,...,/r/technology/comments/vcdqr2/senator_presses_...,all_ads,False,https://gizmodo.com/amazon-ring-camera-audio-r...,12200483,2022-06-14 21:11:28,0,,False,2022-06-14
1,NaT,technology,,t2_2uwit82z,False,,0,False,Ford halts deliveries of electric Mustang Mach...,r/technology,...,/r/technology/comments/vcd10m/ford_halts_deliv...,all_ads,False,https://www.businessinsider.com/ford-mustang-m...,12200136,2022-06-14 20:39:28,0,,False,2022-06-14
2,NaT,technology,,t2_7i1vm0jb,False,,0,False,Amazon kicked a trans employee off the board o...,r/technology,...,/r/technology/comments/vcfivu/amazon_kicked_a_...,all_ads,False,https://www.businessinsider.com/amazon-trans-b...,12200666,2022-06-14 22:38:56,0,,False,2022-06-14
3,NaT,technology,,t2_mr4nwrpm,False,,0,False,"Wickr, Amazon’s encrypted chat app, has a chil...",r/technology,...,/r/technology/comments/vc63ul/wickr_amazons_en...,all_ads,False,https://www.nbcnews.com/tech/tech-news/wickr-a...,12198321,2022-06-14 15:30:08,0,,False,2022-06-14
4,NaT,technology,,t2_1ied4ho,False,,0,False,China’s Chipmaking Power Grows Despite US Effo...,r/technology,...,/r/technology/comments/vc72ia/chinas_chipmakin...,all_ads,False,https://www.bloomberg.com/news/articles/2022-0...,12198605,2022-06-14 16:14:56,0,,False,2022-06-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,NaT,technology,,t2_8ytw6jvm,False,,0,False,"To Build Dark Patterns, I Refuse",r/technology,...,/r/technology/comments/vee7w1/to_build_dark_pa...,all_ads,False,https://wagslane.dev/posts/dark-patterns/,12217893,2022-06-17 14:17:36,0,,False,2022-06-17
247,NaT,technology,,t2_1argepoh,False,,0,False,Apple iPod creator warns the metaverse will en...,r/technology,...,/r/technology/comments/uo6vg8/apple_ipod_creat...,all_ads,False,https://www.bbc.com/news/business-61423268,11966776,2022-05-12 17:53:04,0,,False,2022-05-12
248,NaT,technology,,t2_a5jwjm1s,False,,0,False,A colony of blue-green algae can power a compu...,r/technology,...,/r/technology/comments/uo75rs/a_colony_of_blue...,all_ads,False,https://interestingengineering.com/blue-green-...,11966852,2022-05-12 18:05:52,0,,False,2022-05-12
249,NaT,technology,,t2_bf38q9nm,False,,0,False,"Musk, Twitter CEO spar over bot accounts",r/technology,...,/r/technology/comments/ur2j9k/musk_twitter_ceo...,all_ads,False,https://thehill.com/policy/technology/,11996262,2022-05-16 18:27:12,0,,False,2022-05-16


In [1]:
spark.stop()

NameError: name 'spark' is not defined