# Read table

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from reddit.reddit_streaming import *
import datetime as dt
import pprint
pp = pprint.PrettyPrinter(indent = 1)
%load_ext sparksql_magic

# Load credentials and runtime settings. `read_files` is not defined in this
# notebook; presumably it is provided by the `reddit.reddit_streaming` star
# import -- verify against that module.
creds, config = read_files()
subreddit = config["subreddit"]
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
aws_client = creds["aws-client"]
aws_secret = creds["aws-secret"]

# Maven coordinates handed to `spark.jars.packages`; joined into the single
# comma-separated string that Spark expects.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0",
    "org.apache.hadoop:hadoop-common:3.3.1",
    "org.apache.hadoop:hadoop-aws:3.3.1",
    "org.apache.hadoop:hadoop-client:3.3.1",
    "io.delta:delta-core_2.12:1.2.1",
])

try:
    # Build (or reuse) a session against the standalone master on port 7077,
    # with S3A access keys taken from the credentials file and the Delta Lake
    # SQL extension / catalog enabled.
    spark = (
        SparkSession.builder.appName("reddit_" + subreddit + "_read_data")
        .master("spark://{}:7077".format(spark_host))
        .config("spark.sql.debug.maxToStringFields", 1000)
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "file:///opt/workspace/events")
        .config("spark.jars.packages", spark_packages)
        .config("spark.hadoop.fs.s3a.access.key", aws_client)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .enableHiveSupport()
        .getOrCreate()
    )

    print("imported modules")

except Exception as e:
    # Show the error, then re-raise: without a session `spark` stays undefined
    # and every later cell would fail with an unrelated NameError instead of
    # pointing at the real cause.
    print(e)
    raise

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5d8cfe07-fdd6-462a-916c-ed93d35006d0;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.

imported modules


In [4]:
try:
    # Read the Delta table stored under the configured subreddit's S3 prefix.
    test = (
        spark.read.format("delta")
        .option("header", True)
        .load("s3a://reddit-stevenhurwitt/" + subreddit)
    )

    # Cast each time column to Spark's timestamp type, one column at a time.
    for ts_col in ("approved_at_utc", "banned_at_utc", "created_utc", "created"):
        test = test.withColumn(ts_col, col(ts_col).cast("timestamp"))

    # Register the frame for the SQL cells below.
    # NOTE(review): the view name is fixed to "reddit_technology" even though
    # the S3 path follows `subreddit` -- confirm this is intended.
    test.createOrReplaceTempView("reddit_technology")

except KeyboardInterrupt:
    # A manual kernel interrupt during the load is reported rather than raised.
    print("loading data took too long... cancelled.")

In [5]:
%%sparksql

-- Preview all columns of the reddit_technology temp view.
SELECT *
FROM reddit_technology

[Stage 11:>                                                         (0 + 1) / 1]

only showing top 20 row(s)


                                                                                

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98
approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,thumbnail_height,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,ups,total_awards_received,thumbnail_width,author_flair_template_id,is_original_content,secure_media,is_reddit_media_domain,is_meta,category,link_flair_text,can_mod_post,score,approved_by,is_created_from_ads_ui,author_premium,thumbnail,edited,author_flair_css_class,post_hint,content_categories,is_self,subreddit_type,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,url_overridden_by_dest,view_count,archived,no_follow,is_crosspostable,pinned,over_18,media_only,link_flair_template_id,can_gild,spoiler,locked,author_flair_text,visited,removed_by,mod_note,distinguished,subreddit_id,author_is_blocked,mod_reason_by,num_reports,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
,technology,,t2_hqstdwu0,False,,0,False,"Elon Musk is using a 'dog ate the homework' excuse to potentially back out of buying Twitter and there is now a less than 50% chance the deal gets done, Wedbush says",r/technology,False,6,general,0,70,,True,t3_uqwpba,False,dark,0.5,,0,0,140,,False,,False,False,,Social Media,False,0,,False,False,https://b.thumbs.redditmedia.com/11MchXKozugExPgcEel-Y1J5aH_9bIfdUuT4AD8J-9Y.jpg,False,,link,,False,public,2022-05-16 14:02:40,text,6,,,text,markets.businessinsider.com,False,,,,,https://markets.businessinsider.com/news/stocks/elon-musk-twitter-buyout-deal-bots-tesla-financing-wedbush-2022-5?utm_source=feedly&amp;utm_medium=webfeeds,,False,True,True,False,False,False,7d4d8376-a816-11e9-a92d-0e6b9fa95170,True,False,False,,False,,,,t5_2qh16,False,,,,,uqwpba,True,,daisyshortcakes,,0,True,all_ads,False,False,,/r/technology/comments/uqwpba/elon_musk_is_using_a_dog_ate_the_homework_excuse/,all_ads,False,https://markets.businessinsider.com/news/stocks/elon-musk-twitter-buyout-deal-bots-tesla-financing-wedbush-2022-5?utm_source=feedly&amp;utm_medium=webfeeds,11994694,2022-05-16 14:02:40,0,,False
,technology,,t2_7ccf,False,,0,False,Royal Mail drone fleet takes to the air for remote Scottish island deliveries - A fleet of hi-tech drones will be deployed to the skies over the Scotland to ensure the mail gets to islanders on time.,r/technology,False,6,general,0,93,,True,t3_uo0r18,False,dark,1.0,,3,0,140,,False,,False,False,,Robotics/Automation,False,3,,False,True,https://b.thumbs.redditmedia.com/XNFuHinae5U7M9HgZgzG-t2iIgL7Y660dNB6jA7Rztc.jpg,False,,link,,False,public,2022-05-12 13:11:28,text,6,,,text,edinburghnews.scotsman.com,False,,,,,https://www.edinburghnews.scotsman.com/news/people/royal-mail-drone-fleet-takes-to-the-air-for-remote-scottish-island-deliveries-3689487,,False,False,True,False,False,False,776c82c2-a816-11e9-8179-0ec6fe24d256,True,False,False,,False,,,,t5_2qh16,False,,,,,uo0r18,True,,speckz,,0,True,all_ads,False,False,,/r/technology/comments/uo0r18/royal_mail_drone_fleet_takes_to_the_air_for/,all_ads,False,https://www.edinburghnews.scotsman.com/news/people/royal-mail-drone-fleet-takes-to-the-air-for-remote-scottish-island-deliveries-3689487,11965116,2022-05-12 13:11:28,0,,False
,technology,,t2_8tkwx82v,False,,0,False,"Elon Musks Slams Twitter’s ‘Bias Against Half the Country,’ Alleged Inaction on Death Threats to Conservative User",r/technology,False,6,general,0,93,,True,t3_vc7ed8,False,dark,1.0,,1,0,140,,False,,False,False,,Social Media,False,1,,False,False,https://b.thumbs.redditmedia.com/gjUM5WdposEv-tQpJwRLlRHhf2OHF8wixswniNXErYM.jpg,False,,link,,False,public,2022-06-14 16:27:44,text,6,,,text,theepochtimes.com,False,,,,,https://www.theepochtimes.com/elon-musks-slams-twitters-bias-against-half-the-country-and-inaction-on-death-threats-to-conservative-user_4531637.html,,False,True,True,False,False,False,7d4d8376-a816-11e9-a92d-0e6b9fa95170,True,False,False,,False,,,,t5_2qh16,False,,,,,vc7ed8,True,,SnooBooks5387,,0,True,all_ads,False,False,,/r/technology/comments/vc7ed8/elon_musks_slams_twitters_bias_against_half_the/,all_ads,False,https://www.theepochtimes.com/elon-musks-slams-twitters-bias-against-half-the-country-and-inaction-on-death-threats-to-conservative-user_4531637.html,12198682,2022-06-14 16:27:44,0,,False
,technology,,t2_7ccf,False,,0,False,"Inside ID.me's torrid pandemic growth spurt, which led to frantic hiring, ill-equipped staff, and data-security lapses as the company closed lucrative deals with unemployment agencies and the IRS",r/technology,False,6,general,0,70,,True,t3_vccn29,False,dark,1.0,,1,0,140,,False,,False,False,,Privacy,False,1,,False,True,https://b.thumbs.redditmedia.com/VS9rIFmanDih3Y98NGENVCLwfY2HCqzMgZ-HmybA0rI.jpg,False,,link,,False,public,2022-06-14 20:22:24,text,6,,,text,businessinsider.com,False,,,,,https://www.businessinsider.com/id-me-customer-service-workers-hiring-secuirty-privacy-stress-data-2022-6,,False,True,True,False,False,False,73e6711c-a816-11e9-a993-0e21b1dd13b2,True,False,False,,False,,,,t5_2qh16,False,,,,,vccn29,True,,speckz,,0,True,all_ads,False,False,,/r/technology/comments/vccn29/inside_idmes_torrid_pandemic_growth_spurt_which/,all_ads,False,https://www.businessinsider.com/id-me-customer-service-workers-hiring-secuirty-privacy-stress-data-2022-6,12200050,2022-06-14 20:22:24,0,,False
,technology,,t2_7ccf,False,,0,False,"Microsoft Is Playing Nice With Unionizing Workers. Can the Tech Giant Be Trusted? - The company says it won’t interfere with union elections at Activision Blizzard, a video game company it’s acquiring. But it’s complicated.",r/technology,False,6,general,0,72,,True,t3_vc4o4m,False,dark,1.0,,2,0,140,,False,,False,False,,Business,False,2,,False,True,https://b.thumbs.redditmedia.com/G2DdqFzZPEOjbPrfKccoVtYJAXPvJ5e-hml5lEA6hUw.jpg,False,,link,,False,public,2022-06-14 14:26:08,text,6,,,text,newrepublic.com,False,,,,,https://newrepublic.com/article/166809/microsoft-activision-blizzard-union-agreement,,False,False,True,False,False,False,49cac61c-a816-11e9-be34-0ebbab5890a0,True,False,False,,False,,,,t5_2qh16,False,,,,,vc4o4m,True,,speckz,,0,True,all_ads,False,False,,/r/technology/comments/vc4o4m/microsoft_is_playing_nice_with_unionizing_workers/,all_ads,False,https://newrepublic.com/article/166809/microsoft-activision-blizzard-union-agreement,12197951,2022-06-14 14:26:08,0,,False
,technology,,t2_15sn60,False,,0,False,"Taylor Moore talked about various programs for image generation, shared his thoughts on whether Artists can be replaced by AI, and demonstrated lots of cool-looking images generated by him using DALL-E 2, Midjourney, and Disco Diffusion.",r/technology,False,6,general,0,73,,True,t3_vcx2m4,False,dark,1.0,,1,0,140,,False,,False,False,,Artificial Intelligence,False,1,,False,False,https://a.thumbs.redditmedia.com/RD7tFaK7c4qCRc3rDBhi-y6hLkdztn70_xRtLuHYLQ0.jpg,False,,link,,False,public,2022-06-15 15:25:52,text,6,,,text,80.lv,False,,,,,https://80.lv/articles/an-overview-of-various-ai-powered-text-to-image-tools/,,False,True,True,False,False,False,4264611c-a816-11e9-ae36-0e68c8f218d4,True,False,False,,False,,,,t5_2qh16,False,,,,,vcx2m4,True,,80lv,,0,True,all_ads,False,False,,/r/technology/comments/vcx2m4/taylor_moore_talked_about_various_programs_for/,all_ads,False,https://80.lv/articles/an-overview-of-various-ai-powered-text-to-image-tools/,12205196,2022-06-15 15:25:52,1,,False
,technology,,t2_71y7j38q,False,,0,False,This is a first for American Railroading: a leasing company is rebuilding 1970s F40PHs specifically for commuter railroads instead of purchasing brand new locomotives.,r/technology,False,6,general,0,67,,True,t3_vcz5m4,False,dark,1.0,,2,0,140,,False,,False,False,,Transportation,False,2,,False,False,https://b.thumbs.redditmedia.com/wfuXrOtgm5QOgV1rYNPvZxfh7hQ8zQ6VMA5MN3Aw_8M.jpg,False,,link,,False,public,2022-06-15 16:59:44,text,6,,,text,trains.com,False,,,,,https://www.trains.com/trn/news-reviews/news-wire/new-company-to-lease-locomotives-for-commuter-rail-operations/,,False,True,True,False,False,False,8bbeae4e-a816-11e9-bb93-0e20ab5bc1a0,True,False,False,,False,,,,t5_2qh16,False,,,,,vcz5m4,True,,Pensyfan19,,0,False,all_ads,False,False,,/r/technology/comments/vcz5m4/this_is_a_first_for_american_railroading_a/,all_ads,False,https://www.trains.com/trn/news-reviews/news-wire/new-company-to-lease-locomotives-for-commuter-rail-operations/,12205747,2022-06-15 16:59:44,0,,False
,technology,,t2_7ccf,False,,0,False,These Nanobots Can Swim Around a Wound and Kill Bacteria - Researchers have created autonomous particles covered with patches of protein “motors.” They hope these bots will tote lifesaving drugs through bodily fluids.,r/technology,False,6,general,0,73,,True,t3_uow5as,False,dark,1.0,,3,0,140,,False,,False,False,,Nanotech/Materials,False,3,,False,True,https://b.thumbs.redditmedia.com/4vRGGTRE3ditM-qD5CTVhieWeir0IsH2xKyP3MvDOMs.jpg,False,,link,,False,public,2022-05-13 16:55:28,text,6,,,text,wired.com,False,,,,,https://www.wired.com/story/these-nanobots-can-swim-around-a-wound-and-kill-bacteria/,,False,False,True,False,False,False,65de0cb0-a816-11e9-b338-0ed62c4da54a,True,False,False,,False,,,,t5_2qh16,False,,,,,uow5as,True,,speckz,,0,True,all_ads,False,False,,/r/technology/comments/uow5as/these_nanobots_can_swim_around_a_wound_and_kill/,all_ads,False,https://www.wired.com/story/these-nanobots-can-swim-around-a-wound-and-kill-bacteria/,11973872,2022-05-13 16:55:28,0,,False
,technology,,t2_6utha,False,,0,False,Did Twitch Violate Texas’ Social Media Law By Removing Mass Murderer’s Live Stream Of His Killing Spree?,r/technology,False,6,general,0,73,,True,t3_ur2490,False,dark,1.0,,3,0,140,,False,,False,False,,Social Media,False,3,,False,False,https://b.thumbs.redditmedia.com/r1qgYqnAfpNdK-UjgjD89X2FpRis10gjK9uDuUdg85s.jpg,False,,link,,False,public,2022-05-16 18:08:00,text,6,,,text,techdirt.com,False,,,,,https://www.techdirt.com/2022/05/16/did-twitch-violate-texas-social-media-law-by-removing-mass-murderers-live-stream-of-his-killing-spree/,,False,False,True,False,False,False,7d4d8376-a816-11e9-a92d-0e6b9fa95170,True,False,False,,False,,,,t5_2qh16,False,,,,,ur2490,True,,seven_seven,,0,True,all_ads,False,False,,/r/technology/comments/ur2490/did_twitch_violate_texas_social_media_law_by/,all_ads,False,https://www.techdirt.com/2022/05/16/did-twitch-violate-texas-social-media-law-by-removing-mass-murderers-live-stream-of-his-killing-spree/,11996188,2022-05-16 18:08:00,0,,False


In [6]:
%%sparksql

-- Count rows per calendar date of created_utc, earliest date first.
SELECT CAST(created_utc AS DATE), count(*)
FROM reddit_technology
GROUP BY CAST(created_utc AS DATE)
ORDER BY CAST(created_utc AS DATE) ASC

                                                                                

0,1
created_utc,count(1)
2022-05-11,3
2022-05-12,33
2022-05-13,27
2022-05-16,15
2022-06-14,44
2022-06-15,33


In [7]:
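# Optional (disabled): collect the whole temp view into a pandas DataFrame on
# the driver, with Arrow-based conversion enabled. Uncomment to run.
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# df = spark.sql("select * from reddit_technology")
# df_pandas = df.toPandas()
# df_pandas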

In [8]:
# Uncomment to shut down the Spark session once you are finished with the notebook.
# spark.stop()