# Read table

In [1]:
# Entry point used below to build the Spark session.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Star import: later cells rely on `col` being brought into scope here.
from pyspark.sql.functions import *
# Project-local helpers; `read_files` (called below) is presumably defined
# in this module -- TODO confirm.
from reddit.reddit_streaming import *
import datetime as dt
import pprint
# Pretty-printer instance; no cell visible in this notebook uses it.
pp = pprint.PrettyPrinter(indent = 1)
# Load the extension that provides the `%%sparksql` cell magic used by the
# query cells further down.
%load_ext sparksql_magic

# Load credentials and runtime settings through the project helper
# `read_files` (brought in by the star imports above).
creds, config = read_files()
subreddit = config["subreddit"]
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
aws_client = creds["aws-client"]
aws_secret = creds["aws-secret"]

# Maven coordinates handed to `spark.jars.packages`; Spark resolves them
# (via Ivy) when the session starts. Joined with commas, this is the same
# value the builder previously received as one long literal.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0",
    "org.apache.hadoop:hadoop-common:3.3.1",
    "org.apache.hadoop:hadoop-aws:3.3.1",
    "org.apache.hadoop:hadoop-client:3.3.1",
    "io.delta:delta-core_2.12:1.2.1",
])

try:
    # Build (or reuse) a session on the standalone master, with event
    # logging, S3A access via static keys, and the Delta Lake SQL
    # extension + catalog enabled.
    spark = (
        SparkSession.builder.appName("reddit_" + subreddit + "_read_data")
        .master("spark://{}:7077".format(spark_host))
        .config("spark.sql.debug.maxToStringFields", 1000)
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "file:///opt/workspace/events")
        .config("spark.jars.packages", spark_packages)
        .config("spark.hadoop.fs.s3a.access.key", aws_client)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .enableHiveSupport()
        .getOrCreate()
    )

    print("imported modules")

except Exception as e:
    print(e)
    # Re-raise so a failed session build stops the notebook here. Swallowing
    # the error would leave `spark` undefined (or stale from an earlier run)
    # and the following cells would fail with a less helpful message.
    raise

:: loading settings :: url = jar:file:/usr/local/lib/python3.7/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5d8cfe07-fdd6-462a-916c-ed93d35006d0;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.

imported modules


In [11]:
# Columns that are cast to Spark timestamps after the table is loaded.
timestamp_columns = ["approved_at_utc", "banned_at_utc", "created_utc", "created"]

try:
    # Read the subreddit's Delta table from S3. The previous
    # `.option("header", True)` was dropped: it is a CSV reader setting
    # and has no effect on a Delta read.
    df = spark.read.format("delta").load("s3a://reddit-stevenhurwitt/" + subreddit)

    # `withColumn` with an existing column name replaces that column in
    # place, so the column order of the frame is unchanged.
    for ts_col in timestamp_columns:
        df = df.withColumn(ts_col, col(ts_col).cast("timestamp"))

    # Expose the frame to the SQL cells as `reddit_<subreddit>`.
    df.createOrReplaceTempView("reddit_{}".format(subreddit))

except KeyboardInterrupt:
    # Manual interrupt of a slow load: report it instead of showing a traceback.
    print("loading data took too long... cancelled.")

                                                                                

In [12]:
%%sparksql

-- Number of posts per calendar day, oldest day first.
SELECT
    CAST(created_utc AS DATE),
    COUNT(*)
FROM reddit_technology
GROUP BY CAST(created_utc AS DATE)
ORDER BY CAST(created_utc AS DATE) ASC

                                                                                

0,1
created_utc,count(1)
2022-05-11,3
2022-05-12,33
2022-05-13,27
2022-05-16,15
2022-06-14,44
2022-06-15,42
2022-06-16,6


In [14]:
%%sparksql

-- Every column for the posts created on 2022-06-16.
SELECT *
FROM reddit_technology
WHERE CAST(created_utc AS DATE) = '2022-06-16'

                                                                                

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98
approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,thumbnail_height,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,ups,total_awards_received,thumbnail_width,author_flair_template_id,is_original_content,secure_media,is_reddit_media_domain,is_meta,category,link_flair_text,can_mod_post,score,approved_by,is_created_from_ads_ui,author_premium,thumbnail,edited,author_flair_css_class,post_hint,content_categories,is_self,subreddit_type,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,url_overridden_by_dest,view_count,archived,no_follow,is_crosspostable,pinned,over_18,media_only,link_flair_template_id,can_gild,spoiler,locked,author_flair_text,visited,removed_by,mod_note,distinguished,subreddit_id,author_is_blocked,mod_reason_by,num_reports,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video
,technology,,t2_nkilhnr6,False,,0,False,AI is learning how to create itself,r/technology,False,6,general,0,70,,True,t3_vdaoma,False,dark,1.0,,2,0,140,,False,,False,False,,Machine Learning,False,2,,False,False,https://b.thumbs.redditmedia.com/rCZjhci9q0dlH-BdiMpmtAPNf6-_nzEzcCCX86VilDk.jpg,False,,link,,False,public,2022-06-16 01:57:20,text,6,,,text,technologyreview.com,False,,,,,https://www.technologyreview.com/2021/05/27/1025453/artificial-intelligence-learning-create-itself-agi/?utm_source=Facebook&amp;utm_medium=tr_social&amp;utm_campaign=site_visitor.unpaid.engagement,,False,False,True,False,False,False,63088948-a816-11e9-a4e1-0e7250e20740,True,False,False,,False,,,,t5_2qh16,False,,,,,vdaoma,True,,Krazyscientist,,0,True,all_ads,False,False,,/r/technology/comments/vdaoma/ai_is_learning_how_to_create_itself/,all_ads,False,https://www.technologyreview.com/2021/05/27/1025453/artificial-intelligence-learning-create-itself-agi/?utm_source=Facebook&amp;utm_medium=tr_social&amp;utm_campaign=site_visitor.unpaid.engagement,12208393,2022-06-16 01:57:20,1,,False
,technology,,t2_7ccf,False,,0,False,Facebook Is Receiving Sensitive Medical Information from Hospital Websites - Experts say some hospitals’ use of an ad tracking tool may violate a federal law protecting health information,r/technology,False,6,general,0,73,,True,t3_vdmkci,False,dark,1.0,,3,0,140,,False,,False,False,,Privacy,False,3,,False,True,https://a.thumbs.redditmedia.com/UuH2_cUD64o4fn97vx2zCj9r9xBYathK6M1q_m1Mwq4.jpg,False,,link,,False,public,2022-06-16 13:54:08,text,6,,,text,themarkup.org,False,,,,,https://themarkup.org/pixel-hunt/2022/06/16/facebook-is-receiving-sensitive-medical-information-from-hospital-websites,,False,False,True,False,False,False,73e6711c-a816-11e9-a993-0e21b1dd13b2,True,False,False,,False,,,,t5_2qh16,False,,,,,vdmkci,True,,speckz,,0,True,all_ads,False,False,,/r/technology/comments/vdmkci/facebook_is_receiving_sensitive_medical/,all_ads,False,https://themarkup.org/pixel-hunt/2022/06/16/facebook-is-receiving-sensitive-medical-information-from-hospital-websites,12211319,2022-06-16 13:54:08,0,,False
,technology,,t2_4wegnp53,False,,0,False,WSJ News Exclusive | Elon Musk Expected to Reiterate Desire to Own Twitter in Meeting Thursday,r/technology,False,6,general,0,70,,True,t3_vdadoc,False,dark,1.0,,1,0,140,,False,,False,False,,Social Media,False,1,,False,False,https://b.thumbs.redditmedia.com/siFK-uc_si7Zb5hKT2TfrhcEefkUnyoNPztrmE2EWNs.jpg,False,,link,,False,public,2022-06-16 01:40:16,text,6,,,text,wsj.com,False,,,,,https://www.wsj.com/amp/articles/elon-musk-expected-to-reiterate-desire-to-own-twitter-in-meeting-thursday-11655333603,,False,True,True,False,False,False,7d4d8376-a816-11e9-a92d-0e6b9fa95170,True,False,False,,False,,,,t5_2qh16,False,,,,,vdadoc,True,,dontloseyourway1610,,1,False,all_ads,False,False,,/r/technology/comments/vdadoc/wsj_news_exclusive_elon_musk_expected_to/,all_ads,False,https://www.wsj.com/amp/articles/elon-musk-expected-to-reiterate-desire-to-own-twitter-in-meeting-thursday-11655333603,12208318,2022-06-16 01:40:16,0,,False
,technology,,t2_lnlop4tb,False,,0,False,Japan makes online insults punishable by 1 year in jail in wake of reality star's death,r/technology,False,6,general,0,73,,True,t3_vd8y9a,False,dark,1.0,,2,0,140,,False,,False,False,,Society,False,2,,False,False,https://a.thumbs.redditmedia.com/Watv4fV4M2JveeeJcG1FH8kZDfQfRQJNIp9Q2UjpUl4.jpg,False,,link,,False,public,2022-06-16 00:25:36,text,6,,,text,nbcnews.com,False,,,,,https://www.nbcnews.com/news/world/japan-online-insults-jail-cyberbullying-reality-star-death-rcna33669,,False,True,True,False,False,False,80187d9a-a816-11e9-a79b-0ea66e3761e6,True,False,False,,False,,,,t5_2qh16,False,,,,,vd8y9a,True,,187Shotta,,0,True,all_ads,False,False,,/r/technology/comments/vd8y9a/japan_makes_online_insults_punishable_by_1_year/,all_ads,False,https://www.nbcnews.com/news/world/japan-online-insults-jail-cyberbullying-reality-star-death-rcna33669,12207967,2022-06-16 00:25:36,0,,False
,technology,,t2_88icg8jq,False,,0,False,Boring Company receives approval for expanding its tunnels to downtown Las Vegas,r/technology,False,6,general,0,73,,True,t3_vda9sf,False,dark,1.0,,3,0,140,,False,,False,False,,Transportation,False,3,,False,True,https://a.thumbs.redditmedia.com/hFIjrMDEOkQuhpf5Eh3qcDvkyTvGx6Yw318s2wpThS0.jpg,False,,link,,False,public,2022-06-16 01:36:00,text,6,,,text,theverge.com,False,,,,,https://www.theverge.com/2022/6/15/23170170/the-boring-company-las-vegas-loop-tunnels-expand,,False,False,True,False,False,False,8bbeae4e-a816-11e9-bb93-0e20ab5bc1a0,True,False,False,,False,,,,t5_2qh16,False,,,,,vda9sf,True,,TorukMaktoM,,0,True,all_ads,False,False,,/r/technology/comments/vda9sf/boring_company_receives_approval_for_expanding/,all_ads,False,https://www.theverge.com/2022/6/15/23170170/the-boring-company-las-vegas-loop-tunnels-expand,12208302,2022-06-16 01:36:00,0,,False
,technology,,t2_55nbx,False,,0,False,Inside Kraken’s Culture War Stoked by Its C.E.O.,r/technology,False,6,general,0,73,,True,t3_vd8wb6,False,dark,1.0,,1,0,140,,False,,False,False,,Crypto,False,1,,False,False,https://b.thumbs.redditmedia.com/3p9A7sEoR388DInnhHdt_CzkAWHVg1NwFKQ0INbh52Y.jpg,False,,link,,False,public,2022-06-16 00:23:28,text,6,,,text,nytimes.com,False,,,,,https://www.nytimes.com/2022/06/15/technology/kraken-crypto-culture.html?referringSource=articleShare,,False,True,True,False,False,False,4c702ff6-a816-11e9-b084-0ebd2fbd78b0,True,False,False,,False,,,,t5_2qh16,False,,,,,vd8wb6,True,,BalboaBaggins,,0,True,all_ads,False,False,,/r/technology/comments/vd8wb6/inside_krakens_culture_war_stoked_by_its_ceo/,all_ads,False,https://www.nytimes.com/2022/06/15/technology/kraken-crypto-culture.html?referringSource=articleShare,12207958,2022-06-16 00:23:28,0,,False


In [7]:
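# Optional (disabled): convert the whole temp view to a pandas DataFrame,
# with Arrow enabled for the conversion. Uncomment to run.
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# df = spark.sql("select * from reddit_technology")
# df_pandas = df.toPandas()
# df_pandas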

In [8]:
# Uncomment to stop the Spark session once you are finished with the notebook.
# spark.stop()