# Read Table

In [81]:
import os
import yaml
import json
# Show the working directory; both files below are opened relative to it.
base = os.getcwd()
print(base)

# Parse the YAML settings first, then pull out the values used later on.
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

subreddit = config["subreddit"]
post_type = config["post_type"]
kafka_host = config["kafka_host"]
# debug = config["debug"]
debug = True  # forced on; the config value above is intentionally not read

# Parse the JSON credentials file into `creds`.
with open("creds.json", "r") as g:
    creds = json.load(g)
    
import datetime as dt
import pprint
import sys
import time

import boto3
import kafka
import numpy as np
import pandas as pd
import requests
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.column import *
from kafka import KafkaProducer
from kafka.errors import KafkaTimeoutError, NoBrokersAvailable

# from reddit.reddit_streaming import *
# import reddit

pp = pprint.PrettyPrinter(indent = 1)
print("imported modules.")

/opt/workspace/redditStreaming/src/main/python
imported modules.


In [82]:
# ! pip install /opt/workspace/redditStreaming/src/main/python/reddit/dist/reddit-0.1.0-py3-none-any.whl --force-reinstall

In [83]:
def get_bearer():
    """
    gets bearer token from reddit.

    looks for creds.json three directories above the working directory first,
    then at the fixed container path /opt/workspace/redditStreaming/creds.json.
    if neither file exists, prints a message and exits via sys.exit().
    any other error while reading the file (bad json, permissions, ...) is
    left to propagate so that it is not misreported as a missing file.

    returns: header for request (dict with User-Agent and Authorization),
             or None when the token response has no usable access_token
             (the error and the raw response body are printed).
    """
    base = os.getcwd()

    creds_path_container = os.path.join("/opt", "workspace", "redditStreaming", "creds.json")

    creds_dir = "/".join(base.split("/")[:-3])
    creds_path = os.path.join(creds_dir, "creds.json")

    creds = None
    for candidate in (creds_path, creds_path_container):
        try:
            with open(candidate, "r") as f:
                creds = json.load(f)
            break

        except FileNotFoundError:
            # try the next known location
            continue

    if creds is None:
        print("credentials file not found.")
        sys.exit()

    auth = requests.auth.HTTPBasicAuth(creds["client-id"], creds["secret-id"])
    data = {
            'grant_type': 'password',
            'username': creds["user"],
            'password': creds["password"]
            }
    headers = {'User-Agent': 'reddit-streaming/0.0.1'}

    response = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

    try:
        token = response.json()["access_token"]

    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not json; KeyError/TypeError: json without an access_token.
        # print the raw text: decoding the body again could raise inside this handler.
        print(e)
        print(response.text)
        return None

    return {**headers, 'Authorization': f"bearer {token}"}

In [84]:
def get_subreddit(subreddit, limit, post_type, before, headers):
    """
    requests one listing page for a subreddit from the reddit oauth api.

    params: subreddit (str) - name of subreddit
            limit (int) - number of results to return
            post_type (str) - listing to read (hot, new, controversial, top, etc)
            before (str) - sent, stringified, as the "before" query parameter
            headers (dict) - request header w/ bearer token

    returns: response (json) - decoded body of the api response; None when the
             request or the json decoding raised (the exception is pretty-printed)
    """

    endpoint = "https://oauth.reddit.com/r/{}/{}".format(subreddit, post_type)
    query = {"limit":str(limit), "before":str(before)}
    try:
        reply = requests.get(endpoint, headers = headers, params = query)
        return reply.json()

    except Exception as err:
        # best effort: report the problem and fall through to an implicit None
        pp.pprint(err)

def my_serializer(message):
    """
    serializes a message to utf-8 encoded json bytes.

    params: message - any json-serializable object

    returns: bytes - json text of the message, utf-8 encoded
    """
    as_text = json.dumps(message)
    return bytes(as_text, 'utf-8')


def subset_response(response):
    """
    remove nested data structures from response data

    takes the first post of a listing response and drops its nested fields so
    the remaining record has a flat schema. the post dict is modified in place
    (it is the same object that sits inside `response`). each nested field is
    removed only if present, so a post missing any of them is still handled.

    params:
        response (json) - listing response with data.children and data.after

    return:
        data (dict) - first post's data without the nested fields
        after_token (str) - value of data.after (may be None)

    raises:
        IndexError - when the listing has no children (empty response)
        KeyError / TypeError - when the response does not have the listing shape
    """
    nested_keys = (
        "preview",
        "link_flair_richtext",
        "media_embed",
        "user_reports",
        "secure_media_embed",
        "author_flair_richtext",
        "gildings",
        "all_awardings",
        "awarders",
        "treatment_tags",
        "mod_reports",
    )

    listing = response["data"]
    data = listing["children"][0]["data"] #subset for just the post data
    after_token = listing["after"] #save "after" token to get posts after this one

    #exclude nested data for schema simplicity; absent keys are skipped
    for key in nested_keys:
        data.pop(key, None)

    return(data, after_token)

In [85]:
def _publish_latest_post(producer, topic, subreddit_name, post_type, token, header, debug,
                         date_label="post date"):
    """
    fetch one post for a subreddit, push it to kafka and return the new token.

    the post is only sent (and logged in debug mode) when the response carries
    an "after" token. exceptions from the request helpers, the producer or the
    logging are left to the caller.
    """
    response = get_subreddit(subreddit_name, 1, post_type, token, header)
    my_data, after_token = subset_response(response)

    if after_token is not None:
        producer.send(topic, my_data)

        if debug:
            print("subreddit: {}, {}: {}, post title: {}, token: {}.".format(
                subreddit_name, date_label, dt.datetime.fromtimestamp(my_data["created"]),
                my_data["title"], after_token))

    return after_token


def poll_subreddit(subreddit, post_type, header, host, index, debug):
    """
    infinite loop to poll api & push new responses to kafka

    every subreddit gets its own topic named "reddit_<subreddit>". the kafka
    broker is always contacted on port 9092 of `host`.

    params:
        subreddit (list of str, or str) - subreddit names; a single name given
            as a plain string is treated as a one-element list
        post_type (str) - type of posts (new, hot, controversial, etc)
        header (dict) - request header w/ bearer token
        host (str) - kafka host name
        index (int) - not used by the polling logic; kept so existing calls keep working
        debug (bool) - debug mode (True/False)

    does not return normally. exits via sys.exit() when no kafka broker is available.
    """
    try:
        broker = ["{}:9092".format(host)]

        producer = KafkaProducer(
                    bootstrap_servers=broker,
                    value_serializer=my_serializer
                    # api_version = (0, 10, 2)
                )

    except kafka.errors.NoBrokersAvailable:
        print("no kafka broker available.")
        sys.exit()

    # a bare string would otherwise be iterated character by character below
    if isinstance(subreddit, str):
        subreddit = [subreddit]

    params = {}
    params["topic"] = ["reddit_{}".format(s) for s in subreddit]

    # first pass: an empty "before" value, to get a starting token per subreddit
    token_list = []
    for i, s in enumerate(subreddit):
        token_list.append(
            _publish_latest_post(producer, params["topic"][i], s, post_type, "", header, debug))

    params["token"] = token_list
    # retry quickly when a subreddit came back without a token
    time.sleep(5 if None in token_list else 30)

    while True:
        token_list = []
        for i, s in enumerate(subreddit):
            after_token = params["token"][i]
            try:
                ## weird bug where it hits the api too fast(?) and no after token is returned
                ## this passes None, which gives the current post & correct access token
                token_list.append(
                    _publish_latest_post(producer, params["topic"][i], s, post_type,
                                         after_token, header, debug))
                time.sleep(5)

            except json.decoder.JSONDecodeError:
                # when the bearer token expires (after 24 hrs), we do not receive a response
                # NOTE(review): get_subreddit catches Exception itself, so this branch
                # may never fire - confirm how an expired token actually shows up here.
                print("bearer token expired, reauthenticating...")
                header = get_bearer()

                token_list.append(
                    _publish_latest_post(producer, params["topic"][i], s, post_type,
                                         params["token"][i], header, debug,
                                         date_label="post datetime"))
                time.sleep(5)

            except IndexError:
                # this means empty response is returned, keep the old token and take a nap
                token_list.append(params["token"][i])
                time.sleep(3)

            except Exception as e:
                # catch all for api exceptions (SSL errors, ConnectionError, etc)
                print(e)
                token_list.append(params["token"][i])
                time.sleep(60)

        params["token"] = token_list
        time.sleep(5 if None in token_list else 110)
    

In [86]:
subreddit = "technology"
post_type = "new"
kafka_host = config["kafka_host"]
spark_host = config["spark_host"]
postgres_host = config["postgres_host"]

my_header = get_bearer()
if my_header is None:
    # get_bearer already printed the reason; do not claim success below
    raise RuntimeError("reddit authentication failed.")

# The Authorization value is a live credential: mask it so it never ends up
# in the saved notebook output.
pp.pprint({key: ("<redacted>" if key == "Authorization" else value)
           for key, value in my_header.items()})
print("authenticated w/ bearer token good for 24 hrs.")
# poll_subreddit(subreddit, post_type, my_header, kafka_host, 0, True)


{'Authorization': 'bearer <redacted>',
 'User-Agent': 'reddit-streaming/0.0.1'}
authenticated w/ bearer token good for 24 hrs.


## Spark

In [87]:
# ! pip install pyyaml --force-reinstall

In [88]:
try:
    pp = pprint.PrettyPrinter(indent = 1)
    %load_ext sparksql_magic

    # creds, config = read_files()
    with open("config.yaml", "r") as g:
        config = yaml.safe_load(g)
        g.close()

    with open("creds.json", "r") as f:
        creds = json.load(f)
        f.close()

    subreddit_list = config["subreddit"]
    kafka_host = config["kafka_host"]
    spark_host = config["spark_host"]
    aws_client = creds["aws_client"]
    aws_secret = creds["aws_secret"]

except Exception as e:
    print("EXCEPTION: {}".format(e))

The sparksql_magic extension is already loaded. To reload it, use:
  %reload_ext sparksql_magic


In [89]:
# For every configured subreddit: obtain a SparkSession, read the cleaned delta
# data from S3 and expose it as the temp view reddit_<subreddit>.
for subreddit in subreddit_list:
    # NOTE(review): getOrCreate() presumably hands back the already running session
    # on later iterations, so the per-subreddit app name and directories may only
    # take effect for the first subreddit - confirm.
    spark_settings = {
        "spark.scheduler.mode": "FAIR",
        "spark.scheduler.allocation.file": "file:///opt/workspace/redditStreaming/fairscheduler.xml",
        "spark.executor.memory": "1024m",
        "spark.executor.cores": "2",
        "spark.streaming.concurrentJobs": "4",
        "spark.local.dir": "/opt/workspace/tmp/driver/{}/".format(subreddit),
        "spark.worker.dir": "/opt/workspace/tmp/executor/{}/".format(subreddit),
        "spark.sql.debug.maxToStringFields": 1000,
        "spark.eventLog.enabled": "true",
        "spark.eventLog.dir": "file:///opt/workspace/events",
        "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,org.apache.hadoop:hadoop-common:3.3.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-client:3.3.1,io.delta:delta-core_2.12:1.2.1",
        "spark.hadoop.fs.s3a.access.key": aws_client,
        "spark.hadoop.fs.s3a.secret.key": aws_secret,
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        'spark.hadoop.fs.s3a.aws.credentials.provider': 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore",
    }

    session_builder = SparkSession.builder.appName("reddit_" + subreddit + "_read_data")
    session_builder = session_builder.master("spark://{}:7077".format(spark_host))
    # dicts keep insertion order, so the settings are applied in the order listed above
    for setting_name, setting_value in spark_settings.items():
        session_builder = session_builder.config(setting_name, setting_value)

    spark = session_builder.enableHiveSupport().getOrCreate()

    print("created spark successfully.")

    delta_path = "s3a://reddit-stevenhurwitt/" + subreddit + "_clean/"
    df = spark.read.format("delta").option("header", True).load(delta_path)

    view_name = "reddit_{}".format(subreddit)
    df.createOrReplaceTempView(view_name)
    print("created table reddit_{}.".format(subreddit))

created spark successfully.


Py4JJavaError: An error occurred while calling o93.load.
: java.util.concurrent.ExecutionException: org.apache.hadoop.fs.s3a.UnknownStoreException: s3a://reddit-stevenhurwitt/technology_clean/_delta_log
	at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:306)
	at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:293)
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116)
	at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135)
	at com.google.common.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2410)
	at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2380)
	at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2257)
	at com.google.common.cache.LocalCache.get(LocalCache.java:4000)
	at com.google.common.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
	at org.apache.spark.sql.delta.DeltaLog$.getDeltaLogFromCache$1(DeltaLog.scala:577)
	at org.apache.spark.sql.delta.DeltaLog$.apply(DeltaLog.scala:584)
	at org.apache.spark.sql.delta.DeltaLog$.forTable(DeltaLog.scala:487)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.deltaLog$lzycompute(DeltaTableV2.scala:78)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.deltaLog(DeltaTableV2.scala:78)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.$anonfun$snapshot$3(DeltaTableV2.scala:107)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.snapshot$lzycompute(DeltaTableV2.scala:107)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.snapshot(DeltaTableV2.scala:95)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.toBaseRelation(DeltaTableV2.scala:165)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.$anonfun$createRelation$4(DeltaDataSource.scala:187)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:120)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:118)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.recordFrameProfile(DeltaDataSource.scala:50)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:164)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.hadoop.fs.s3a.UnknownStoreException: s3a://reddit-stevenhurwitt/technology_clean/_delta_log
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:257)
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:170)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3348)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3185)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3053)
	at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1760)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.exists(S3AFileSystem.java:4263)
	at org.apache.spark.sql.delta.storage.S3SingleDriverLogStore.listFromInternal(S3SingleDriverLogStore.scala:120)
	at org.apache.spark.sql.delta.storage.S3SingleDriverLogStore.listFrom(S3SingleDriverLogStore.scala:140)
	at org.apache.spark.sql.delta.SnapshotManagement.listFrom(SnapshotManagement.scala:68)
	at org.apache.spark.sql.delta.SnapshotManagement.listFrom$(SnapshotManagement.scala:67)
	at org.apache.spark.sql.delta.DeltaLog.listFrom(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.listDeltaAndCheckpointFiles(SnapshotManagement.scala:96)
	at org.apache.spark.sql.delta.SnapshotManagement.$anonfun$getLogSegmentForVersion$1(SnapshotManagement.scala:138)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:120)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:118)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.getLogSegmentForVersion(SnapshotManagement.scala:135)
	at org.apache.spark.sql.delta.SnapshotManagement.getLogSegmentForVersion$(SnapshotManagement.scala:132)
	at org.apache.spark.sql.delta.DeltaLog.getLogSegmentForVersion(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.getLogSegmentFrom(SnapshotManagement.scala:63)
	at org.apache.spark.sql.delta.SnapshotManagement.getLogSegmentFrom$(SnapshotManagement.scala:61)
	at org.apache.spark.sql.delta.DeltaLog.getLogSegmentFrom(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.$anonfun$getSnapshotAtInit$1(SnapshotManagement.scala:248)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:120)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:118)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.getSnapshotAtInit(SnapshotManagement.scala:246)
	at org.apache.spark.sql.delta.SnapshotManagement.getSnapshotAtInit$(SnapshotManagement.scala:245)
	at org.apache.spark.sql.delta.DeltaLog.getSnapshotAtInit(DeltaLog.scala:64)
	at org.apache.spark.sql.delta.SnapshotManagement.$init$(SnapshotManagement.scala:53)
	at org.apache.spark.sql.delta.DeltaLog.<init>(DeltaLog.scala:69)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$3(DeltaLog.scala:567)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$2(DeltaLog.scala:567)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:120)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:118)
	at org.apache.spark.sql.delta.DeltaLog$.recordFrameProfile(DeltaLog.scala:437)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperation$5(DeltaLogging.scala:114)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
	at org.apache.spark.sql.delta.DeltaLog$.recordOperation(DeltaLog.scala:437)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:113)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:98)
	at org.apache.spark.sql.delta.DeltaLog$.recordDeltaOperation(DeltaLog.scala:437)
	at org.apache.spark.sql.delta.DeltaLog$.createDeltaLog$1(DeltaLog.scala:566)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$4(DeltaLog.scala:577)
	at com.google.common.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4792)
	at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
	at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
	... 37 more
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: The specified bucket does not exist (Service: Amazon S3; Status Code: 404; Error Code: NoSuchBucket; Request ID: TXMJ0FH2Q2HC5GRN; S3 Extended Request ID: ymbPzMKHaj2PumQq+GC0iM6CnqNHLffiWJ1lNYWo6dTtZQWTVc/8Aioci/iSAo1ps+mS4BtwVk8=; Proxy: null), S3 Extended Request ID: ymbPzMKHaj2PumQq+GC0iM6CnqNHLffiWJ1lNYWo6dTtZQWTVc/8Aioci/iSAo1ps+mS4BtwVk8=
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1828)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1412)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1374)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1145)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:802)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:770)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:744)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:704)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:686)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:550)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:530)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5227)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5173)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5167)
	at com.amazonaws.services.s3.AmazonS3Client.listObjectsV2(AmazonS3Client.java:963)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listObjects$7(S3AFileSystem.java:2116)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:412)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:375)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.listObjects(S3AFileSystem.java:2107)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3322)
	... 84 more


## Athena

In [94]:
pp = pprint.PrettyPrinter(indent = 1)

# Start an Athena query over the subreddit's table; results are written to S3.
subreddit = "technology" 
athena = boto3.client('athena', region_name = "us-east-1")
df = athena.start_query_execution(
         QueryString = "select * from reddit.{}".format(subreddit),
         ResultConfiguration = {
             'OutputLocation': "s3://reddit-stevenhurwitt/_athena_results"
         })

# pp.pprint(df)

query_execution_id = df["QueryExecutionId"]
request_id = df["ResponseMetadata"]["RequestId"]
print("query execution id : {}".format(query_execution_id))
print("request id: {}".format(request_id))

# start_query_execution only submits the query. Wait (bounded) until it leaves
# the queued/running states so the results can be fetched afterwards.
query_state = None
for _ in range(60):
    query_state = athena.get_query_execution(
        QueryExecutionId = query_execution_id)["QueryExecution"]["Status"]["State"]
    if query_state not in ("QUEUED", "RUNNING"):
        break
    time.sleep(1)

print("query state: {}".format(query_state))
if query_state != "SUCCEEDED":
    raise RuntimeError("athena query {} did not succeed (state: {}).".format(query_execution_id, query_state))

ClientError: An error occurred (UnrecognizedClientException) when calling the StartQueryExecution operation: The security token included in the request is invalid.

In [91]:
# Fetch up to 1000 result rows for the query started above.
response = athena.get_query_results(
    MaxResults=1000,
    QueryExecutionId=query_execution_id,
)
# Keep only the "Data" entry of the first returned row.
df = response["ResultSet"]["Rows"][0]["Data"]
# df = pd.DataFrame.from_records(df)
df

NameError: name 'athena' is not defined

In [92]:
# Pretty-print whatever is currently bound to `df`.
pp.pprint(df)

NameError: name 'df' is not defined

In [93]:
# Build a DataFrame from `df`; as the last expression it is shown as the cell output.
pd.DataFrame.from_dict(df, orient = "columns")

NameError: name 'df' is not defined

## technology

In [10]:
%%sparksql

-- row count per post_date in reddit_technology, earliest date first
SELECT post_date, COUNT(*)
FROM reddit_technology
GROUP BY post_date
ORDER BY post_date ASC

UsageError: Cell magic `%%sparksql` not found.


In [11]:
%%sparksql

-- the 10 rows of reddit_technology with the smallest created_utc
SELECT *
FROM reddit_technology
ORDER BY created_utc ASC
LIMIT 10

UsageError: Cell magic `%%sparksql` not found.


## news

In [12]:
%%sparksql

-- row count per post_date in reddit_news, earliest date first
SELECT post_date, COUNT(*)
FROM reddit_news
GROUP BY post_date
ORDER BY post_date ASC

UsageError: Cell magic `%%sparksql` not found.


In [13]:
%%sparksql

-- the 10 rows of reddit_news with the smallest created_utc
SELECT *
FROM reddit_news
ORDER BY created_utc ASC
LIMIT 10

UsageError: Cell magic `%%sparksql` not found.


## worldnews

In [14]:
%%sparksql

-- row count per post_date in reddit_worldnews, earliest date first
SELECT post_date, COUNT(*)
FROM reddit_worldnews
GROUP BY post_date
ORDER BY post_date ASC

UsageError: Cell magic `%%sparksql` not found.


In [15]:
%%sparksql

-- the 10 rows of reddit_worldnews with the smallest created_utc
SELECT *
FROM reddit_worldnews
ORDER BY created_utc ASC
LIMIT 10

UsageError: Cell magic `%%sparksql` not found.


## ProgrammerHumor

In [16]:
%%sparksql

-- row count per post_date in reddit_ProgrammerHumor, earliest date first
SELECT post_date, COUNT(*)
FROM reddit_ProgrammerHumor
GROUP BY post_date
ORDER BY post_date ASC

UsageError: Cell magic `%%sparksql` not found.


In [17]:
%%sparksql

-- the 10 rows of reddit_ProgrammerHumor with the smallest created_utc
SELECT *
FROM reddit_ProgrammerHumor
ORDER BY created_utc ASC
LIMIT 10

UsageError: Cell magic `%%sparksql` not found.


### stop spark

In [18]:
# Stop the session bound to `spark`; this needs the session-creating cell above
# to have run in the same kernel, otherwise `spark` is undefined.
spark.stop()
print("stopped spark successfully.")

NameError: name 'spark' is not defined