# database upload

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import psycopg2
import datetime as dt
from delta import *
import boto3
import pprint
import yaml
import time
import json
import sys
import ast
import os

pp = pprint.PrettyPrinter(indent = 1)
print("imported modules.")

The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


## creds

In [2]:
# Candidate locations for creds.json, tried in order: the container
# workspace path first, then the local development checkout.
creds_candidates = [
    os.path.join("/opt", "workspace", "redditStreaming", "creds.json"),
    "/home/steven/Documents/reddit-streaming/redditStreaming/creds.json",
]

for creds_path in creds_candidates:
    try:
        # the `with` block closes the file on exit; no explicit close() is needed
        with open(creds_path, "r") as f:
            creds = json.load(f)
    except OSError:
        # this location is missing or unreadable; try the next candidate.
        # A malformed creds.json is NOT swallowed here: json.load raises and
        # the error surfaces instead of silently falling back.
        continue
    print("read creds.json.")
    break
else:
    # no candidate could be opened; fail loudly rather than leave `creds` undefined
    raise FileNotFoundError(
        "creds.json not found in any of: {}".format(", ".join(creds_candidates))
    )

print("read creds successfully.")

read creds.json.
read creds successfully.


## spark

In [8]:
# Connection settings for the standalone Spark master and the S3 credentials
# read from creds.json in the creds cell.
spark_host = "spark-master"
aws_client = creds["aws_client"]
aws_secret = creds["aws_secret"]
index = 0  # selects the FAIR scheduler pool name ("pool<index>") below
subreddit = "technology"

# initialize spark session
try:
    spark = (
        SparkSession.builder.appName("reddit_{}".format(subreddit))
        .master("spark://{}:7077".format(spark_host))
        .config("spark.scheduler.mode", "FAIR")
        .config("spark.scheduler.allocation.file", "file:///opt/workspace/redditStreaming/fairscheduler.xml")
        .config("spark.executor.memory", "4096m")
        .config("spark.executor.cores", "4")
        .config("spark.local.dir", "/opt/workspace/tmp/driver/{}/".format(subreddit))
        .config("spark.worker.dir", "/opt/workspace/tmp/executor/{}/".format(subreddit))
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "file:///opt/workspace/events/{}/".format(subreddit))
        .config("spark.sql.debug.maxToStringFields", 1000)
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.hadoop:hadoop-common:3.3.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-client:3.3.1,io.delta:delta-core_2.12:1.2.1,org.postgresql:postgresql:42.5.0")
        .config("spark.hadoop.fs.s3a.access.key", aws_client)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
        .config('spark.hadoop.fs.s3a.buffer.dir', '/opt/workspace/tmp/blocks')
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
        .enableHiveSupport()
        .getOrCreate()
    )
    # Currently disabled settings, kept for reference:
    #   spark.hadoop.fs.s3a.fast.upload.buffer = bytebuffer
    #   hadoopConfiguration fs.s3a.awsAccessKeyId / fs.s3a.awsSecretAccessKey
    #   hadoopConfiguration fs.s3a.endpoint = s3.us-east-2.amazonaws.com

    sc = spark.sparkContext
    sc.setLogLevel('WARN')
    sc.setLocalProperty("spark.scheduler.pool", "pool{}".format(str(index)))
    print("created spark successfully")

except Exception as e:
    # Show the error, then re-raise: swallowing it would leave `spark`/`sc`
    # undefined and make later cells fail with an unrelated NameError.
    print(e)
    raise

22/12/12 01:59:10 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
created spark successfully


In [14]:
# NOTE(review): this cell was executed as In [14], i.e. after the upload loop (In [13]).
# Run top-to-bottom, it stops the session before write_spark_jdbc reads through `spark`.
spark.stop()

## read clean df and write to postgres (jdbc)

In [4]:
def _load_creds():
    """Load creds.json from the container path, falling back to the local checkout."""
    candidates = [
        os.path.join("/opt", "workspace", "redditStreaming", "creds.json"),
        "/home/steven/Documents/reddit-streaming/redditStreaming/creds.json",
    ]
    for path in candidates:
        try:
            # the `with` block closes the file; no explicit close() is needed
            with open(path, "r") as f:
                loaded = json.load(f)
        except OSError:
            # missing or unreadable at this location; try the next one
            continue
        print("read creds.json.")
        return loaded
    raise FileNotFoundError(
        "creds.json not found in any of: {}".format(", ".join(candidates))
    )


def _parse_secret(secret_string):
    """Parse a Secrets Manager SecretString into a Python object."""
    try:
        # JSON parsing also accepts true/false/null, which ast.literal_eval rejects
        return json.loads(secret_string)
    except ValueError:
        # backward-compatible fallback for a secret stored as a Python literal
        return ast.literal_eval(secret_string)


def write_spark_jdbc(subreddit):
    """Copy the `<subreddit>_clean` delta table from S3 into PostgreSQL.

    Reads AWS keys from creds.json, fetches the database credentials from the
    "dev/reddit/postgres" secret in AWS Secrets Manager, loads the delta table
    with the notebook-level `spark` session and overwrites the table
    `reddit.<subreddit>` through JDBC.

    Args:
        subreddit: subreddit name; used for both the S3 path and the target table.

    Returns:
        None. A failed JDBC write is printed, not raised, so a loop over several
        subreddits keeps going. Errors while loading credentials, reading the
        delta table or fetching the secret do propagate.
    """
    creds = _load_creds()

    secretmanager_client = boto3.client(
        "secretsmanager",
        region_name="us-east-2",
        aws_access_key_id=creds["aws_client"],
        aws_secret_access_key=creds["aws_secret"],
    )

    # NOTE(review): `header` is normally a CSV reader option; presumably it has
    # no effect on a delta read — kept unchanged, confirm before removing.
    df = spark.read.format("delta").option("header", True).load("s3a://reddit-streaming-stevenhurwitt/" + subreddit + "_clean")

    secret_value = secretmanager_client.get_secret_value(SecretId="dev/reddit/postgres")
    db_creds = _parse_secret(secret_value["SecretString"])
    connect_str = "jdbc:postgresql://{}:5432/reddit".format(db_creds["host"])

    try:
        df.write.format("jdbc") \
            .mode("overwrite") \
            .option("url", connect_str) \
            .option("dbtable", "reddit.{}".format(subreddit)) \
            .option("user", db_creds["username"]) \
            .option("password", db_creds["password"]) \
            .option("driver", "org.postgresql.Driver") \
            .save()

        print("wrote df to postgresql table.")

    except Exception as e:
        # deliberate best-effort: report the failure and let the caller continue
        print(e)

In [11]:
# Load notebook settings from config.yaml (path is relative to the working directory).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("config.yaml") as g:
    config = yaml.safe_load(g)

In [12]:
config["subreddit"]

['technology',
 'ProgrammerHumor',
 'news',
 'worldnews',
 'BikiniBottomTwitter',
 'BlackPeopleTwitter',
 'WhitePeopleTwitter',
 'aws']

In [13]:
# Upload the clean table of every subreddit listed under "subreddit" in config.yaml.
# write_spark_jdbc prints (rather than raises) a failed JDBC write, so one failed
# write does not stop the remaining subreddits.
for s in config["subreddit"]:
    print("subreddit: {}".format(s))
    write_spark_jdbc(s)

subreddit: technology
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: ProgrammerHumor
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: news
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: worldnews
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: BikiniBottomTwitter
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: BlackPeopleTwitter
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: WhitePeopleTwitter
read creds.json.


                                                                                

wrote df to postgresql table.
subreddit: aws
read creds.json.


                                                                                

wrote df to postgresql table.


## write to postgres table

In [6]:
# Re-load config.yaml (the same file is also read earlier in the notebook).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("config.yaml", "r") as g:
    config = yaml.safe_load(g)

### create schema

In [17]:
# Open a psycopg2 connection to the `postgres` database using the host, user and
# password entries from config.yaml.
# NOTE(review): `conn` is not used or closed in any visible cell — confirm whether
# the schema-creation step this section is named after is missing.
conn = psycopg2.connect(host = config["postgres_host"], user = config["postgres_user"], password = config["postgres_password"], database="postgres")


In [25]:
# JDBC URL of the `postgres` database on the host named in config.yaml
connect_str = "jdbc:postgresql://{}:5432/postgres".format(config["postgres_host"])

try:
    # NOTE(review): `df` is not assigned in any visible top-level cell (only
    # inside write_spark_jdbc) — confirm where it is expected to come from.
    # Overwrites the table public.<subreddit> with the contents of `df`.
    (
        df.write.format("jdbc")
        .mode("overwrite")
        .options(
            url=connect_str,
            dbtable="public.{}".format(subreddit),
            user=config["postgres_user"],
            password=config["postgres_password"],
            driver="org.postgresql.Driver",
        )
        .save()
    )

    print("wrote df to postgresql table.")

except Exception as e:
    # best-effort: report the failure without interrupting the notebook
    print(e)



wrote df to postgresql table.


                                                                                

## stop spark

In [24]:
# Shut down the Spark session at the end of the notebook.
# Any error from stop() is printed instead of raised, so this cell never fails
# (e.g. the recorded run printed "SparkSession does not exist in the JVM").
try:
    spark.stop()

except Exception as e:
    print(e)

SparkSession does not exist in the JVM
