## PostgreSQL Query as PySpark DataFrame and PySpark DataFrame as PostgreSQL Table

In [1]:
from pathlib import Path
import configparser
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
# The location of the settings file comes from the CONFIG_PATH environment
# variable; the value is None when that variable is not set.
config_file = os.environ.get("CONFIG_PATH")

In [3]:
# Load the settings file named by config_file into a ConfigParser.
# ConfigParser.read() does not raise when a file is missing or unreadable:
# it skips that file and returns the list of files it actually parsed.
# Checking that list (and guarding against config_file being None) is what
# makes the "not found" message reachable.
config = configparser.ConfigParser()
if not config_file or not config.read(config_file):
    print("config.ini file not found")

In [4]:
# Path to the PostgreSQL JDBC driver, as configured under [postgresql]
postgres_jdbc_driver = Path(config["postgresql"]["jdbc_driver_path"])

In [5]:
# Pull the PostgreSQL connection settings (for a DSN-less connection) out of
# the [postgresql] section, in the order host, port, database, user, password.
pg_host, pg_port, pg_db, pg_user, pg_pwd = (
    config["postgresql"][option]
    for option in ("host", "port", "database", "username", "password")
)

In [6]:
# Fully qualified class name of the PostgreSQL JDBC driver
driver = "org.postgresql.Driver"
# JDBC connection URL built from the configured host, port and database
url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}"

In [7]:
# Configure a local-mode session named "Postgres" and put the PostgreSQL
# JDBC driver on spark.jars, then create the session (or reuse a running one).
session_builder = SparkSession.builder.master("local[*]").appName("Postgres")
session_builder = session_builder.config("spark.jars", postgres_jdbc_driver)
spark = session_builder.getOrCreate()

#### PostgreSQL Query as DataFrame

In [8]:
# Trivial query: yields the current date as a single row and column
query = "SELECT CURRENT_DATE"

In [10]:
# Run `query` against PostgreSQL through the JDBC data source and load the
# result set as a DataFrame.
jdbcDF = (
    spark.read.format("jdbc")
    .options(
        driver=driver,
        url=url,
        user=pg_user,
        password=pg_pwd,
        query=query,
    )
    .load()
)

In [11]:
# Print the rows of the query result as a text table
jdbcDF.show()

+------------+
|current_date|
+------------+
|  2022-02-26|
+------------+



In [12]:
# Display the schema of the DataFrame loaded from the JDBC query
jdbcDF.schema

StructType(List(StructField(current_date,DateType,true)))

#### DataFrame as PostgreSQL Table

In [15]:
# Save the "current_date" column to the PostgreSQL table "my_table" over JDBC.
# Save mode "overwrite" replaces existing data; see
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode
(
    jdbcDF.select("current_date")
    .write.format("jdbc")
    .options(
        url=url,
        driver=driver,
        dbtable="my_table",
        user=pg_user,
        password=pg_pwd,
    )
    .mode("overwrite")
    .save()
)

In [16]:
# Shut down the Spark session now that the read and write are done
spark.stop()

#### Using Context Manager (with)

In [19]:
# Same read-then-write round trip, with the session used as a context manager:
# the session is stopped when the `with` block exits, so no explicit
# spark.stop() call is needed.
with (
    SparkSession.builder
    .master("local[*]")
    .appName("Postgres")
    .config("spark.jars", postgres_jdbc_driver)
    .getOrCreate()
) as spark:
    query = "SELECT CURRENT_DATE"

    # Load the query result from PostgreSQL as a DataFrame and print it
    jdbcDF = (
        spark.read.format("jdbc")
        .options(
            driver=driver,
            url=url,
            user=pg_user,
            password=pg_pwd,
            query=query,
        )
        .load()
    )
    jdbcDF.show()

    # Write the "current_date" column back to the table "my_table".
    # Save mode "overwrite" replaces existing data; see
    # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode
    (
        jdbcDF.select("current_date")
        .write.format("jdbc")
        .options(
            url=url,
            driver=driver,
            dbtable="my_table",
            user=pg_user,
            password=pg_pwd,
        )
        .mode("overwrite")
        .save()
    )
    print("Completed saving dataframe as Postgres table")

+------------+
|current_date|
+------------+
|  2022-02-26|
+------------+

Completed saving dataframe as Postgres table
