## PostgreSQL Query as PySpark Dataframe and PySpark Dataframe as PostgreSQL Table

In [1]:
from pathlib import Path
import configparser
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Location of the INI file with the database settings, taken from the
# CONFIG_PATH environment variable (None when the variable is not set).
config_file = os.environ.get("CONFIG_PATH")

In [3]:
# Echo the value read from CONFIG_PATH (None would mean the variable is unset).
config_file

'C:\\Users\\danie\\.config\\config.ini'

Below is an example of what a config.ini file could contain. Note that `jdbc_driver_path` is joined onto your home directory, so it is given relative to it:

[postgresql]<br>
jdbc_driver_path=.jdbc\postgresql-42.7.5.jar<br>
host=your_host<br>
username=your_username<br>
password=your_password<br>
database=your_db_name<br>
port=5432

In [4]:
def get_postgresql_credentials(path: str) -> dict:
    """Load the PostgreSQL/JDBC connection settings from an INI file.

    Args:
        path: Location of the INI file. It must contain a ``[postgresql]``
            section with the keys ``host``, ``port``, ``database``,
            ``username``, ``password`` and ``jdbc_driver_path``.

    Returns:
        A dict with the keys ``host``, ``port`` (converted to ``int``),
        ``database``, ``username``, ``password``, ``driver`` (always
        ``"org.postgresql.Driver"``) and ``jdbc_driver_path`` (a ``Path``
        built by joining the configured value onto ``Path.home()``).

    Raises:
        FileNotFoundError: If ``path`` is empty/``None`` or the file could
            not be read.
        KeyError: If the ``[postgresql]`` section or one of its keys is
            missing.
        ValueError: If ``port`` is not an integer.
    """
    if not path:
        raise FileNotFoundError("config.ini file not found: no path was provided")

    config = configparser.ConfigParser()

    # ConfigParser.read() never raises for a missing file: it skips it and
    # returns the list of files it actually parsed, so an empty list is the
    # only signal that the config could not be loaded.
    if not config.read(path):
        raise FileNotFoundError(f"config.ini file not found: {path}")

    section = config["postgresql"]

    return {
        "host": section["host"],
        "port": int(section["port"]),
        "database": section["database"],
        "username": section["username"],
        "password": section["password"],
        "driver": "org.postgresql.Driver",
        "jdbc_driver_path": Path.home() / section["jdbc_driver_path"],
    }

In [5]:
# Parse the config file into the dict of connection settings used by the cells below.
credentials = get_postgresql_credentials(config_file)

In [6]:
# Configure a local Spark session ("local[*]" = one worker thread per core)
# and register the PostgreSQL JDBC driver jar through "spark.jars".
session_builder = SparkSession.builder.master("local[*]").appName("Postgres")
session_builder = session_builder.config("spark.jars", credentials["jdbc_driver_path"])

# Reuse an already running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

#### PostgreSQL Query as Dataframe

In [7]:
# SQL statement that is passed to the JDBC reader through its "query" option.
query = "SELECT CURRENT_DATE"

In [8]:
# Connection string for the configured PostgreSQL database.
jdbc_url = f'jdbc:postgresql://{credentials["host"]}:{credentials["port"]}/{credentials["database"]}'

# Hand all JDBC settings to the reader in one call and load the result of
# `query` as a DataFrame.
jdbcDF = spark.read.format("jdbc").options(
    driver=credentials["driver"],
    url=jdbc_url,
    user=credentials["username"],
    password=credentials["password"],
    query=query,
).load()

In [9]:
# Print the rows returned by the query as a text table.
jdbcDF.show()

+------------+
|current_date|
+------------+
|  2025-02-17|
+------------+



In [10]:
# Inspect the column names and types of the loaded DataFrame.
jdbcDF.schema

StructType([StructField('current_date', DateType(), True)])

#### Dataframe as PostgreSQL Table

In [11]:
# Connection string for the configured PostgreSQL database.
jdbc_url = f'jdbc:postgresql://{credentials["host"]}:{credentials["port"]}/{credentials["database"]}'

# Save mode reference:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode
table_writer = jdbcDF.select("current_date").write.format("jdbc").mode("overwrite")

# Write the selected column to the "my_table" table over JDBC.
table_writer.options(
    url=jdbc_url,
    driver=credentials["driver"],
    dbtable="my_table",
    user=credentials["username"],
    password=credentials["password"],
).save()

In [12]:
# Shut down the Spark session now that the read and the write are done.
spark.stop()

#### Using Context Manager (with)

In [13]:
# Same read-then-write round trip as above, but with the session used as a
# context manager: leaving the `with` block stops the session, so no explicit
# spark.stop() call is needed.
session_builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("Postgres")
    .config("spark.jars", credentials["jdbc_driver_path"])
)

with session_builder.getOrCreate() as spark:
    # Connection settings shared by the read and the write below.
    connection_options = {
        "url": f'jdbc:postgresql://{credentials["host"]}:{credentials["port"]}/{credentials["database"]}',
        "driver": credentials["driver"],
        "user": credentials["username"],
        "password": credentials["password"],
    }

    # Read: load the result of the SQL query as a DataFrame and print it.
    query = "SELECT CURRENT_DATE"
    jdbc_reader = spark.read.format("jdbc").options(**connection_options)
    jdbcDF = jdbc_reader.option("query", query).load()
    jdbcDF.show()

    # Write: save the selected column to the "my_table" table.
    # Save mode reference:
    # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode
    table_writer = jdbcDF.select("current_date").write.format("jdbc")
    table_writer = table_writer.options(**connection_options).option("dbtable", "my_table")
    table_writer.mode("overwrite").save()

    print("Completed saving dataframe as Postgres table")

+------------+
|current_date|
+------------+
|  2025-02-17|
+------------+

Completed saving dataframe as Postgres table
