# # Read Using PySpark

In [1]:
from pyspark.sql import SparkSession
import os
from loguru import logger
from pyspark.sql.functions import count
import re

# Build (or reuse) the Spark session. spark.jars.packages lists the Maven
# coordinates of the PostgreSQL JDBC driver so it is available to the session.
spark = (
    SparkSession.builder
    .appName("Load-CSV-Pg")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .getOrCreate()
)

# JDBC endpoint of the PostgreSQL database queried throughout the notebook.
jdbc_url = "jdbc:postgresql://localhost:5432/postgres"

# Connection properties passed to the spark.read.jdbc(...) calls.
# Credentials come from the environment (PGUSER / PGPASSWORD) so real secrets
# never have to be committed with the notebook; the fallbacks keep the
# original local-development defaults when the variables are not set.
properties = {
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}

In [3]:
# Load the myschema.products table over JDBC and preview a single row.
df = spark.read.jdbc(jdbc_url, "myschema.products", properties=properties)
df.show(1)

+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+
| Index|                Name|         Description|          Brand|Category|Price|Currency|Stock|          EAN| Color|      Size|Availability|Internal ID|
+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+
|769551|Advanced Scale He...|Specific simple r...|Barton-Arellano| Cycling|  926|     USD|  648|1571405406951|Tomato|100x200 mm|    in_stock|         33|
+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+
only showing top 1 row



#### Using `.format("jdbc")` with a Partitioned (Parallel) Read (Enterprise Method)

In [5]:
# Partitioned JDBC read of myschema.products.
# - Connection settings (user / password / driver) are taken from the shared
#   `properties` dict instead of being repeated here.
# - partitionColumn must be a numeric, date, or timestamp column.
# - lowerBound / upperBound only determine how the partition ranges are cut;
#   they do not filter rows, so values outside the bounds are still read.
df = (
    spark.read.format("jdbc")
    .options(**properties)
    .options(
        url=jdbc_url,
        dbtable="myschema.products",
        partitionColumn="Index",
        lowerBound=1,
        upperBound=1000000,
        numPartitions=8,
    )
    .load()
)
df.show(1)

+-----+--------------------+--------------------+-------------+---------+-----+--------+-----+-------------+-----+-----------+------------+-----------+
|Index|                Name|         Description|        Brand| Category|Price|Currency|Stock|          EAN|Color|       Size|Availability|Internal ID|
+-----+--------------------+--------------------+-------------+---------+-----+--------+-----+-------------+-----+-----------+------------+-----------+
|86152|Fast Lock Freezer...|Free position we ...|Avery-Barnett|Furniture|  938|     USD|  516|8429788643315| Navy|Extra Large|   backorder|         16|
+-----+--------------------+--------------------+-------------+---------+-----+--------+-----+-------------+-----+-----------+------------+-----------+
only showing top 1 row



#### Using SQL Query Instead of Table

In [6]:
# A parenthesised, aliased subquery is passed in place of a table name,
# so only the rows selected by the query are loaded.
query = "(SELECT * FROM myschema.products limit 1) AS tmp"

df = spark.read.jdbc(jdbc_url, query, properties=properties)

df.show(1)

+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+
| Index|                Name|         Description|          Brand|Category|Price|Currency|Stock|          EAN| Color|      Size|Availability|Internal ID|
+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+
|769551|Advanced Scale He...|Specific simple r...|Barton-Arellano| Cycling|  926|     USD|  648|1571405406951|Tomato|100x200 mm|    in_stock|         33|
+------+--------------------+--------------------+---------------+--------+-----+--------+-----+-------------+------+----------+------------+-----------+



# # List Tables in PostgreSQL Using PySpark

In [29]:
# Subquery over information_schema.tables that returns the name of every
# table in the 'myschema' schema; it is aliased so it can be used as a table.
query = """
(
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'myschema'
) AS tbl
"""

df = spark.read.jdbc(jdbc_url, query, properties=properties)
df.show()

+----------------+
|      table_name|
+----------------+
|        products|
|     products_1m|
|  salesrecords1m|
|flight_data_2010|
|flight_data_2015|
+----------------+



In [30]:
# Collect the table-listing rows and flatten them into a plain list of names.
tables = [row["table_name"] for row in df.collect()]
print(tables)

['products', 'products_1m', 'salesrecords1m', 'flight_data_2010', 'flight_data_2015']


In [None]:
# Read every table named in `df` (the listing with a `table_name` column) and
# print its row count followed by a two-row preview.
# The per-table frame is bound to `table_df` rather than `df`: rebinding `df`
# inside the loop would replace the listing, so re-running this cell (or any
# cell that reads `table_name` from `df`) would fail.
for row in df.collect():
    table_name = row["table_name"]

    table_df = spark.read.jdbc(
        url=jdbc_url,
        table=f"myschema.{table_name}",
        properties=properties,
    )

    print(f"Read Table: {table_name}")
    print(f"Total Records: {table_df.count()}")
    table_df.show(2)


Read Table: products
Total Records: 1000000
+------+--------------------+--------------------+---------------+---------------+-----+--------+-----+-------------+------+----------+------------+-----------+
| Index|                Name|         Description|          Brand|       Category|Price|Currency|Stock|          EAN| Color|      Size|Availability|Internal ID|
+------+--------------------+--------------------+---------------+---------------+-----+--------+-----+-------------+------+----------+------------+-----------+
|769551|Advanced Scale He...|Specific simple r...|Barton-Arellano|        Cycling|  926|     USD|  648|1571405406951|Tomato|100x200 mm|    in_stock|         33|
|769552|   Mouse Radio Light|   Brother see page.|     Massey LLC|Office Supplies|  590|     USD|  851|9832739851667|   Red|     Small|    in_stock|         12|
+------+--------------------+--------------------+---------------+---------------+-----+--------+-----+-------------+------+----------+------------+---

# # How PostgreSQL Handles Schemas:

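1. Set search_path (PostgreSQL looks up unqualified table names in the schemas listed in `search_path`, so after this statement `products` resolves to `myschema.products` for the current session):
- `SET search_path TO myschema;`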

# # Show Tables in PostgreSQL:
1. Using `information_schema.tables`:
    ```
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'myschema';
    ```
2. Using `pg_catalog.pg_tables`:
    ```
    SELECT tablename
    FROM pg_catalog.pg_tables
    WHERE schemaname = 'myschema';
    ```
3. List Tables From All Schemas:
    ```
    SELECT table_schema, table_name
    FROM information_schema.tables
    WHERE table_type = 'BASE TABLE'
    ORDER BY table_schema, table_name;
    ```

# # To See All Schemas:
`SELECT schema_name FROM information_schema.schemata;`
