In [31]:
from pathlib import Path
import configparser
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Location of the INI file that is parsed by the configparser cell below.
config_file: str = "/home/pybokeh/config.ini"

In [3]:
# Parse the INI file referenced by `config_file`.
config = configparser.ConfigParser()

# ConfigParser.read() does NOT raise when the file is missing or unreadable:
# it silently skips it and returns the list of files it actually parsed.
# Check that list so a bad path fails here with a clear message instead of
# surfacing later as a KeyError on the first section lookup.
if not config.read(config_file):
    raise FileNotFoundError(f"config.ini file not found: {config_file}")

In [4]:
# Path stored under the 'jdbc_driver_path' key of the [postgresql] section;
# it is handed to Spark as "spark.jars" when the session is built.
postgres_jdbc_driver = Path(
    config['postgresql']['jdbc_driver_path']
)

In [5]:
# Pull the PostgreSQL connection settings (host, port, database name and
# login) out of the [postgresql] section of the parsed config.
pg_host, pg_port, pg_db, pg_user, pg_pwd = (
    config["postgresql"][option]
    for option in ("host", "port", "database", "username", "password")
)

In [6]:
# Driver class name passed as the JDBC "driver" option in the read/write cells.
driver = 'org.postgresql.Driver'
# JDBC connection URL assembled from the configured host, port and database.
url = f'jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}'

In [7]:
# Create (or reuse) a Spark session on a local[*] master, with the PostgreSQL
# JDBC driver supplied through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("Postgres")
    .master("local[*]")
    # The builder stores config values as strings; convert the Path explicitly.
    .config("spark.jars", str(postgres_jdbc_driver))
    .getOrCreate()
)

23/05/21 01:51:04 WARN Utils: Your hostname, pybokeh-Lemur resolves to a loopback address: 127.0.1.1; using 192.168.1.147 instead (on interface wlp2s0)
23/05/21 01:51:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/05/21 01:51:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [18]:
# SQL text selecting every column of public.us_counties_pop_est_2010_2019_raw.
query = (
    "SELECT * from public.us_counties_pop_est_2010_2019_raw"
)

In [25]:
# Load public.us_counties_pop_est_2010_2019_raw from PostgreSQL over JDBC into
# a Spark DataFrame.
us_counties_pop_est_2010_2019_raw = (
    spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("user", pg_user)
    .option("password", pg_pwd)
    # Reuse the `query` string defined in the previous cell instead of
    # repeating the same SQL literal, so the two cannot drift apart.
    .option("query", query)
    .load()
)

In [26]:
# Load the public.regions table from PostgreSQL over JDBC.
regions = (
    spark.read.format("jdbc")
    .options(
        url=url,
        driver=driver,
        user=pg_user,
        password=pg_pwd,
        query="SELECT * from public.regions",
    )
    .load()
)

In [27]:
# Load the public.divisions table from PostgreSQL over JDBC.
divisions = (
    spark.read.format("jdbc")
    .options(
        url=url,
        driver=driver,
        user=pg_user,
        password=pg_pwd,
        query="SELECT * from public.divisions",
    )
    .load()
)

In [53]:
# Display the first 20 column names of the raw county estimates DataFrame.
us_counties_pop_est_2010_2019_raw.columns[:20]

['sumlev',
 'region',
 'division',
 'state',
 'county',
 'stname',
 'ctyname',
 'census2010pop',
 'estimatesbase2010',
 'popestimate2010',
 'popestimate2011',
 'popestimate2012',
 'popestimate2013',
 'popestimate2014',
 'popestimate2015',
 'popestimate2016',
 'popestimate2017',
 'popestimate2018',
 'popestimate2019',
 'npopchg_2010']

In [37]:
# Preview the identifying columns: the four FIPS/name columns are renamed to
# more descriptive names, while region and division keep their names.
(
    us_counties_pop_est_2010_2019_raw
    .select(
        *(
            col(source).alias(target)
            for source, target in (
                ('state', 'state_fips'),
                ('county', 'county_fips'),
                ('stname', 'state_name'),
                ('ctyname', 'county_name'),
            )
        ),
        col('region'),
        col('division'),
    )
    .show()
)

+----------+-----------+----------+---------------+------+--------+
|state_fips|county_fips|state_name|    county_name|region|division|
+----------+-----------+----------+---------------+------+--------+
|        01|        000|   Alabama|        Alabama|     3|       6|
|        01|        001|   Alabama| Autauga County|     3|       6|
|        01|        003|   Alabama| Baldwin County|     3|       6|
|        01|        005|   Alabama| Barbour County|     3|       6|
|        01|        007|   Alabama|    Bibb County|     3|       6|
|        01|        009|   Alabama|  Blount County|     3|       6|
|        01|        011|   Alabama| Bullock County|     3|       6|
|        01|        013|   Alabama|  Butler County|     3|       6|
|        01|        015|   Alabama| Calhoun County|     3|       6|
|        01|        017|   Alabama|Chambers County|     3|       6|
|        01|        019|   Alabama|Cherokee County|     3|       6|
|        01|        021|   Alabama| Chilton Coun

In [29]:
# Print the rows of the regions DataFrame (columns: region, region_name).
regions.show()

+------+-----------+
|region|region_name|
+------+-----------+
|     1|  Northeast|
|     2|    Midwest|
|     3|      South|
|     4|       West|
+------+-----------+



In [30]:
# Print the rows of the divisions DataFrame (columns: division, division_name).
divisions.show()

+--------+------------------+
|division|     division_name|
+--------+------------------+
|       1|       New England|
|       2|   Middle Atlantic|
|       3|East North Central|
|       4|West North Central|
|       5|    South Atlantic|
|       6|East South Central|
|       7|West South Central|
|       8|           Montain|
|       9|           Pacific|
+--------+------------------+



#### Let's merge the 3 tables

In [57]:
# Preview the combined result: project/rename the identifying county columns,
# then left-join the region and division lookups to attach their names.
(
    us_counties_pop_est_2010_2019_raw
    .select(
        *(
            col(source).alias(target)
            for source, target in (
                ('state', 'state_fips'),
                ('county', 'county_fips'),
                ('stname', 'state_name'),
                ('ctyname', 'county_name'),
            )
        ),
        col('region'),
        col('division'),
    )
    .join(
        regions,
        on=us_counties_pop_est_2010_2019_raw['region'] == regions['region'],
        how='left',
    )
    .join(
        divisions,
        on=us_counties_pop_est_2010_2019_raw['division'] == divisions['division'],
        how='left',
    )
    # Each join brings in a second copy of its key column from the lookup
    # side; drop those so only the county-side region/division remain.
    .drop(regions['region'])
    .drop(divisions['division'])
    .show()
)

+----------+-----------+----------+---------------+------+--------+-----------+------------------+
|state_fips|county_fips|state_name|    county_name|region|division|region_name|     division_name|
+----------+-----------+----------+---------------+------+--------+-----------+------------------+
|        01|        000|   Alabama|        Alabama|     3|       6|      South|East South Central|
|        01|        001|   Alabama| Autauga County|     3|       6|      South|East South Central|
|        01|        003|   Alabama| Baldwin County|     3|       6|      South|East South Central|
|        01|        005|   Alabama| Barbour County|     3|       6|      South|East South Central|
|        01|        007|   Alabama|    Bibb County|     3|       6|      South|East South Central|
|        01|        009|   Alabama|  Blount County|     3|       6|      South|East South Central|
|        01|        011|   Alabama| Bullock County|     3|       6|      South|East South Central|
|        0

#### If the resulting dataframe above is what we want, we can then write this dataframe as a PostgreSQL table

In [61]:
# Build the same projected-and-joined DataFrame as the preview cell, then
# persist it to PostgreSQL as public.us_pop_shortened.
(
    us_counties_pop_est_2010_2019_raw
    .select(
        *(
            col(source).alias(target)
            for source, target in (
                ('state', 'state_fips'),
                ('county', 'county_fips'),
                ('stname', 'state_name'),
                ('ctyname', 'county_name'),
            )
        ),
        col('region'),
        col('division'),
    )
    .join(
        regions,
        on=us_counties_pop_est_2010_2019_raw['region'] == regions['region'],
        how='left',
    )
    .join(
        divisions,
        on=us_counties_pop_est_2010_2019_raw['division'] == divisions['division'],
        how='left',
    )
    # Remove the lookup-side copies of the join keys.
    .drop(regions['region'])
    .drop(divisions['division'])
    # Writer configuration: JDBC target table and credentials; save mode is
    # "overwrite".
    .write
    .mode("overwrite")
    .format("jdbc")
    .options(
        url=url,
        driver=driver,
        dbtable="public.us_pop_shortened",
        user=pg_user,
        password=pg_pwd,
    )
    .save()
)