In [45]:
from pyspark.sql import SparkSession

# Import PySpark Resources
from pyspark.sql.functions import *
from pyspark.sql.types import *

# My Imports
import os

# Build (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver jar is registered via 'spark.jars' so the JDBC cells can load
# org.postgresql.Driver.
session_builder = SparkSession.builder.appName("Postgres data loading")
session_builder = session_builder.config('spark.jars', 'driver/postgresql-42.6.0.jar')
spark = session_builder.getOrCreate()


In [46]:
# Path of the JSON export to load, relative to the current working directory.
file = 'data/new/20230331_employees_details.json'

# Print the working directory and the path, one per line, so the relative
# lookup can be checked by eye.
print(os.getcwd(), file, sep='\n')

/Users/Marcusso/Documents/git/my_airflow_project/pyspark
data/new/20230331_employees_details.json


In [47]:
# Load the employee JSON export into a Spark DataFrame with the 'multiline'
# reader option enabled.
# NOTE(review): 'inferSchema' is documented as a CSV reader option; the JSON
# reader infers a schema whenever none is supplied -- confirm it is needed.
json_reader = spark.read.option('inferSchema', True)
json_reader = json_reader.option('multiline', 'true')
employees_raw = json_reader.json(file)

# Preview the raw (still nested) records.
employees_raw.show()

+--------------------+----------+---------+---------+--------------------+
|          attributes|        id|isDeleted|     name|        salaryValues|
+--------------------+----------+---------+---------+--------------------+
|{2023-02-15T15:09...|abd1234rty|    false|Bob Smith|[{USD, Base, 5676...|
+--------------------+----------+---------+---------+--------------------+



In [33]:

# Flatten the nested employee records into one wide row per employee.
#
# Columns are classified by the *prefix* of their Spark dtype string. A prefix
# test is used instead of a substring test so that a struct which merely
# contains an array field (e.g. "struct<tags:array<string>>"), or a nested field
# whose name contains "array"/"struct", is not mistaken for a top-level array.
other_types = [column for column, datatype in employees_raw.dtypes
               if not datatype.startswith(('struct', 'array'))]
struct_types = [f'{column}.*' for column, datatype in employees_raw.dtypes
                if datatype.startswith('struct')]
array_types = [column for column, datatype in employees_raw.dtypes
               if datatype.startswith('array')]

# Columns separating structs and array types: plain columns first, then the
# struct columns expanded to their fields, then the still-nested arrays.
new = other_types + struct_types + array_types

second_change = employees_raw.select(new)

# Turn every array column into one row per element, then promote the fields of
# each element to top-level columns.
# NOTE(review): explode() produces no row for a null or empty array, so such
# employees disappear here -- confirm whether explode_outer() is wanted.
# NOTE(review): the `<col>.*` expansion assumes the elements are structs.
for i in array_types:
    second_change = (second_change
                     .select('*', explode(i).alias(f'{i}_ex'))
                     .drop(i)
                     .withColumnRenamed(f'{i}_ex', i))
    second_change = second_change.select('*', f'{i}.*').drop(i)

# Pivot the salary entries so each salary `type` becomes its own column holding
# the summed `value`, per employee and currency.
# The trailing drop()/dropDuplicates() look redundant after the pivot; they are
# kept to preserve the existing behaviour -- TODO confirm and remove.
salary = (second_change
          .select(col('id').alias('id_emp'), 'currency', 'type', 'value')
          .groupBy('id_emp', 'currency')
          .pivot('type')
          .sum('value')
          .drop('value', 'type')
          .dropDuplicates())

# Attach the pivoted salary columns to the employee attributes. The exploded
# frame still has one row per salary entry; once the per-entry columns are
# dropped those rows are identical and dropDuplicates() collapses them.
# NOTE(review): `salary` is grouped by currency but joined on the id only and
# `currency` is dropped afterwards, so an employee paid in several currencies
# yields several rows that cannot be told apart -- confirm the intent.
last_change = (second_change.alias('emp')
               .join(salary.alias('sal'),
                     on=col('emp.id') == col('sal.id_emp'),
                     how='inner')
               .drop('id_emp', 'currency', 'type', 'value')
               .dropDuplicates())
last_change.show()

+----------+---------+---------+--------------------+--------+----------------+-----+-----+
|        id|isDeleted|     name|            joinedOn|position|satisfactionScoe| Base|Bonus|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+



In [42]:
# PostgreSQL connection settings. Every value can be overridden through an
# environment variable so that real credentials do not have to be stored in the
# notebook; the fallbacks are the previous local-development values.
host = os.environ.get("POSTGRES_HOST", "localhost")
port = os.environ.get("POSTGRES_PORT", "5432")
database = os.environ.get("POSTGRES_DB", "ensurwave")
username = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")

# JDBC connection string assembled from the settings above.
url = f"jdbc:postgresql://{host}:{port}/{database}"

# The same credentials as a mapping.
properties = {
    "user": username,
    "password": password
}

# Write the data back to PostgreSQL (table `ludmila3`) over JDBC.
# NOTE(review): mode('append') adds the rows again each time this cell runs;
# the read-back of `ludmila3` further down shows the same record six times.
# Confirm whether re-runs are meant to be idempotent.
employee_writer = last_change.write.format('jdbc').mode('append')
employee_writer = (employee_writer
                   .option('driver', 'org.postgresql.Driver')
                   .option('url', url)
                   .option('user', username)
                   .option('password', password)
                   .option('dbtable', 'ludmila3'))
employee_writer.save()

# Stop the Spark session
# spark.stop()


In [44]:
# Read the `ludmila3` table back over JDBC to inspect what is stored there.
jdbc_reader = spark.read.format('jdbc')
jdbc_reader = (jdbc_reader
               .option('driver', 'org.postgresql.Driver')
               .option('url', url)
               .option('user', username)
               .option('password', password)
               .option('dbtable', 'ludmila3'))
aaa = jdbc_reader.load()

# Show the rows, then leave the row count as the cell's result.
aaa.show()
aaa.count()

+----------+---------+---------+--------------------+--------+----------------+-----+-----+
|        id|isDeleted|     name|            joinedOn|position|satisfactionScoe| Base|Bonus|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+



6

---

In [48]:
# Re-read the source file and keep the string form of the schema Spark inferred
# for it.
inferred_schema = (spark.read
                   .option('multiline', 'true')
                   .option('inferSchema', True)
                   .json(file)
                   .schema)
schema_raw_data = str(inferred_schema)

In [49]:

# Build a one-row DataFrame holding the schema string and its version number,
# used to record the history of schemas.
data = [dict(schema=schema_raw_data, version=1)]
current_schema = spark.createDataFrame(data)

In [51]:

# PostgreSQL connection settings for the schema-history write. Every value can
# be overridden through an environment variable so that real credentials do not
# have to be stored in the notebook; the fallbacks are the previous
# local-development values.
host = os.environ.get("POSTGRES_HOST", "localhost")
port = os.environ.get("POSTGRES_PORT", "5432")
database = os.environ.get("POSTGRES_DB", "ensurwave")
username = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")

# JDBC connection string assembled from the settings above.
url = f"jdbc:postgresql://{host}:{port}/{database}"

# Preview the schema-record DataFrame (columns `schema` and `version`).
current_schema.show()

                                                                                

+--------------------+-------+
|              schema|version|
+--------------------+-------+
|StructType([Struc...|      1|
+--------------------+-------+



In [52]:

# Persist the schema record to the `schema_history` table over JDBC.
# NOTE(review): mode('overwrite') replaces the existing table contents on each
# run, so earlier records are not retained -- confirm this is intended for a
# table meant to hold a history.
schema_writer = current_schema.write.format('jdbc').mode('overwrite')
schema_writer = (schema_writer
                 .option('driver', 'org.postgresql.Driver')
                 .option('url', url)
                 .option('user', username)
                 .option('password', password)
                 .option('dbtable', 'schema_history'))
schema_writer.save()