In [50]:
# Import Pyself. Resources
from pyspark.sql.functions import *

# My Imports
import os
import shutil
from datetime import datetime

# PySpark Imports
from pyspark.sql import SparkSession

class MyFunctions(object):

    def __init__(self):
        self.spark = (SparkSession.builder
                      .config('spark.jars', 'driver/postgresql-42.6.0.jar')
                      .config('spark.driver.extraClassPath', 'driver/postgresql-42.6.0.jar')
                      .appName("MyProject").getOrCreate())

        self.host = "localhost"
        self.port = "5432"
        self.database = "ensurwave"
        self.username = "postgres"
        self.password = "postgres"
        self.url = f"jdbc:postgresql://{self.host}:{self.port}/{self.database}"

In [62]:
# load_date=datetime.now().strptime(datetime.now().strftime("%Y%m%d"),"%Y%m%d")
load_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
load_date


'2023-04-13 12:50:33'

In [59]:
os.getcwd()

'/Users/Marcusso/Documents/git/my_airflow_project/include'

In [53]:
mf = MyFunctions()

In [54]:
# Create a PySpark dataframe
employees_raw = mf.spark.read.option('inferSchema', True).option(
    'multiline', 'true').json('data/new/20230331_employees_details.json')

employees_raw.show()

+--------------------+----------+---------+---------+--------------------+
|          attributes|        id|isDeleted|     name|        salaryValues|
+--------------------+----------+---------+---------+--------------------+
|{2023-02-15T15:09...|abd1234rty|    false|Bob Smith|[{USD, Base, 5676...|
+--------------------+----------+---------+---------+--------------------+



In [72]:

other_types = [column for column, datatype in employees_raw.dtypes if 'struct' not in datatype and 'array' not in datatype]
struct_types = [f'{column}.*' for column, datatype in employees_raw.dtypes if 'struct' in datatype and 'array' not in datatype]
array_types = [column for column, datatype in employees_raw.dtypes if 'array' in datatype]

# Columns separating structs and array types
new = other_types + struct_types + array_types

second_change = employees_raw.select(new)

for i in array_types:
    second_change = second_change.select('*',explode(i).alias(f'{i}_ex')).drop(i).withColumnRenamed(f'{i}_ex',i)
    second_change = second_change.select('*',f'{i}.*').drop(i)

salary = second_change.select(col('id').alias('id_emp'),'currency','type','value').groupBy('id_emp','currency').pivot('type').sum('value').drop('value','type').dropDuplicates()

last_change = second_change.alias('emp').join(salary.alias('sal'), on=col('emp.id')==col('sal.id_emp'), how='inner').drop('id_emp','currency','type','value').dropDuplicates()
last_change = last_change.withColumn('loaddate', to_timestamp(lit(datetime.now().strftime("%Y%m%d %H:%M:%S")),'yyyyMMdd H:m:s'))
last_change.show()
last_change.printSchema

+----------+---------+---------+--------------------+--------+----------------+-----+-----+-------------------+
|        id|isDeleted|     name|            joinedOn|position|satisfactionScoe| Base|Bonus|           loaddate|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+-------------------+
|abd1234rty|    false|Bob Smith|2023-02-15T15:09:...| Manager|            10.5|56767| 5000|2023-04-13 12:52:46|
+----------+---------+---------+--------------------+--------+----------------+-----+-----+-------------------+



<bound method DataFrame.printSchema of DataFrame[id: string, isDeleted: boolean, name: string, joinedOn: string, position: string, satisfactionScoe: double, Base: bigint, Bonus: bigint, loaddate: timestamp]>

In [13]:
# (last_change
#     .write
#     .format('jdbc')
#     .option('url', mf.url)
#     .mode('overwrite')
#     .option('dbtable', 'employee')
#     .option('user', mf.username)
#     .option('password', mf.password)
#     .option('driver', 'org.postgresql.Driver')
#     .save())

# Pare a sessão Spark
# spark.stop()


In [17]:
(mf.spark.read.format('jdbc').option('url', mf.url)
    .option('dbtable', 'schema_version')
    .option('user', mf.username)
    .option('password', mf.password)
    .option('driver', 'org.postgresql.Driver').load().show())

+--------------------+-------+
|              schema|version|
+--------------------+-------+
|StructType([Struc...|      1|
+--------------------+-------+

