In [95]:
# Import Pyself. Resources
from pyspark.sql.functions import *

# My Imports
import os
import shutil
from datetime import datetime
import hashlib

# PySpark Imports
from pyspark.sql import SparkSession

class MyFunctions(object):

    def __init__(self):
        self.spark = (SparkSession.builder
                      .config('spark.jars', 'driver/postgresql-42.6.0.jar')
                      .config('spark.driver.extraClassPath', 'driver/postgresql-42.6.0.jar')
                      .appName("MyProject").getOrCreate())

        self.host = "localhost"
        self.port = "5432"
        self.database = "ensurwave"
        self.username = "postgres"
        self.password = "postgres"
        self.url = f"jdbc:postgresql://{self.host}:{self.port}/{self.database}"

        self.file = 'data/new/20230331_employees_details.json'

In [97]:
mf = MyFunctions()

In [99]:
# Getting the schema
schema_raw_data = str(mf.spark.read.option('inferSchema',True).option('multiline','true').json(mf.file).schema)

current_no_space = schema_raw_data.replace(' ','')
current_no_space = [i for i in current_no_space]
current_no_space.sort()
current_no_space = ''.join(current_no_space)

current_hash_no_space = hashlib.sha256(current_no_space.encode('utf-8'))
current_hash_no_space = current_hash_no_space.hexdigest()

# Mounting the dataframe to save the history of schemas used.
data = [{'id' : 1, 'schema' : schema_raw_data, 'hash' : current_hash_no_space, 'version' : 1}]
current_schema = mf.spark.createDataFrame(data)
current_schema = current_schema.select('id','schema','hash','version').withColumn('load_timestamp', to_timestamp(lit(datetime.now().strftime("%Y%m%d %H:%M:%S")),'yyyyMMdd H:m:s'))
current_schema.show(truncate=False)

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+-------+-------------------+
|id |schema                                                                                                                                                                                                                                                                                                                                                                                

In [102]:
# Get Current table in 
try:
    table_history = (mf.spark.read.format('jdbc').option('url', mf.url)
                        .option('dbtable', 'schema_history')
                        .option('user', mf.username)
                        .option('password', mf.password)
                        .option('driver', 'org.postgresql.Driver').load())

    max_value = table_history.select(max(col('id')).alias('max_id'), max(col('version')).alias('max_version')).collect()[0]
    max_id, max_version = max_value
    new_id, new_version = max_id + 1, max_version + 1

    new_value = [{'id':new_id, 'schema':schema_raw_data, 'hash' : current_hash_no_space, 'version':new_version}]

    df_new_value = mf.spark.createDataFrame(new_value)
    df_new_value = df_new_value.select('id','schema','hash','version').withColumn('load_timestamp', to_timestamp(lit(datetime.now().strftime("%Y%m%d %H:%M:%S")),'yyyyMMdd H:m:s'))

    old_hash_schema = table_history.filter(f"id == ({max_id})").select('hash').collect()[0][0]
    
    if old_hash_schema != current_hash_no_space:
        df_persist = table_history.unionAll(df_new_value)

        df_persist.write.mode('overwrite').saveAsTable('df_persist')

        tb_persist = mf.spark.table('df_persist').orderBy(col('id'))

        (tb_persist
            .write
            .mode('overwrite')
            .format('jdbc')
            .option('url', mf.url)
            .option('dbtable', 'schema_history')
            .option('user', mf.username)
            .option('password', mf.password)
            .option('driver', 'org.postgresql.Driver')
            .save())
        shutil.rmtree(f'{os.getcwd()}/spark-warehouse')
# Creating Table
except:
    print('Nova Tabela')
    (current_schema
        .write
        .format('jdbc')
        .option('url', mf.url)
        .mode('overwrite')
        .option('dbtable', 'schema_history')
        .option('user', mf.username)
        .option('password', mf.password)
        .option('driver', 'org.postgresql.Driver')
        .save())

In [103]:
current = (mf.spark.read
    .format('jdbc')
    .option('url', mf.url)
    .option('dbtable', 'schema_history')
    .option('user', mf.username)
    .option('password', mf.password)
    .option('driver', 'org.postgresql.Driver')
    .load())
current.show(truncate=False)

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+-------+-------------------+
|id |schema                                                                                                                                                                                                                                                                                                                                                                                

In [166]:

other_types = [column for column, datatype in employees_raw.dtypes if 'struct' not in datatype and 'array' not in datatype]
struct_types = [f'{column}.*' for column, datatype in employees_raw.dtypes if 'struct' in datatype and 'array' not in datatype]
array_types = [column for column, datatype in employees_raw.dtypes if 'array' in datatype]

# Columns separating structs and array types
new = other_types + struct_types + array_types

second_change = employees_raw.select(new)

for i in array_types:
    second_change = second_change.select('*',explode(i).alias(f'{i}_ex')).drop(i).withColumnRenamed(f'{i}_ex',i)
    second_change = second_change.select('*',f'{i}.*').drop(i)

salary = second_change.select(col('id').alias('id_emp'),'currency','type','value').groupBy('id_emp','currency').pivot('type').sum('value').drop('value','type').dropDuplicates()

last_change = second_change.alias('emp').join(salary.alias('sal'), on=col('emp.id')==col('sal.id_emp'), how='inner').drop('id_emp','currency','type','value').dropDuplicates()
last_change = last_change.withColumn('loaddate', to_timestamp(lit(datetime.now().strftime("%Y%m%d %H:%M:%S")),'yyyyMMdd H:m:s'))
last_change.show()
last_change.printSchema

NameError: name 'employees_raw' is not defined

In [None]:
# (last_change
#     .write
#     .format('jdbc')
#     .option('url', mf.url)
#     .mode('overwrite')
#     .option('dbtable', 'employee')
#     .option('user', mf.username)
#     .option('password', mf.password)
#     .option('driver', 'org.postgresql.Driver')
#     .save())

# Pare a sessão Spark
# spark.stop()


In [None]:
(mf.spark.read.format('jdbc').option('url', mf.url)
    .option('dbtable', 'schema_version')
    .option('user', mf.username)
    .option('password', mf.password)
    .option('driver', 'org.postgresql.Driver').load().show())

+--------------------+-------+
|              schema|version|
+--------------------+-------+
|StructType([Struc...|      1|
+--------------------+-------+

