In [1]:
from importlib import reload

from pyspark.sql.functions import col
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W

from hhop import get_spark_builder
# Build (or reuse) the Spark session through the hhop builder and keep only
# ERROR-level messages from the Spark context.
spark_builder = get_spark_builder('custom_name')
spark = spark_builder.getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

from hhop import DFExtender, SchemaManager, TablePartitionDescriber, SCD2Helper #main classes
from hhop import read_table, write_table, write_read_table, union_all, deduplicate_df, get_table_location # useful functions
from hhop import HhopException
# Display the session object to confirm Spark started.
display(spark)

23/07/18 23:56:24 WARN Utils: Your hostname, Pavels-MacBook-Air.local resolves to a loopback address: 127.0.0.200; using 192.168.0.103 instead (on interface en0)
23/07/18 23:56:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/07/18 23:56:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## to_scd2

In [2]:
# Load the sample transaction log: a ';'-separated file with a header row.
df1_transactions = spark.read.csv(
    '../hhop/scd2_data/to_scd2.csv',
    sep=';',
    header=True,
)

In [11]:
# Rename pk1 -> PK1, overwriting df1_transactions.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, and the later .show() outputs still list the column as pk1 --
# confirm whether this rename belongs in a fresh top-to-bottom run.
df1_transactions = df1_transactions.withColumnRenamed('pk1', 'PK1')

In [12]:
# Check the schema after the rename: the first column should be listed as PK1.
df1_transactions.printSchema()

root
 |-- PK1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- nonpk1: string (nullable = true)
 |-- nonpk2: string (nullable = true)
 |-- nonpk3: string (nullable = true)
 |-- nonpk_extra: string (nullable = true)
 |-- ts: string (nullable = true)



In [35]:
class DFCleaner:
    """WIP
    Helps with comparing and raising exceptions on columns of DFs

    Wraps a DataFrame together with named groups of columns and keeps a
    lower-cased copy of the DataFrame's column names in ``df_columns``.

    Attributes:
        df: the wrapped DataFrame (replaced by the renaming/standardizing methods).
        group_cols: keyword arguments given to the constructor, stored as passed
            (e.g. ``{'pk': ['pk1', 'pk2']}``).
        df_columns: lower-cased names of ``df.columns``.
    """

    def __init__(self, df, **group_cols):
        self.df = df
        self.group_cols = group_cols
        self.df_columns = self.lower_list(self.df.columns)
        # Defined up front so rename_back() is a harmless no-op when
        # rename_to() has not been called yet.
        self._old_new_mapping = {}

    def lower_list(self, l):
        """Return a new list with every string of ``l`` lower-cased."""
        lower_list = [str.lower(elem) for elem in l]
        return lower_list

    def lower_columns(self, df):
        """Return ``df`` with all column names lower-cased.

        Columns are renamed positionally with ``toDF`` so that names differing
        only by case are handled independently of each other.
        """
        return df.toDF(*self.lower_list(df.columns))

    def _refresh_columns(self):
        """Re-derive ``df_columns`` from the current ``self.df``."""
        self.df_columns = self.lower_list(self.df.columns)

    def standardize_df(self, df):
        """Lower-case the column names of ``df``, store it as ``self.df`` and return it."""
        self.df = self.lower_columns(df)
        self._refresh_columns()
        # TODO: check on duplicates
        return self.df

    def remember_cols(self):
        """Save the current ``df_columns`` so it can be read back with restore_cols()."""
        self.cols_snapshot = self.df_columns

    def restore_cols(self):
        """Return the column names saved by the last remember_cols() call."""
        return self.cols_snapshot

    def mass_rename(self, suffix, is_append_suffix, group_cols_include=None, group_cols_exclude=None):
        """Not implemented yet (WIP); the intended steps are sketched below."""
        pass
        # for all (or group_cols_include)
        # if not group_cols_exclude
        # withColumnRenamed

        # re.sub(colname, r'suffix$', '')

    def tech_cols_something(self):
        """Not implemented yet (WIP)."""
        pass

    def rename_to(self, old_new_mapping: dict):
        """Rename columns of ``self.df`` following ``{old_name: new_name}``.

        The mapping is remembered so that rename_back() can undo it.
        """
        # Keep a private copy so later changes to the caller's dict cannot
        # break rename_back().
        self._old_new_mapping = dict(old_new_mapping)
        for old, new in self._old_new_mapping.items():
            self.df = self.df.withColumnRenamed(old, new)
        self._refresh_columns()

    def rename_back(self):
        """Undo the last rename_to(); does nothing if rename_to() was never called."""
        # Undo in reverse order so chained mappings such as
        # {'a': 'b', 'b': 'c'} are unwound correctly (c -> b, then b -> a).
        for old, new in reversed(list(self._old_new_mapping.items())):
            self.df = self.df.withColumnRenamed(new, old)
        self._refresh_columns()


class DFValidator:
    """To validate 2 dataframes from class DFCleaner.

    Only compare_iterables() is implemented so far; the rest is WIP.
    """

    def __init__(self) -> None:
        pass

    @classmethod
    def compare_group_cols(cls, df1: "DFCleaner", df2: "DFCleaner", group_cols: list):
        """Not implemented yet (WIP)."""
        pass

    @staticmethod
    def compare_iterables(it1, it2, raise_exception=False):
        """Compare two iterables as sets, in both directions.

        Args:
            it1: first iterable of hashable items.
            it2: second iterable of hashable items.
            raise_exception: when True, raise instead of returning if the two
                sets differ.

        Returns:
            Tuple ``(only_in_first, only_in_second)`` of sets; both are empty
            when the iterables contain the same distinct items.

        Raises:
            ValueError: if ``raise_exception`` is True and the sets differ.
        """
        it1, it2 = set(it1), set(it2)
        only_in_first = it1 - it2
        only_in_second = it2 - it1
        if raise_exception and (only_in_first or only_in_second):
            # NOTE(review): swap in the project's own exception type here if
            # callers are expected to catch that instead.
            raise ValueError(
                "Iterables differ: "
                f"only in first: {sorted(only_in_first, key=str)}, "
                f"only in second: {sorted(only_in_second, key=str)}"
            )
        return only_in_first, only_in_second



# Quick manual check of DFCleaner: group_cols keeps the keyword arguments
# exactly as passed (mixed case included), while df_columns holds the
# lower-cased DataFrame column names.
c = DFCleaner(df1_transactions, pk=['pK1', 'pk2'])
print(c.group_cols)
print(c.df_columns)

{'pk': ['pK1', 'pk2']}
['pk1', 'pk2', 'nonpk1', 'nonpk2', 'nonpk3', 'nonpk_extra', 'ts']


In [3]:
# Show the raw transaction log ordered by key and timestamp (up to 100 rows, untruncated).
df1_transactions.orderBy(['pk1', 'pk2', 'ts']).show(100, False)

+---+----+------+------+------+-----------+-------------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |
+---+----+------+------+------+-----------+-------------------+
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|
|v1 |null|null  |null  |null  |null       |2023-05-10 15:00:00|
|v1 |null|null  |fds   |null  |null       |2023-05-11 15:00:00|
|v1 |null|null  |fds   |asdf  |null       |2023-05-12 15:00:00|
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|
|v1 |c1  |a1    |b1    |c1    |r2         |2023-05-01 12:00:00|
|v1 |c1  |a1    |b1    |c1    |r3         |2023-05-02 12:00:00|
|v1 |c1  |a1    |b1    |c2    |null       |2023-05-03 12:00:00|
|v1 |c1  |a1    |b2    |c2    |null       |2023-05-03 15:00:00|
|v1 |c1  |null  |b2    |c2    |r3         |2023-05-05 15:00:00|
|v1 |c1  |null  |b2    |c2    |r3         |2023-05-06 15:00:00|
|v1 |c1  |null  |null  |c2    |r3         |2023-05-07 15:00:00|
|v1 |c1  |null  |null  |null  |null     

In [4]:
# Wrap the transaction log in SCD2Helper, passing the key columns (pk), the
# attribute columns (non_pk) and the timestamp column (time_col).
# Note that nonpk_extra is not listed in non_pk.
df1_transactions_s = SCD2Helper(
    df1_transactions, 
    pk=['pk1', 'pk2'], 
    non_pk=['nonpk1', 'nonpk2', 'nonpk3'],
    time_col='ts',
)

In [5]:
# Convert the transaction log to SCD2 form and cache it: df1_scd2 is reused by
# several later cells.
df1_scd2 = df1_transactions_s.df_to_scd2().cache()

In [6]:
# Inspect the SCD2 result; compared to the input it has the extra columns
# row_hash, row_actual_from and row_actual_to.
df1_scd2.orderBy(['pk1', 'pk2', 'ts']).show(100, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|56e6807f4b745e20dffeb1b731e5a6d4|2023-05-07     |2023-05-09   |
|v1 |null|null  |null  |null  |null       |2023-05-10 15:00:00|6654c734ccab8f440ff0825eb443dc7f|2023-05-10     |2023-05-10   |
|v1 |null|null  |fds   |null  |null       |2023-05-11 15:00:00|2d2722576095dd7996570b307d777539|2023-05-11     |2023-05-11   |
|v1 |null|null  |fds   |asdf  |null       |2023-05-12 15:00:00|b08363345cd7c1cb14e6f4747ce1563d|2023-05-12     |9999-12-31   |
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|93e6cc4b8b0445cf261e9417106ae6f0|2023-05-01     

## validate_scd2

In [7]:
# Check the column types of the SCD2 table (every column, dates included, is a string).
df1_scd2.printSchema()

root
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- nonpk1: string (nullable = true)
 |-- nonpk2: string (nullable = true)
 |-- nonpk3: string (nullable = true)
 |-- nonpk_extra: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- row_hash: string (nullable = false)
 |-- row_actual_from: string (nullable = true)
 |-- row_actual_to: string (nullable = false)



In [8]:
# validate_scd2() prints a short report and returns an Errors_In_SCD2_table
# record. Judging by its printed repr, res[0] is duplicates_by_pk and res[3]
# is duplicates_by_version.
res = df1_scd2.validate_scd2()
print(res, res[0], res[3])

Number of records: 10
All tests passed
Errors_In_SCD2_table(duplicates_by_pk=0, invalid_dates=0, broken_history=0, duplicates_by_version=0) 0 0


In [9]:
# Build a deliberately broken copy to exercise the validator.
# F.when(...) has no .otherwise(), so row_actual_to becomes '1000-01-01' where
# it was '9999-12-31' and null on every other row (see the nulls in
# .df_invalid_dates shown in the next cell).
df1_scd2_wrong_copy = SCD2Helper(
    df1_scd2.withColumn('row_actual_to', F.when(col('row_actual_to') == '9999-12-31', F.lit('1000-01-01'))), 
    pk=['pk1', 'pk2'], 
    non_pk=['nonpk1', 'nonpk2', 'nonpk3'],
    time_col='ts',
)
res = df1_scd2_wrong_copy.validate_scd2()
print(res, res[0], res[3])

There are 2 PK duplicates by ['pk1', 'pk2', 'row_actual_to'] Look at `.basic_pk_check.df_duplicates_pk`
10 rows with invalid dates, look at `.df_invalid_dates`
Number of records: 10
Errors_In_SCD2_table(duplicates_by_pk=2, invalid_dates=10, broken_history=0, duplicates_by_version=0) 2 0


In [10]:
# Show the rows flagged by the date check, together with the valid_date_from,
# valid_date_to and incorrect_direction flag columns.
df1_scd2_wrong_copy.df_invalid_dates.show()

+---+----+------+------+------+-----------+-------------------+--------------------+---------------+-------------+---------------+-------------+-------------------+
|pk1| pk2|nonpk1|nonpk2|nonpk3|nonpk_extra|                 ts|            row_hash|row_actual_from|row_actual_to|valid_date_from|valid_date_to|incorrect_direction|
+---+----+------+------+------+-----------+-------------------+--------------------+---------------+-------------+---------------+-------------+-------------------+
| v1|  c1|    a1|    b1|    c1|         r1|2023-05-01 10:00:00|93e6cc4b8b0445cf2...|     2023-05-01|         null|           true|        false|              false|
| v1|  c1|    a1|    b2|    c2|       null|2023-05-03 15:00:00|a6244d3c7c2aed33c...|     2023-05-03|         null|           true|        false|              false|
| v1|  c1|  null|    b2|    c2|         r3|2023-05-05 15:00:00|17f599be9e07976c2...|     2023-05-05|         null|           true|        false|              false|
| v1|  c1|

In [11]:
# Hot-reload hhop after editing its source.
# Modules are reloaded innermost first, so each parent package re-executes
# against already-refreshed children; one pass in this order replaces running
# the root-to-leaf sequence twice.
# Objects created before the reload keep referring to the old class definitions.
import hhop
reload(hhop.hhop.main)
reload(hhop.hhop)
reload(hhop)
from hhop import SCD2Helper

## Fill history

In [12]:
# Preview the SCD2 table before removing some of its versions in the next cell.
df1_scd2.show(10, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|93e6cc4b8b0445cf261e9417106ae6f0|2023-05-01     |2023-05-02   |
|v1 |c1  |a1    |b2    |c2    |null       |2023-05-03 15:00:00|a6244d3c7c2aed33c4d9525fbef29c1d|2023-05-03     |2023-05-04   |
|v1 |c1  |null  |b2    |c2    |r3         |2023-05-05 15:00:00|17f599be9e07976c2036361c9ad8f633|2023-05-05     |2023-05-06   |
|v1 |c1  |null  |null  |c2    |r3         |2023-05-07 15:00:00|a363a9dd6d5b30865ab5813581941516|2023-05-07     |2023-05-09   |
|v1 |c1  |null  |null  |null  |null       |2023-05-10 15:00:00|da58ea33b20d82042d9969c46c16c3b8|2023-05-10     

In [13]:
# Remove two versions of key (v1, c1), identified by their row_hash, to create
# additional gaps in the history. time_col is intentionally not passed here.
df1_scd2_add_more_holes = SCD2Helper(
    df1_scd2.filter(
        ~col('row_hash').isin(
            'a363a9dd6d5b30865ab5813581941516',
            'a6244d3c7c2aed33c4d9525fbef29c1d',
        )
    ),
    pk=['pk1', 'pk2'],
    non_pk=['nonpk1', 'nonpk2', 'nonpk3'],
)
df1_scd2_add_more_holes.show(10, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|93e6cc4b8b0445cf261e9417106ae6f0|2023-05-01     |2023-05-02   |
|v1 |c1  |null  |b2    |c2    |r3         |2023-05-05 15:00:00|17f599be9e07976c2036361c9ad8f633|2023-05-05     |2023-05-06   |
|v1 |c1  |null  |null  |null  |null       |2023-05-10 15:00:00|da58ea33b20d82042d9969c46c16c3b8|2023-05-10     |2023-05-12   |
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|56e6807f4b745e20dffeb1b731e5a6d4|2023-05-07     |2023-05-09   |
|v1 |null|null  |null  |null  |null       |2023-05-10 15:00:00|6654c734ccab8f440ff0825eb443dc7f|2023-05-10     

In [14]:
# Fill the gaps in each key's history, then show the result ordered by key and
# validity start. The output contains added rows such as the one starting at
# 1000-01-01.
df1_filled_history = df1_scd2_add_more_holes.fill_scd2_history()
df1_filled_history.orderBy(['pk1', 'pk2', 'row_actual_from']).show(100, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |null|null  |null  |null  |null       |null               |56e6807f4b745e20dffeb1b731e5a6d4|1000-01-01     |2023-05-06   |
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|56e6807f4b745e20dffeb1b731e5a6d4|2023-05-07     |2023-05-09   |
|v1 |null|null  |null  |null  |null       |2023-05-10 15:00:00|6654c734ccab8f440ff0825eb443dc7f|2023-05-10     |2023-05-10   |
|v1 |null|null  |fds   |null  |null       |2023-05-11 15:00:00|2d2722576095dd7996570b307d777539|2023-05-11     |2023-05-11   |
|v1 |null|null  |fds   |asdf  |null       |2023-05-12 15:00:00|b08363345cd7c1cb14e6f4747ce1563d|2023-05-12     

In [15]:
# Validate the filled history.
# NOTE(review): the report says the duplicated-versions check is skipped
# (no time_col), yet the returned record shows duplicates_by_version=1 --
# confirm how SCD2Helper sets that field when the check is skipped.
df1_filled_history.validate_scd2()

time_col is not provided, checking duplicated versions is skipped
Number of records: 12


Errors_In_SCD2_table(duplicates_by_pk=0, invalid_dates=0, broken_history=0, duplicates_by_version=1)

## merge scd2 history

In [16]:
# Re-wrap the same SCD2 table with only nonpk2 as non_pk (and no time_col);
# presumably versions that differ only in other columns then count as equal
# when merging -- verify against SCD2Helper.merge_scd2_history.
df1_scd2_fewer_non_pk = SCD2Helper(
    df1_scd2,
    pk=['pk1', 'pk2'], 
    non_pk=['nonpk2'],
)

In [17]:
# Merge the history using the reduced non_pk list and cache the result.
df1_merged_history = df1_scd2_fewer_non_pk.merge_scd2_history().cache()

In [18]:
# Original SCD2 table, shown for comparison with the merged result in the next cell.
df1_scd2.orderBy('pk1', 'pk2', 'row_actual_from').show(10, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|56e6807f4b745e20dffeb1b731e5a6d4|2023-05-07     |2023-05-09   |
|v1 |null|null  |null  |null  |null       |2023-05-10 15:00:00|6654c734ccab8f440ff0825eb443dc7f|2023-05-10     |2023-05-10   |
|v1 |null|null  |fds   |null  |null       |2023-05-11 15:00:00|2d2722576095dd7996570b307d777539|2023-05-11     |2023-05-11   |
|v1 |null|null  |fds   |asdf  |null       |2023-05-12 15:00:00|b08363345cd7c1cb14e6f4747ce1563d|2023-05-12     |9999-12-31   |
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|93e6cc4b8b0445cf261e9417106ae6f0|2023-05-01     

In [19]:
# Merged result: consecutive versions with the same nonpk2 are collapsed into
# one interval, e.g. 2023-05-07..2023-05-10 for key (v1, null).
df1_merged_history.orderBy('pk1', 'pk2', 'row_actual_from').show(10, False)

+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|pk1|pk2 |nonpk1|nonpk2|nonpk3|nonpk_extra|ts                 |row_hash                        |row_actual_from|row_actual_to|
+---+----+------+------+------+-----------+-------------------+--------------------------------+---------------+-------------+
|v1 |null|null  |null  |c2    |r3         |2023-05-07 15:00:00|6654c734ccab8f440ff0825eb443dc7f|2023-05-07     |2023-05-10   |
|v1 |null|null  |fds   |null  |null       |2023-05-11 15:00:00|2d2722576095dd7996570b307d777539|2023-05-11     |9999-12-31   |
|v1 |c1  |a1    |b1    |c1    |r1         |2023-05-01 10:00:00|096c3c37214aa93e8c988eddef82cf00|2023-05-01     |2023-05-02   |
|v1 |c1  |a1    |b2    |c2    |null       |2023-05-03 15:00:00|e3a0efcb4f913e410841e9a50cc55b0b|2023-05-03     |2023-05-06   |
|v1 |c1  |null  |null  |c2    |r3         |2023-05-07 15:00:00|da58ea33b20d82042d9969c46c16c3b8|2023-05-07     

## join scd2 tables

In [20]:
# Load the two ';'-separated sample files used for the SCD2 join demo
# (df_scd2_join_1.csv and df_scd2_join_2.csv).
df1, df2 = (
    spark.read.csv(f'../hhop/scd2_data/df_scd2_join_{i}.csv', sep=';', header=True)
    for i in (1, 2)
)

In [21]:
# First join input: email_id changes per key over time.
df1.show(10, False)

+---+---+--------+-------------------+
|pk1|pk2|email_id|ts                 |
+---+---+--------+-------------------+
|v1 |c1 |e1      |2023-05-01 10:00:00|
|v1 |c1 |e2      |2023-05-04 12:00:00|
|v1 |c1 |e3      |2023-05-10 12:00:00|
|v1 |c1 |e1      |2023-05-12 12:00:00|
|v1 |c2 |e1      |2023-05-01 10:00:00|
|v1 |c3 |e2      |2023-05-04 12:00:00|
|v1 |c3 |e3      |2023-05-10 12:00:00|
|v1 |c3 |e1      |2023-05-12 12:00:00|
+---+---+--------+-------------------+



In [22]:
# Second join input: phone_id changes per key over time.
df2.show(10, False)

+---+---+--------+-------------------+
|pk1|pk2|phone_id|ts                 |
+---+---+--------+-------------------+
|v1 |c1 |e1      |2023-04-01 10:00:00|
|v1 |c1 |e2      |2023-05-06 12:00:00|
|v1 |c1 |e3      |2023-05-12 12:00:00|
|v1 |c1 |e1      |2023-05-13 12:00:00|
|v1 |c2 |e1      |2023-04-01 10:00:00|
|v1 |c2 |e2      |2023-05-06 12:00:00|
|v1 |c2 |e3      |2023-05-12 12:00:00|
|v1 |c2 |e1      |2023-05-13 12:00:00|
+---+---+--------+-------------------+



In [23]:
# Step 1: turn each transaction log into a cached SCD2 table.
df1_scd2_j = SCD2Helper(df1, ['pk1', 'pk2'], ['email_id'], 'ts').df_to_scd2().cache()
df2_scd2_j = SCD2Helper(df2, ['pk1', 'pk2'], ['phone_id'], 'ts').df_to_scd2().cache()

# Step 2: drop the raw timestamp column and wrap the results in SCD2Helper
# again so the SCD2 methods are available on them.
# NOTE(review): 'ts' is still passed as time_col although the column has just
# been dropped -- confirm this is intended.
df1_scd2_j = SCD2Helper(df1_scd2_j.drop('ts'), ['pk1', 'pk2'], ['email_id'], 'ts')
df2_scd2_j = SCD2Helper(df2_scd2_j.drop('ts'), ['pk1', 'pk2'], ['phone_id'], 'ts')

In [24]:
# SCD2 version of the email table (row_hash is truncated by the default show()).
df1_scd2_j.show(10)

+---+---+--------+--------------------+---------------+-------------+
|pk1|pk2|email_id|            row_hash|row_actual_from|row_actual_to|
+---+---+--------+--------------------+---------------+-------------+
| v1| c1|      e1|e14f0e80db49cd150...|     2023-05-01|   2023-05-03|
| v1| c1|      e2|9862c1fb9265b0369...|     2023-05-04|   2023-05-09|
| v1| c1|      e3|543b4e1fe15d3cd37...|     2023-05-10|   2023-05-11|
| v1| c1|      e1|e14f0e80db49cd150...|     2023-05-12|   9999-12-31|
| v1| c3|      e2|5f5d71094a0572ea7...|     2023-05-04|   2023-05-09|
| v1| c3|      e3|9743390e49e720967...|     2023-05-10|   2023-05-11|
| v1| c3|      e1|796f048cc1ff82db2...|     2023-05-12|   9999-12-31|
| v1| c2|      e1|db078b8d7b629e8c3...|     2023-05-01|   9999-12-31|
+---+---+--------+--------------------+---------------+-------------+



In [25]:
# SCD2 version of the phone table, shown untruncated.
df2_scd2_j.show(100, False)

+---+---+--------+--------------------------------+---------------+-------------+
|pk1|pk2|phone_id|row_hash                        |row_actual_from|row_actual_to|
+---+---+--------+--------------------------------+---------------+-------------+
|v1 |c1 |e1      |e14f0e80db49cd1501de87adf05f6022|2023-04-01     |2023-05-05   |
|v1 |c1 |e2      |9862c1fb9265b03695dc9a727406c43e|2023-05-06     |2023-05-11   |
|v1 |c1 |e3      |543b4e1fe15d3cd37fc7b9454156f4e1|2023-05-12     |2023-05-12   |
|v1 |c1 |e1      |e14f0e80db49cd1501de87adf05f6022|2023-05-13     |9999-12-31   |
|v1 |c2 |e1      |db078b8d7b629e8c3e11aeaf24952480|2023-04-01     |2023-05-05   |
|v1 |c2 |e2      |284ed4afc0045d818e840896714656ca|2023-05-06     |2023-05-11   |
|v1 |c2 |e3      |87795052bb06129a6007a0dfaad2efef|2023-05-12     |2023-05-12   |
|v1 |c2 |e1      |db078b8d7b629e8c3e11aeaf24952480|2023-05-13     |9999-12-31   |
+---+---+--------+--------------------------------+---------------+-------------+



In [26]:
# Join the two SCD2 tables on their keys and validity intervals. In the output,
# key (v1, c3) exists only in the email table and gets a null phone_id, and no
# interval starts before the first email version.
df1_scd2_j.join_scd2(df2_scd2_j).orderBy('pk1', 'pk2', 'row_actual_from').show(100, False)

+---+---+--------+--------+---------------+-------------+
|pk1|pk2|email_id|phone_id|row_actual_from|row_actual_to|
+---+---+--------+--------+---------------+-------------+
|v1 |c1 |e1      |e1      |2023-05-01     |2023-05-03   |
|v1 |c1 |e2      |e1      |2023-05-04     |2023-05-05   |
|v1 |c1 |e2      |e2      |2023-05-06     |2023-05-09   |
|v1 |c1 |e3      |e2      |2023-05-10     |2023-05-11   |
|v1 |c1 |e1      |e3      |2023-05-12     |2023-05-12   |
|v1 |c1 |e1      |e1      |2023-05-13     |9999-12-31   |
|v1 |c2 |e1      |e1      |2023-05-01     |2023-05-05   |
|v1 |c2 |e1      |e2      |2023-05-06     |2023-05-11   |
|v1 |c2 |e1      |e3      |2023-05-12     |2023-05-12   |
|v1 |c2 |e1      |e1      |2023-05-13     |9999-12-31   |
|v1 |c3 |e2      |null    |2023-05-04     |2023-05-09   |
|v1 |c3 |e3      |null    |2023-05-10     |2023-05-11   |
|v1 |c3 |e1      |null    |2023-05-12     |9999-12-31   |
+---+---+--------+--------+---------------+-------------+



Join with filled history. If the DataFrames are large, I recommend caching them or writing them to HDFS before joining.

In [27]:
# Same join, but with the history of both sides filled first. The result now
# also covers the periods before the first email version (rows starting at
# 1000-01-01 and 2023-04-01).
df1_scd2_j.fill_scd2_history().join_scd2(df2_scd2_j.fill_scd2_history()).orderBy('pk1', 'pk2', 'row_actual_from').show(100, False)

+---+---+--------+--------+---------------+-------------+
|pk1|pk2|email_id|phone_id|row_actual_from|row_actual_to|
+---+---+--------+--------+---------------+-------------+
|v1 |c1 |null    |null    |1000-01-01     |2023-03-31   |
|v1 |c1 |null    |e1      |2023-04-01     |2023-04-30   |
|v1 |c1 |e1      |e1      |2023-05-01     |2023-05-03   |
|v1 |c1 |e2      |e1      |2023-05-04     |2023-05-05   |
|v1 |c1 |e2      |e2      |2023-05-06     |2023-05-09   |
|v1 |c1 |e3      |e2      |2023-05-10     |2023-05-11   |
|v1 |c1 |e1      |e3      |2023-05-12     |2023-05-12   |
|v1 |c1 |e1      |e1      |2023-05-13     |9999-12-31   |
|v1 |c2 |null    |null    |1000-01-01     |2023-03-31   |
|v1 |c2 |null    |e1      |2023-04-01     |2023-04-30   |
|v1 |c2 |e1      |e1      |2023-05-01     |2023-05-05   |
|v1 |c2 |e1      |e2      |2023-05-06     |2023-05-11   |
|v1 |c2 |e1      |e3      |2023-05-12     |2023-05-12   |
|v1 |c2 |e1      |e1      |2023-05-13     |9999-12-31   |
|v1 |c3 |null 

## merge SCD2 update

In [28]:
# Stop the Spark session; earlier cells need a new session before they can be rerun.
spark.stop()